Esempio n. 1
0
    def __init__(self, args, display_title=None):
        """Load matched barcodes and the target gene list, then record metrics.

        Args:
            args: Parsed command-line arguments. ``match_dir`` is always read;
                the gene list comes from ``panel`` when it is set, otherwise
                from ``gene_list``.
            display_title: Forwarded unchanged to ``Step.__init__``.

        Exits through ``sys.exit`` when neither ``panel`` nor ``gene_list`` is
        given, or when the chosen source yields no gene.
        """
        Step.__init__(self, args, display_title=display_title)

        # set
        self.match_barcode_list, self.n_cell = utils.read_barcode_file(args.match_dir)
        self.match_barcode = set(self.match_barcode_list)

        no_gene_source_msg = "You must provide either --panel or --gene_list!"
        if args.panel:
            self.gene_list = utils.get_gene_region_from_bed(args.panel)[0]
            self.n_gene = len(self.gene_list)
        elif args.gene_list:
            self.gene_list, self.n_gene = utils.read_one_col(args.gene_list)
        else:
            # Neither option was supplied: stop with the intended message
            # instead of handing an empty path to utils.read_one_col.
            sys.exit(no_gene_source_msg)

        # A source was supplied but produced nothing usable.
        if not self.gene_list:
            sys.exit(no_gene_source_msg)

        self.count_dict = utils.genDict(dim=3, valType=int)

        self.add_metric(
            name="Number of Target Genes",
            value=self.n_gene,
        )
        self.add_metric(
            name="Number of Cells",
            value=self.n_cell,
        )

        # out file
        self.out_bam_file = f'{self.out_prefix}_filtered.bam'
        self.out_bam_file_sorted = f'{self.out_prefix}_filtered_sorted.bam'
Esempio n. 2
0
    def __init__(self, args, display_title=None):
        """Prepare chain columns, optional matched barcodes and output paths.

        Args:
            args: Parsed command-line arguments. Reads ``type``, ``match_dir``,
                ``matrix_dir`` and ``iUMI``.
            display_title: Forwarded unchanged to ``Step.__init__``.

        ``self.match_bool`` is True only when a usable ``match_dir`` or
        ``matrix_dir`` was given; ``self.match_cell_barcodes`` is set only in
        that case.
        """
        Step.__init__(self, args, display_title=display_title)

        # set
        self.chains = CHAINS[args.type]
        # One "<sequence header>_<chain>" column per pair, grouped by chain.
        self.cols = [
            "_".join([header, chain])
            for chain in self.chains
            for header in SEQUENCES_HEADER
        ]

        # The literal string 'None' is treated the same as "not provided".
        from_match_dir = bool(args.match_dir) and args.match_dir.strip() != 'None'
        from_matrix_dir = (
            not from_match_dir
            and bool(args.matrix_dir)
            and args.matrix_dir.strip() != 'None'
        )
        self.match_bool = from_match_dir or from_matrix_dir

        if from_match_dir:
            barcode_list, _match_cell_number = utils.read_barcode_file(
                args.match_dir)
            self.match_cell_barcodes = set(barcode_list)
        elif from_matrix_dir:
            barcode_list = utils.get_barcodes_from_matrix_dir(args.matrix_dir)
            self.match_cell_barcodes = set(barcode_list)

        # out files
        prefix = self.out_prefix
        self.cell_confident_file = f"{prefix}_cell_confident.tsv"
        self.cell_confident_count_file = f"{prefix}_cell_confident_count.tsv"
        self.clonetypes_file = f"{prefix}_clonetypes.tsv"
        self.match_clonetypes_file = f"{prefix}_match_clonetypes.tsv"

        # add args data
        self.add_data(iUMI=args.iUMI)
Esempio n. 3
0
    def __init__(self, args, display_title=None):
        """Read barcode/linker fasta files and validate the barcode length.

        Args:
            args: Parsed command-line arguments. Reads ``fq``, ``fq_pattern``,
                ``linker_fasta`` and ``barcode_fasta``.
            display_title: Forwarded unchanged to ``Step.__init__``.

        Raises:
            Exception: If the barcode length from the fasta differs from the
                length of the first "C" segment of the parsed pattern.
        """
        Step.__init__(self, args, display_title=display_title)

        # read args
        self.fq, self.fq_pattern = args.fq, args.fq_pattern
        self.linker_fasta, self.barcode_fasta = args.linker_fasta, args.barcode_fasta

        # process
        self.barcode_dict, self.barcode_length = utils.read_fasta(self.barcode_fasta, equal=True)
        linker_given = self.linker_fasta and self.linker_fasta != 'None'
        if not linker_given:
            # No linker supplied: empty mapping and zero length.
            self.linker_dict, self.linker_length = {}, 0
        else:
            self.linker_dict, self.linker_length = utils.read_fasta(self.linker_fasta, equal=True)
        self.pattern_dict = parse_pattern(self.fq_pattern)

        # check barcode length: end - start of the first "C" segment
        first_segment = self.pattern_dict["C"][0]
        segment_end, segment_start = first_segment[1], first_segment[0]
        pattern_barcode_length = segment_end - segment_start
        if self.barcode_length != pattern_barcode_length:
            raise Exception(
                f'barcode fasta length {self.barcode_length} \n'
                f'                != pattern barcode length {pattern_barcode_length}'
            )

        self.res_dic = utils.genDict()
        self.res_sum_dic = utils.genDict(dim=2)
        self.match_barcode = []

        # out files
        sample_prefix = f'{self.outdir}/{self.sample}'
        self.read_count_file = f'{sample_prefix}_read_count.tsv'
        self.UMI_count_file = f'{sample_prefix}_UMI_count.tsv'
        self.stat_file = f'{self.outdir}/stat.txt'
Esempio n. 4
0
    def __init__(self, args):
        """Store the inputs and derive the substitution statistics path.

        Args:
            args: Parsed command-line arguments. Reads ``sample``, ``bam``
                and ``outdir``.
        """
        Step.__init__(self, args)

        # input files
        self.sample, self.bam_file, self.outdir = args.sample, args.bam, args.outdir

        # output files
        stat_name = self.sample + '.substitution.txt'
        self.outstat = os.path.join(self.outdir, stat_name)
Esempio n. 5
0
    def __init__(self, args, display_title=None):
        """Resolve the GTF path and derive the featureCounts output paths.

        Args:
            args: Parsed command-line arguments. Reads ``featureCounts_param``;
                ``genomeDir`` and ``input`` are read through ``self.args``.
            display_title: Forwarded unchanged to ``Step.__init__``.
        """
        Step.__init__(self, args, display_title=display_title)

        # set
        genome_info = Mkref_rna.parse_genomeDir(self.args.genomeDir)
        self.gtf = genome_info['gtf']
        self.featureCounts_param = args.featureCounts_param

        # out files
        # The featureCounts bam is named after the input file's basename.
        bam_name = os.path.basename(self.args.input)
        prefix = self.out_prefix
        self.featureCounts_bam = f'{self.outdir}/{bam_name}.featureCounts.bam'
        self.name_sorted_bam = f'{prefix}_name_sorted.bam'
        self.featureCount_log_file = f'{prefix}.summary'
Esempio n. 6
0
    def __init__(self, args, display_title=None):
        """Prepare per-tag splitting of matrix, fastq and/or vdj outputs.

        Nothing beyond ``Step.__init__`` happens unless at least one of
        ``args.split_matrix``, ``args.split_fastq`` or ``args.split_vdj`` is set.

        Args:
            args: Parsed command-line arguments. Reads ``umi_tag_file``,
                ``outdir``, ``match_dir``, ``matrix_dir`` and ``vdj_dir``
                depending on which split options are enabled.
            display_title: Forwarded unchanged to ``Step.__init__``.

        Raises:
            ValueError: If ``split_matrix`` is set but neither ``match_dir``
                nor ``matrix_dir`` is given.
            IndexError: If one of the glob patterns matches no file.
            OSError: If an output directory cannot be created or a per-tag
                fastq file cannot be opened.
        """
        Step.__init__(self, args, display_title=display_title)
        if not (args.split_matrix or args.split_fastq or args.split_vdj):
            return

        # set
        df_umi_tag = pd.read_csv(args.umi_tag_file, sep='\t', index_col=0)
        df_umi_tag = df_umi_tag.rename_axis('barcode').reset_index()
        # tag -> set of barcodes carrying that tag
        self.tag_barcode_dict = {
            tag: set(row["barcode"].tolist())
            for tag, row in df_umi_tag.groupby("tag")
        }

        if args.split_matrix:
            self.matrix_outdir = f'{args.outdir}/matrix/'
            if args.match_dir:
                matrix_10X_dir = glob.glob(
                    f'{args.match_dir}/05.count/*_matrix_10X*')[0]
            elif args.matrix_dir:
                matrix_10X_dir = args.matrix_dir
            else:
                raise ValueError("--match_dir or --matrix_dir is required.")
            self.raw_mat, self.raw_features_path, self.raw_barcodes = read_raw_matrix(
                matrix_10X_dir)

        if args.split_fastq:
            self.rna_fq_file = glob.glob(
                f'{args.match_dir}/*barcode/*_2.fq*')[0]

            fastq_outdir = f'{args.outdir}/fastqs/'
            # Create the directory without a shell so paths containing spaces
            # or shell metacharacters work and failures are reported.
            os.makedirs(fastq_outdir, exist_ok=True)

            # One open write handle per tag and read; they stay open after
            # __init__ returns, so closing them is left to later code.
            self.r2_fastq_files_handle = {}
            self.r1_fastq_files_handle = {}
            for tag in self.tag_barcode_dict:
                r2_fastq_file_name = f'{fastq_outdir}/{tag}_2.fq'
                self.r2_fastq_files_handle[tag] = open(r2_fastq_file_name, 'w')
                r1_fastq_file_name = f'{fastq_outdir}/{tag}_1.fq'
                self.r1_fastq_files_handle[tag] = open(r1_fastq_file_name, 'w')

            self.tag_read_index_dict = defaultdict(set)

        if args.split_vdj:
            self.cell_confident_vdj = glob.glob(
                f'{args.vdj_dir}/*count_vdj/*cell_confident.tsv*')[0]

            self.vdj_outdir = f'{args.outdir}/vdj/'
            # exist_ok makes a separate existence check unnecessary.
            os.makedirs(self.vdj_outdir, exist_ok=True)
Esempio n. 7
0
    def __init__(self, args):
        """Store the inputs and derive the sample-prefixed output paths.

        Args:
            args: Parsed command-line arguments. Reads ``outdir``, ``sample``,
                ``bam``, ``bg``, ``bg_cov`` and ``cell_keep``.
        """
        Step.__init__(self, args)

        # input files
        self.outdir, self.sample = args.outdir, args.sample
        self.bam_file = args.bam
        self.snp_file, self.bg_cov = args.bg, args.bg_cov
        self.cell_keep = args.cell_keep

        # output files
        read_name = self.sample + '.corrected_gene_cell_UMI_read.txt'
        rds_name = self.sample + '.TC_matrix.rds'
        self.outread = os.path.join(self.outdir, read_name)
        self.outrds = os.path.join(self.outdir, rds_name)
        # Bare "<outdir>/<sample>" prefix with no extension.
        self.outpre = os.path.join(self.outdir, self.sample)
Esempio n. 8
0
    def __init__(self, args):
        """Store the inputs and derive the two report paths.

        Args:
            args: Parsed command-line arguments. Reads ``sample``, ``tsne``,
                ``mat``, ``rep``, ``mincell`` and ``topgene``.

        ``self.outdir`` is not assigned here; it must already exist after
        ``Step.__init__`` returns.
        """
        Step.__init__(self, args)

        # input files
        self.sample = args.sample
        self.tsnefile, self.matfile, self.repfile = args.tsne, args.mat, args.rep
        self.mincell, self.topgene = args.mincell, args.topgene

        # output files
        dot_name = self.sample + '.rep_in_tsne.txt'
        table_name = self.sample + '.rep_in_tsne_top10.txt'
        self.outdot = os.path.join(self.outdir, dot_name)
        self.outtbl = os.path.join(self.outdir, table_name)
Esempio n. 9
0
    def test_stat_to_metric(self):
        """Run ``Step.stat_to_metric`` for a fixed sample and print its metrics.

        NOTE(review): this changes the working directory to a hard-coded
        absolute path, so it only runs where that directory exists.
        """
        os.chdir('/SGRNJ01/RD_dir/pipeline_test/zhouyiqi/multi_tests/rna')
        args_dict = {
            'sample': 'test1',
            'assay': 'rna',
            'thread': 1,
            'outdir': 'test1/06.analysis',
            'debug': True,
        }
        # Lightweight stand-in for parsed arguments with attribute access.
        Args = namedtuple('Args', list(args_dict.keys()))
        args = Args(**args_dict)

        obj = Step(args, 'analysis')
        obj.stat_to_metric()
        # Inside a class body, ``obj.__content_dict`` is name-mangled with the
        # enclosing class's name, not Step's, so use the Step-mangled name.
        # NOTE(review): assumes Step stores it as ``self.__content_dict`` — confirm.
        print(obj._Step__content_dict['metric'])
Esempio n. 10
0
    def __init__(self, args):
        """Store the inputs and derive the PosTag output paths.

        Args:
            args: Parsed command-line arguments. Reads ``outdir``, ``sample``,
                ``strand``, ``bam``, ``cell`` and ``thread``.
        """
        Step.__init__(self, args)

        # input files
        sample_bam_name = args.sample + '.bam'
        self.ifile = os.path.join(args.outdir, sample_bam_name)
        self.sample = args.sample
        self.strandednessfile = args.strand
        self.inbam, self.bcfile = args.bam, args.cell
        self.outdir, self.thread = args.outdir, args.thread

        # output files
        tagged_bam_name = args.sample + '.PosTag.bam'
        tagged_csv_name = args.sample + '.PosTag.csv'
        self.outfile_bam = os.path.join(args.outdir, tagged_bam_name)
        self.outfile_csv = os.path.join(args.outdir, tagged_csv_name)
Esempio n. 11
0
    def __init__(self, args, display_title=None):
        """Choose the read type label, the chains and the output paths.

        Args:
            args: Parsed command-line arguments. Reads ``not_consensus`` and
                ``type``.
            display_title: Forwarded unchanged to ``Step.__init__``.
        """
        Step.__init__(self, args, display_title=display_title)

        # set
        # Label is 'Reads' when not_consensus is set, otherwise "UMIs".
        self.read_type = 'Reads' if args.not_consensus else "UMIs"
        self.chains = CHAINS[args.type]

        # out files
        prefix = self.out_prefix
        self.UMI_count_unfiltered_file = f'{prefix}_UMI_count_unfiltered.tsv'
        self.UMI_count_filtered_file = f'{prefix}_UMI_count_filtered.tsv'
        self.mixcr_report = f"{prefix}_align.txt"
        self.not_align_fq = f"{prefix}_not_align.fq"
        self.read2_vdjca = f"{prefix}_read2.vdjca"
        self.alignments = f"{prefix}_alignments.txt"
Esempio n. 12
0
    def __init__(self, args, display_title=None):
        """Parse fastq lists, pick chemistries, reset counters and set outputs.

        Args:
            args: Parsed command-line arguments. Reads ``fq1``, ``fq2``,
                ``chemistry``, ``gzip`` and the filtering/output options
                copied onto the instance below.
            display_title: Forwarded unchanged to ``Step.__init__``.

        Raises:
            Exception: If ``fq1`` and ``fq2`` list different numbers of files.
        """
        Step.__init__(self, args, display_title=display_title)

        # Both fastq arguments are comma-separated file lists.
        self.fq1_list, self.fq2_list = args.fq1.split(","), args.fq2.split(",")
        self.fq_number = len(self.fq1_list)
        if len(self.fq2_list) != self.fq_number:
            raise Exception('fastq1 and fastq2 do not have same file number!')

        if args.chemistry != 'auto':
            # An explicit chemistry applies to every fastq pair.
            self.chemistry_list = [args.chemistry] * self.fq_number
        else:
            self.chemistry_list = Chemistry(args.fq1).check_chemistry()

        # Running tallies, all starting at zero.
        self.barcode_corrected_num = self.linker_corrected_num = 0
        self.total_num = self.clean_num = 0
        self.no_polyT_num = self.lowQual_num = 0
        self.no_linker_num = self.no_barcode_num = 0
        self.barcode_qual_Counter = Counter()
        self.umi_qual_Counter = Counter()

        # Options copied verbatim from args under the same attribute name.
        # nopolyT: true == output nopolyT reads
        for option in (
            'pattern',
            'linker',
            'whitelist',
            'lowNum',
            'lowQual',
            'allowNoPolyT',
            'allowNoLinker',
            'nopolyT',
            'noLinker',
            'output_R1',
        ):
            setattr(self, option, getattr(args, option))

        # out file
        suffix = ".gz" if args.gzip else ""
        prefix = self.out_prefix
        self.out_fq2 = f'{prefix}_2.fq{suffix}'
        self.out_fq1 = f'{prefix}_1.fq{suffix}'
        if self.nopolyT:
            self.nopolyT_1 = f'{prefix}_noPolyT_1.fq'
            self.nopolyT_2 = f'{prefix}_noPolyT_2.fq'
        if self.noLinker:
            self.noLinker_1 = f'{prefix}_noLinker_1.fq'
            self.noLinker_2 = f'{prefix}_noLinker_2.fq'
Esempio n. 13
0
    def __init__(self, args, step_name):
        """Store the inputs and select chain settings for the sequence type.

        Args:
            args: Parsed command-line arguments. Reads ``outdir``, ``sample``,
                ``Seqtype``, ``all_rep`` and ``fa``.
            step_name: Forwarded positionally to ``Step.__init__``.

        ``self.string``, ``self.chain`` and ``self.paired_groups`` are set
        only when ``Seqtype`` is 'TCR' or 'BCR'.
        """
        Step.__init__(self, args, step_name)

        self.outdir, self.sample = args.outdir, args.sample
        self.Seqtype = args.Seqtype
        self.all_rep, self.fa = args.all_rep, args.fa

        # Built per call so every instance gets its own list objects.
        settings_by_type = {
            'TCR': ('t', ['TRA', 'TRB'], ['TRA_TRB']),
            'BCR': ('b', ['IGH', 'IGL', 'IGK'], ['IGH_IGL', 'IGH_IGK']),
        }
        settings = settings_by_type.get(self.Seqtype)
        if settings is not None:
            self.string, self.chain, self.paired_groups = settings
Esempio n. 14
0
    def __init__(self, args):
        """Load barcodes, resolve reference files and set bam/vcf output paths.

        Args:
            args: Parsed command-line arguments. Reads ``match_dir``,
                ``genomeDir`` and ``panel``.
        """
        Step.__init__(self, args)

        # set
        barcode_list, _num = utils.read_barcode_file(args.match_dir)
        self.barcodes = barcode_list
        genome_info = Mkref_rna.parse_genomeDir(args.genomeDir)
        self.fasta = genome_info['fasta']
        self.df_vcf = None
        self.panel = args.panel
        self.bed = utils.get_bed_file_path(self.panel)

        # out
        prefix = self.out_prefix
        self.splitN_bam = f'{prefix}_splitN.bam'
        self.splitN_bam_name_sorted = f'{prefix}_splitN_name_sorted.bam'

        self.raw_bcf_file = f'{prefix}_raw.bcf'
        self.raw_vcf_file = f'{prefix}_raw.vcf'
        self.fixed_header_vcf = f'{prefix}_fixed.vcf'
        self.norm_vcf_file = f'{prefix}_norm.vcf'
Esempio n. 15
0
    def __init__(self, args, display_title=None):
        """Store cell-calling options, load the GTF and set output paths.

        Args:
            args: Parsed command-line arguments. Reads ``force_cell_num``,
                ``cell_calling_method``, ``expected_cell_num`` (converted with
                ``int``), ``bam`` and ``genomeDir``.
            display_title: Forwarded unchanged to ``Step.__init__``.
        """
        Step.__init__(self, args, display_title=display_title)
        self.force_cell_num, self.cell_calling_method = (
            args.force_cell_num,
            args.cell_calling_method,
        )
        self.expected_cell_num = int(args.expected_cell_num)
        self.bam = args.bam

        # set
        genome_info = Mkref_rna.parse_genomeDir(args.genomeDir)
        self.gtf_file = genome_info['gtf']
        self.gtf_dict = utils.Gtf_dict(self.gtf_file)
        self.downsample_dict = {}

        # output files
        sample_prefix = f'{self.outdir}/{self.sample}'
        self.count_detail_file = f'{sample_prefix}_count_detail.txt'
        self.marked_count_file = f'{sample_prefix}_counts.txt'
        self.raw_matrix_10X_dir = f'{sample_prefix}_all_matrix'
        self.cell_matrix_10X_dir = f'{sample_prefix}_matrix_10X'
        self.downsample_file = f'{sample_prefix}_downsample.txt'
Esempio n. 16
0
    def __init__(self, args, display_title=None):
        """Load the read-count table and matched barcodes; set output paths.

        Args:
            args: Parsed command-line arguments. Reads ``read_count_file``,
                ``UMI_min``, ``SNR_min``, ``combine_cluster``, ``dim``,
                ``coefficient``, and ``match_dir`` or ``matrix_dir`` (plus
                ``tsne_file`` when ``matrix_dir`` is used).
            display_title: Forwarded unchanged to ``Step.__init__``.

        Raises:
            ValueError: If neither ``match_dir`` nor ``matrix_dir`` is given.
        """
        Step.__init__(self, args, display_title=display_title)
        self.read_count_file = args.read_count_file
        self.UMI_min, self.SNR_min = args.UMI_min, args.SNR_min
        self.combine_cluster = args.combine_cluster
        self.dim = int(args.dim)
        self.coefficient = float(args.coefficient)

        # read
        self.df_read_count = pd.read_csv(
            self.read_count_file, sep="\t", index_col=0)

        # match_dir takes precedence over matrix_dir.
        if not (args.match_dir or args.matrix_dir):
            raise ValueError("--match_dir or --matrix_dir is required.")
        if args.match_dir:
            parsed = utils.parse_match_dir(args.match_dir)
            self.match_barcode = parsed['match_barcode']
            self.n_match_barcode = parsed['n_match_barcode']
            self.tsne_file = parsed['tsne_coord']
            self.matrix_dir = parsed['matrix_dir']
        else:
            barcode_table = pd.read_csv(
                f'{args.matrix_dir}/barcodes.tsv', header=None)
            self.match_barcode = barcode_table[0].tolist()
            self.n_match_barcode = len(self.match_barcode)
            self.tsne_file = args.tsne_file
            self.matrix_dir = args.matrix_dir

        # init
        self.no_noise = False

        # out files
        sample_prefix = f'{self.outdir}/{self.sample}'
        self.UMI_tag_file = f'{sample_prefix}_umi_tag.tsv'
        self.tsne_tag_file = f'{sample_prefix}_tsne_tag.tsv'
        self.cluster_count_file = f'{sample_prefix}_cluster_count.tsv'
        self.cluster_plot = f'{sample_prefix}_cluster_plot.pdf'
        if self.combine_cluster:
            self.combine_cluster_count_file = f'{sample_prefix}_combine_cluster_count.tsv'
            self.combine_cluster_plot = f'{sample_prefix}_combine_cluster_plot.pdf'
Esempio n. 17
0
 def __init__(self, args):
     """Record the assay description, package version and chemistry.

     Args:
         args: Parsed command-line arguments. Reads ``chemistry``.
     """
     Step.__init__(self, args)
     # self.assay is expected to exist once Step.__init__ has returned.
     assay_text = utils.get_assay_text(self.assay)
     self.assay_description = assay_text
     self.version, self.chemistry = __VERSION__, args.chemistry