Beispiel #1
0
 def __init__(
     self,
     sample,
     outdir,
     assay,
     read_count_file,
     match_dir,
     UMI_min,
     SNR_min,
     combine_cluster,
     dim,
     ):
     """Store the run parameters and load the inputs read from disk.

     Args:
         sample: stored on the instance unchanged.
         outdir: output directory; created (with parents) when missing.
         assay: stored on the instance unchanged.
         read_count_file: tab-separated file loaded into ``df_read_count``
             with its first column as index.
         match_dir: directory passed to ``read_barcode_file`` and searched
             for ``*analysis/*tsne_coord.tsv``.
         UMI_min: stored on the instance unchanged.
         SNR_min: stored on the instance unchanged.
         combine_cluster: stored on the instance unchanged.
         dim: converted with ``int()`` before being stored.

     Raises:
         IndexError: if no tsne coordinate file matches under ``match_dir``.
     """
     self.sample = sample
     self.outdir = outdir
     self.assay = assay
     self.read_count_file = read_count_file
     self.match_dir = match_dir
     self.UMI_min = UMI_min
     self.SNR_min = SNR_min
     self.combine_cluster = combine_cluster
     self.dim = int(dim)
     self.match_barcode, self.cell_total = read_barcode_file(match_dir)
     self.df_read_count = pd.read_csv(read_count_file, sep="\t", index_col=0)
     # first match wins; an empty glob result raises IndexError
     self.tsne_file = glob.glob(f'{match_dir}/*analysis/*tsne_coord.tsv')[0]
     self.no_noise = False
     # os.makedirs instead of a shell `mkdir -p`: no shell is spawned, so
     # paths containing spaces or shell metacharacters are handled correctly
     os.makedirs(outdir, exist_ok=True)
    def __init__(self, args, display_title=None):
        """Initialise the step: load matched barcodes and the target gene list.

        Args:
            args: parsed arguments; ``match_dir`` plus one of ``panel`` or
                ``gene_list`` are read here.
            display_title: forwarded to ``Step.__init__``.

        Exits via ``sys.exit`` when neither ``--panel`` nor ``--gene_list``
        is given, or when the resulting gene list is empty.
        """
        Step.__init__(self, args, display_title=display_title)

        # set
        self.match_barcode_list, self.n_cell = utils.read_barcode_file(args.match_dir)
        self.match_barcode = set(self.match_barcode_list)

        if args.panel:
            self.gene_list = utils.get_gene_region_from_bed(args.panel)[0]
            self.n_gene = len(self.gene_list)
        elif args.gene_list:
            self.gene_list, self.n_gene = utils.read_one_col(args.gene_list)
        else:
            # Fail with the intended message before handing a missing
            # gene-list path to read_one_col.
            sys.exit("You must provide either --panel or --gene_list!")

        # a source was given but yielded no genes
        if not self.gene_list:
            sys.exit("You must provide either --panel or --gene_list!")

        self.count_dict = utils.genDict(dim=3, valType=int)

        self.add_metric(
            name="Number of Target Genes",
            value=self.n_gene,
        )
        self.add_metric(
            name="Number of Cells",
            value=self.n_cell,
        )

        # out file
        self.out_bam_file = f'{self.out_prefix}_filtered.bam'
        self.out_bam_file_sorted = f'{self.out_prefix}_filtered_sorted.bam'
Beispiel #3
0
    def __init__(self, args, display_title=None):
        """Set up chains, expected column names, matched barcodes and outputs.

        Matched barcodes come from ``args.match_dir`` when it is set (and not
        the string 'None'), otherwise from ``args.matrix_dir`` under the same
        condition. ``self.match_bool`` records whether either source was used.
        """
        Step.__init__(self, args, display_title=display_title)

        # one "<sequence header>_<chain>" column per chain/sequence pair
        self.chains = CHAINS[args.type]
        self.cols = [
            "_".join([header, chain_name])
            for chain_name in self.chains
            for header in SEQUENCES_HEADER
        ]

        # matched barcodes: match_dir takes precedence over matrix_dir
        self.match_bool = False
        if args.match_dir and args.match_dir.strip() != 'None':
            barcode_source, _unused_count = utils.read_barcode_file(
                args.match_dir)
            self.match_bool = True
        elif args.matrix_dir and args.matrix_dir.strip() != 'None':
            barcode_source = utils.get_barcodes_from_matrix_dir(
                args.matrix_dir)
            self.match_bool = True
        if self.match_bool:
            self.match_cell_barcodes = set(barcode_source)

        # output tables
        prefix = self.out_prefix
        self.cell_confident_file = f"{prefix}_cell_confident.tsv"
        self.cell_confident_count_file = f"{prefix}_cell_confident_count.tsv"
        self.clonetypes_file = f"{prefix}_clonetypes.tsv"
        self.match_clonetypes_file = f"{prefix}_match_clonetypes.tsv"

        # expose the iUMI argument to the report data
        self.add_data(iUMI=args.iUMI)
Beispiel #4
0
def snpCalling(args):
    """Run SNP calling: split the BAM by barcode, call SNPs, write a summary.

    Args:
        args: parsed arguments providing ``sample``, ``outdir``, ``thread``,
            ``match_dir``, ``bam``, ``genomeDir``, ``gene_list`` and
            ``min_query_length``.
    """
    sample = args.sample
    outdir = args.outdir
    thread = int(args.thread)
    match_dir = args.match_dir
    bam = args.bam
    genomeDir = args.genomeDir
    gene_list_file = args.gene_list
    min_query_length = args.min_query_length

    # process args
    barcodes, _nCell = read_barcode_file(match_dir)

    # check dir: os.makedirs avoids building a shell command from the path
    os.makedirs(outdir, exist_ok=True)

    # get genome file
    _refFlat, gtf, fasta = glob_genomeDir(genomeDir, fa=True)

    # convert gene
    gene_id_name_dic = convert(gene_list_file, gtf)

    # split bam
    index_file, count_file = split_bam(bam, barcodes, outdir, sample,
                                       gene_id_name_dic, min_query_length)

    # snp
    call_all_snp(index_file, outdir, thread, fasta)

    # summary
    summary(index_file, count_file, outdir, sample)
Beispiel #5
0
def mapping_hla(args):
    """Run the HLA step: razer mapping, BAM split, typing, then a summary.

    Args:
        args: parsed arguments providing ``sample``, ``outdir``, ``fq``,
            ``thread`` and ``match_dir``.
    """
    sample = args.sample
    outdir = args.outdir
    fq = args.fq
    thread = int(args.thread)
    match_dir = args.match_dir

    # process args
    barcodes, _nCell = read_barcode_file(match_dir)

    # check dir: os.makedirs avoids building a shell command from the path
    os.makedirs(outdir, exist_ok=True)

    # razer
    out_bam = razer(fq, outdir, sample, thread)

    # split bam
    index_file, _count_file = split_bam(out_bam, barcodes, outdir, sample)

    # typing
    hla_typing(index_file, outdir, thread)

    # summary
    summary(index_file, outdir, sample)
Beispiel #6
0
    def __init__(self, sample, outdir, genomeDir, root_dir):
        """Locate the inputs under ``{root_dir}/{sample}`` and set output paths.

        Args:
            sample: sample name; also the sub-directory of ``root_dir`` used
                as match directory.
            outdir: output directory; created (with parents) when missing.
            genomeDir: passed to ``Mkref.parse_genomeDir`` to obtain the
                ``mt_gene_list`` entry.
            root_dir: parent directory holding the per-sample directory.

        Raises:
            IndexError: if no ``*count/{sample}_matrix_10X`` directory is
                found. A missing STAR bam only prints a notice and leaves
                ``self.bam`` as None.
        """
        self.sample = sample
        self.outdir = outdir

        # set
        match_dir = f'{root_dir}/{sample}'
        self.mt_gene_list_file = Mkref.parse_genomeDir(
            genomeDir)['mt_gene_list']
        _barcodes, self.ncell = utils.read_barcode_file(match_dir)
        self.bam = None
        try:
            self.bam = glob.glob(
                f'{match_dir}/03*/{sample}*sortedByCoord.out.bam')[0]
        except IndexError:
            print("STAR bam does not exist! Skip coverage summary.")

        self.matrix_dir = glob.glob(
            f'{match_dir}/*count/{sample}_matrix_10X')[0]

        # out: os.makedirs avoids building a shell command from the path
        os.makedirs(outdir, exist_ok=True)
        out_prefix = f'{outdir}/{sample}'
        self.mt_bam = f'{out_prefix}_mt.bam'
        self.mt_depth = f'{out_prefix}_mt_depth.tsv'
        self.coverage_plot = f'{out_prefix}_mt_coverage.png'
Beispiel #7
0
 def test_split_bam(self):
     """Run split_bam on the name-sorted featureCounts BAM of the fixture."""
     name_sorted_bam = './S20070818_TS/04.featureCounts/S20070818_TS_name_sorted.bam'
     cell_barcodes, _ = read_barcode_file(self.match_dir)
     id_to_name = convert(self.gene_list_file, self.gtf)
     shortest_query = 35
     split_bam(
         name_sorted_bam,
         cell_barcodes,
         self.outdir,
         self.sample,
         id_to_name,
         shortest_query,
     )
Beispiel #8
0
def count_capture_rna(args):
    """Count captured RNA: build the count table, call cells, write reports.

    Reads ``genomeDir``, ``outdir``, ``sample``, ``bam``, ``cells``,
    ``match_dir`` and ``assay`` from ``args``. Writes the count detail,
    probe gene count, marked counts, downsample and stat files under
    ``args.outdir`` and generates the report in its parent directory.
    """
    # check
    _refFlat, gtf = glob_genomeDir(args.genomeDir)
    id_name = gene_convert(gtf)

    # make sure the output directory exists (no shell command involved)
    os.makedirs(args.outdir, exist_ok=True)

    # UMI correction; writes a table with header Barcode, geneID, UMI, count
    count_detail_file = args.outdir + '/' + args.sample + '_count_detail.txt'
    df_probe = bam2table(args.bam, count_detail_file, id_name)
    df_probe.to_csv(f'{args.outdir}/{args.sample}_probe_gene_count.tsv',
                    sep='\t',
                    index=False)

    df = pd.read_table(count_detail_file, header=0)

    # call cells
    pdf = args.outdir + '/barcode_filter_magnitude.pdf'
    marked_counts_file = args.outdir + '/' + args.sample + '_counts.txt'
    (validated_barcodes, threshold, cell_num,
     CB_describe) = call_cells(df, args.cells, pdf, marked_counts_file)

    # match barcode
    sc_cell_barcodes, sc_cell_number = read_barcode_file(args.match_dir)

    # write the expression matrix
    (CB_total_Genes, CB_reads_count, reads_mapped_to_transcriptome,
     match_cell_str,
     match_UMI_median) = expression_matrix(df, validated_barcodes, args.outdir,
                                           args.sample, id_name,
                                           sc_cell_barcodes, sc_cell_number)

    # downsampling
    validated_barcodes = set(validated_barcodes)
    downsample_file = args.outdir + '/' + args.sample + '_downsample.txt'
    Saturation = downsample(count_detail_file, validated_barcodes,
                            downsample_file)

    # summary
    stat_file = args.outdir + '/stat.txt'
    get_summary(df, args.sample, Saturation, CB_describe, CB_total_Genes,
                CB_reads_count, reads_mapped_to_transcriptome, match_cell_str,
                match_UMI_median, stat_file, args.outdir + '/../')

    report_prepare(marked_counts_file, downsample_file, args.outdir + '/..')

    # reuse stat_file rather than rebuilding the same path a second time
    t = reporter(assay=args.assay,
                 name='count_capture_rna',
                 sample=args.sample,
                 stat_file=stat_file,
                 outdir=args.outdir + '/..')
    t.get_report()
Beispiel #9
0
 def setUp(self):
     """Change into the fixture directory and load the shared test inputs."""
     os.chdir('/SGRNJ01/RD_dir/pipeline_test/zhouyiqi/0910_panel/')

     # plain settings
     self.sample = 'S20071508_D_TS'

     # count detail table of the fixture sample
     detail_path = './/S20071508_D_TS/05.count_capture_rna/S20071508_D_TS_count_detail.txt'
     self.df = pd.read_table(detail_path, header=0)

     # matched barcodes
     self.match_dir = '/SGRNJ02/RandD4/RD20051303_Panel/20200729/S20071508_D_ZL'
     matched = read_barcode_file(self.match_dir)
     self.sc_cell_barcodes, self.sc_cell_number = matched

     self.outdir = f'{self.sample}/05.count_capture_rna/'
     self.genomeDir = '/SGRNJ/Public/Database/genome/homo_sapiens/ensembl_92'

     # validated barcodes come from the sample's matrix_10X output
     barcodes_tsv = f'{self.sample}/05.count_capture_rna/{self.sample}_matrix_10X/barcodes.tsv'
     self.validated_barcodes, _ = read_one_col(barcodes_tsv)

     _refFlat, self.gtf = glob_genomeDir(self.genomeDir)
     self.assay = 'capture_rna'
Beispiel #10
0
 def test_matrix_10X(self):
     """Write a 10X matrix for the test2 fixture into 'matrix_10X_new'."""
     fixture_match_dir = '/SGRNJ01/RD_dir/pipeline_test/zhouyiqi/unittest/rna/test2/'
     cell_barcodes, _ = read_barcode_file(fixture_match_dir)

     os.chdir('/SGRNJ01/RD_dir/pipeline_test/zhouyiqi/unittest/rna/')
     count_detail = pd.read_csv(
         '/SGRNJ01/RD_dir/pipeline_test/zhouyiqi/unittest/rna/test2/05.count/test2_count_detail.txt.gz',
         sep='\t')

     positional = (
         count_detail,
         'test2/05.count',
         'test2',
         '/SGRNJ/Public/Database/genome/homo_sapiens/ensembl_92/Homo_sapiens.GRCh38.92.chr.gtf',
     )
     matrix_10X(*positional,
                dir_name='matrix_10X_new',
                validated_barcodes=cell_barcodes)
Beispiel #11
0
def split_fq(args):
    """Split the fastq with split_run and write the resulting index table.

    Barcodes are read from ``args.match_dir`` when it is set and not the
    string 'None'; otherwise an empty string is passed on. ``args.nCell`` is
    converted to int under the same condition.
    """
    cell_limit = args.nCell
    outdir = args.outdir
    sample = args.sample
    match_dir = args.match_dir

    use_match = bool(match_dir) and match_dir != 'None'
    if not use_match:
        barcodes = ''
    else:
        barcodes, _ = read_barcode_file(args.match_dir)

    if cell_limit and cell_limit != 'None':
        cell_limit = int(cell_limit)

    splitter = split_run(args.fq, f'{args.outdir}/fastq', barcodes, cell_limit)
    splitter.df_index.to_csv(f'{outdir}/{sample}_index.tsv', sep='\t')
Beispiel #12
0
    def __init__(self, args):
        """Load barcodes, the genome fasta and the panel bed; name the outputs."""
        Step.__init__(self, args)

        # inputs
        barcode_info = utils.read_barcode_file(args.match_dir)
        self.barcodes, _ = barcode_info
        genome = Mkref_rna.parse_genomeDir(args.genomeDir)
        self.fasta = genome['fasta']
        self.df_vcf = None
        self.panel = args.panel
        self.bed = utils.get_bed_file_path(self.panel)

        # outputs, all derived from the step's out_prefix
        prefix = self.out_prefix
        self.splitN_bam = f'{prefix}_splitN.bam'
        self.splitN_bam_name_sorted = f'{prefix}_splitN_name_sorted.bam'

        self.raw_bcf_file = f'{prefix}_raw.bcf'
        self.raw_vcf_file = f'{prefix}_raw.vcf'
        self.fixed_header_vcf = f'{prefix}_fixed.vcf'
        self.norm_vcf_file = f'{prefix}_norm.vcf'
Beispiel #13
0
    def __init__(
        self,
        sample,
        outdir,
        assay,
        read_count_file,
        match_dir,
    ):
        """Store the run parameters, load the inputs and name the matrix file.

        Args:
            sample: stored unchanged; also used in the output file name.
            outdir: output directory; created (with parents) when missing.
            assay: stored on the instance unchanged.
            read_count_file: tab-separated file loaded into
                ``df_read_count`` with its first column as index.
            match_dir: directory passed to ``read_barcode_file`` and
                searched for ``*analysis/*tsne_coord.tsv``.

        Raises:
            IndexError: if no tsne coordinate file matches under
                ``match_dir``.
        """
        self.sample = sample
        self.outdir = outdir
        self.assay = assay
        self.read_count_file = read_count_file
        self.match_dir = match_dir
        self.match_barcode, self.cell_total = read_barcode_file(match_dir)
        self.df_read_count = pd.read_csv(read_count_file,
                                         sep="\t",
                                         index_col=0)
        # first match wins; an empty glob result raises IndexError
        self.tsne_file = glob.glob(f'{match_dir}/*analysis/*tsne_coord.tsv')[0]

        # os.makedirs instead of a shell `mkdir -p`: safe for any path text
        os.makedirs(outdir, exist_ok=True)

        # out
        self.mtx = f'{outdir}/{sample}_citeseq.mtx.gz'
Beispiel #14
0
def count_vdj(args):
    """Call cells from filtered UMI counts and derive clonetype tables.

    Reads ``sample``, ``match_dir``, ``UMI_min``, ``outdir``,
    ``UMI_count_filter1_file``, ``type``, ``debug``, ``iUMI`` and ``assay``
    from ``args``. Writes the cell-confident, cell-confident-count,
    clonetypes, (optionally) match-clonetypes and top10 tables plus
    ``stat.txt`` under ``outdir`` and generates two report sections in the
    parent directory of ``outdir``.
    """

    sample = args.sample
    match_dir = args.match_dir
    UMI_min = args.UMI_min
    outdir = args.outdir
    UMI_count_filter1_file = args.UMI_count_filter1_file
    # NOTE(review): `type` shadows the builtin for the rest of this function
    type = args.type
    debug = args.debug
    iUMI = int(args.iUMI)
    chains = CHAINS[type]

    if not os.path.exists(outdir):
        os.system('mkdir -p %s' % outdir)

    # out file
    cell_confident_file = f"{outdir}/{sample}_cell_confident.tsv"
    cell_confident_count_file = f"{outdir}/{sample}_cell_confident_count.tsv"
    clonetypes_file = f"{outdir}/{sample}_clonetypes.tsv"
    match_clonetypes_file = f"{outdir}/{sample}_match_clonetypes.tsv"
    top10_clonetypes_file = f"{outdir}/{sample}_top10_clonetypes.tsv"
    match_top10_clonetypes_file = f"{outdir}/{sample}_match_top10_clonetypes.tsv"

    # read file
    df_UMI_count_filter1 = pd.read_csv(UMI_count_filter1_file, sep='\t')
    # matching is enabled unless match_dir is empty or the string "None"
    if (not match_dir) or (match_dir == "None"):
        match_bool = False
    else:
        match_bool = True
    if match_bool:
        match_cell_barcodes, match_cell_number = read_barcode_file(match_dir)

    # rows of the summary table: item / count / total_count
    cell_summary_row_list = []

    # cell calling: keep barcodes whose summed UMI >= UMI_min
    df_UMI_sum = df_UMI_count_filter1.groupby(['barcode'], as_index=False).agg(
        {"UMI": "sum"})
    if (UMI_min == "auto"):
        # auto threshold: one tenth of the UMI sum at 0-based position 20 of
        # the descending ranking.
        # NOTE(review): iloc raises IndexError with 20 or fewer barcodes.
        rank = 20
        df_UMI_sum_sorted = df_UMI_sum.sort_values(["UMI"], ascending=False)
        rank_UMI = df_UMI_sum_sorted.iloc[rank, :]["UMI"]
        UMI_min = int(rank_UMI / 10)
    else:
        UMI_min = int(UMI_min)
    df_UMI_cell = df_UMI_sum[df_UMI_sum.UMI >= UMI_min]
    # mark every barcode as "CB" (passes the threshold) or "UB" (does not)
    df_UMI_sum["mark"] = df_UMI_sum["UMI"].apply(lambda x: "CB"
                                                 if (x >= UMI_min) else "UB")
    report_prepare(df_UMI_sum, outdir + "/../")

    cell_barcodes = set(df_UMI_cell.barcode)
    cell_number = len(cell_barcodes)
    cell_summary_row_list.append({
        "item": "Estimated Number of Cells",
        "count": cell_number,
        "total_count": cell_number,
    })

    # df_UMI_count_filter1 in cell
    df_cell = df_UMI_count_filter1[df_UMI_count_filter1.barcode.isin(
        cell_barcodes)]
    # filter2: cell with UMI >= iUMI of identical receptor type and CDR3
    # combinations.
    df_cell_UMI_count_filter2 = df_cell[df_cell.UMI >= iUMI]

    # cell confident: per (barcode, chain) keep the row with the highest UMI
    df_cell_confident = df_cell_UMI_count_filter2[
        df_cell_UMI_count_filter2["chain"].isin(chains)]
    df_cell_confident = df_cell_confident.sort_values("UMI", ascending=False)
    df_cell_confident = df_cell_confident.groupby(["barcode", "chain"],
                                                  as_index=False).head(1)

    # count: pivot to one row per barcode with "<column>_<chain>" columns
    df_cell_confident_count = df_cell_confident.set_index(["barcode", "chain"])
    df_cell_confident_count = df_cell_confident_count.unstack()
    df_cell_confident_count.columns = [
        '_'.join(col) for col in df_cell_confident_count
    ]
    df_cell_confident_count = df_cell_confident_count.reset_index()
    df_cell_confident_count.fillna(inplace=True, value="NA")

    # clonetypes
    seqs = ["aaSeqCDR3", "nSeqCDR3"]
    cols = []
    for chain in chains:
        for seq in seqs:
            cols.append("_".join([seq, chain]))

    # make sure every expected sequence column exists, filled with "NA"
    for col in cols:
        if not (col in list(df_cell_confident_count.columns)):
            df_cell_confident_count[col] = "NA"

    df_clonetypes = df_cell_confident_count.copy()

    df_clonetypes = df_clonetypes.groupby(cols, as_index=False).agg(
        {"barcode": "count"})
    # put na last
    df_clonetypes.replace('NA', np.nan, inplace=True)
    df_clonetypes.sort_values(["barcode"] + cols,
                              ascending=False,
                              na_position='last',
                              inplace=True)
    df_clonetypes.replace(np.nan, 'NA', inplace=True)

    total_CDR3_barcode_number = sum(df_clonetypes.barcode)
    df_clonetypes["percent"] = df_clonetypes.barcode / \
        total_CDR3_barcode_number * 100
    df_clonetypes["percent"] = df_clonetypes["percent"].apply(
        lambda x: round(x, 2))

    # add clonetype ID: 1-based position in the sorted table
    df_clonetypes = df_clonetypes.reset_index()
    df_clonetypes["clonetype_ID"] = pd.Series(df_clonetypes.index) + 1
    df_clonetypes.drop(columns=["index"], inplace=True)

    # order
    order = ["clonetype_ID"] + cols + ["barcode", "percent"]
    df_clonetypes = df_clonetypes[order]
    df_clonetypes.rename(columns={"barcode": "barcode_count"}, inplace=True)
    # out clonetypes
    df_clonetypes.to_csv(clonetypes_file, sep="\t", index=False)

    if type == "TCR":

        UMI_col_dic = {"TRA": "UMI_TRA", "TRB": "UMI_TRB"}
        for chain in UMI_col_dic:
            UMI_col_name = UMI_col_dic[chain]
            if UMI_col_name in df_cell_confident_count.columns:
                # for TCR the median includes cells with 0 UMI for the chain
                df_cell_confident_count[UMI_col_name].replace("NA",
                                                              0,
                                                              inplace=True)
                Median_chain_UMIs_per_Cell = np.median(
                    df_cell_confident_count[UMI_col_name])
            else:
                Median_chain_UMIs_per_Cell = 0
            cell_summary_row_list.append({
                "item":
                "Median {chain} UMIs per Cell".format(chain=chain),
                "count":
                Median_chain_UMIs_per_Cell,
                "total_count":
                np.nan
            })

        df_TRA_TRB = df_cell_confident_count[
            (df_cell_confident_count.aaSeqCDR3_TRA != "NA")
            & (df_cell_confident_count.aaSeqCDR3_TRB != "NA")]
        cell_with_confident_TRA_and_TRB = df_TRA_TRB.shape[0]
        cell_summary_row_list.append({
            "item": "Cell with TRA and TRB",
            "count": cell_with_confident_TRA_and_TRB,
            "total_count": cell_number,
        })
        """
        df cell barcode filter
        intersect cell_barcodes from scRNA-Seq with barcode from TCR seq
        """
        if match_bool:
            # NOTE(review): .intersection assumes read_barcode_file returned
            # a set-like object — confirm against its definition
            cell_with_match_barcode = match_cell_barcodes.intersection(
                cell_barcodes)
            cell_with_match_barcode_number = len(cell_with_match_barcode)

            df_match = df_cell_confident_count[
                df_cell_confident_count.barcode.isin(match_cell_barcodes)]

            df_match_TRA_TRB = df_match[(df_match.aaSeqCDR3_TRA != "NA")
                                        & (df_match.aaSeqCDR3_TRB != "NA")]
            match_cell_with_TRA_and_TRB = df_match_TRA_TRB.shape[0]

            cell_summary_row_list.append({
                "item": "Cell with Barcode Match",
                "count": cell_with_match_barcode_number,
                "total_count": cell_number,
            })
            cell_summary_row_list.append({
                "item": "Cell with Barcode Match, TRA and TRB",
                "count": match_cell_with_TRA_and_TRB,
                "total_count": cell_number,
            })

    # BCR
    elif type == "BCR":

        UMI_col_dic = {"IGH": "UMI_IGH", "IGL": "UMI_IGL", "IGK": "UMI_IGK"}
        for chain in UMI_col_dic:
            UMI_col_name = UMI_col_dic[chain]
            if UMI_col_name in df_cell_confident_count.columns:
                df_cell_confident_count[UMI_col_name].replace("NA",
                                                              0,
                                                              inplace=True)
                # unlike the TCR branch, the median only covers cells with
                # more than 0 UMI for the chain
                df_cell_confident_count_over_zero = df_cell_confident_count[
                    df_cell_confident_count[UMI_col_name] > 0]
                Median_chain_UMIs_per_Cell = np.median(
                    df_cell_confident_count_over_zero[UMI_col_name])
            else:
                Median_chain_UMIs_per_Cell = 0
            cell_summary_row_list.append({
                "item":
                "Median {chain} UMIs per Cell".format(chain=chain),
                "count":
                Median_chain_UMIs_per_Cell,
                "total_count":
                np.nan
            })

        # heavy chain (IGH) together with at least one light chain (IGL/IGK)
        df_heavy_and_light = df_cell_confident_count[
            (df_cell_confident_count.aaSeqCDR3_IGH != "NA")
            & ((df_cell_confident_count.aaSeqCDR3_IGL != "NA")
               | (df_cell_confident_count.aaSeqCDR3_IGK != "NA"))]
        Cell_with_Heavy_and_Light_Chain = df_heavy_and_light.shape[0]
        cell_summary_row_list.append({
            "item": "Cell with Heavy and Light Chain",
            "count": Cell_with_Heavy_and_Light_Chain,
            "total_count": cell_number
        })
        """
        df cell barcode filter
        intersect cell_barcodes from normal scRNA-Seq with barcode from BCR seq
        """
        if match_bool:
            # NOTE(review): .intersection assumes read_barcode_file returned
            # a set-like object — confirm against its definition
            cell_with_match_barcode = match_cell_barcodes.intersection(
                cell_barcodes)
            cell_with_match_barcode_number = len(cell_with_match_barcode)

            df_match = df_cell_confident_count[
                df_cell_confident_count.barcode.isin(match_cell_barcodes)]

            # matched cells with heavy and light chain
            df_match_heavy_light = df_match[
                (df_match.aaSeqCDR3_IGH != "NA")
                & ((df_match.aaSeqCDR3_IGL != "NA")
                   | (df_match.aaSeqCDR3_IGK != "NA"))]
            match_cell_with_heavy_and_light = df_match_heavy_light.shape[0]

            cell_summary_row_list.append({
                "item": "Cell with Barcode Match ",
                "count": cell_with_match_barcode_number,
                "total_count": cell_number
            })
            cell_summary_row_list.append({
                "item": "Cell with Barcode Match, Heavy and Light Chain",
                "count": match_cell_with_heavy_and_light,
                "total_count": cell_number
            })

    # NOTE(review): df_match is only assigned inside the TCR/BCR branches
    # above; with any other `type` value and match_bool set, the block below
    # would raise NameError.
    if match_bool:
        """
        df_match_clonetypes
        """
        df_match_clonetypes = df_match.groupby(cols, as_index=False).agg(
            {"barcode": "count"})
        total_match_CDR3_barcode_number = sum(df_match_clonetypes.barcode)
        df_match_clonetypes["percent"] = df_match_clonetypes.barcode / \
            total_match_CDR3_barcode_number * 100
        df_match_clonetypes["percent"] = df_match_clonetypes["percent"].apply(
            lambda x: round(x, 2))
        df_match_clonetypes.rename(columns={"barcode": "barcode_count"},
                                   inplace=True)
        # the left merge pulls in clonetype_ID; colliding columns coming from
        # df_clonetypes get the '_y' suffix and are dropped by the reorder
        df_match_clonetypes = df_match_clonetypes.merge(df_clonetypes,
                                                        on=cols,
                                                        how='left',
                                                        suffixes=('', '_y'))
        # order and drop duplicated cols
        order = ["clonetype_ID"] + cols + ["barcode_count", "percent"]
        df_match_clonetypes = df_match_clonetypes[order]
        df_match_clonetypes.sort_values(["barcode_count", "clonetype_ID"],
                                        ascending=[False, True],
                                        inplace=True)
        df_match_clonetypes.to_csv(match_clonetypes_file,
                                   sep="\t",
                                   index=False)

    # attach the clonetype ID to the per-cell tables
    df_mergeID = pd.merge(df_cell_confident_count,
                          df_clonetypes,
                          how="left",
                          on=cols)
    df_mergeID.sort_values(["clonetype_ID", "barcode"], inplace=True)
    # output df_cell_confident_count
    df_mergeID.to_csv(cell_confident_count_file, sep="\t", index=False)
    df_mergeID = df_mergeID[["barcode", "clonetype_ID"]]
    df_cell_confident_with_ID = pd.merge(df_cell_confident,
                                         df_mergeID,
                                         how="left",
                                         on="barcode")
    df_cell_confident_with_ID.sort_values(["clonetype_ID", "barcode", "chain"],
                                          inplace=True)
    # output df_cell_confident
    df_cell_confident_with_ID.to_csv(cell_confident_file,
                                     sep="\t",
                                     index=False)

    # summary file
    cell_summary = pd.DataFrame(cell_summary_row_list,
                                columns=["item", "count", "total_count"])
    cell_summary["count"] = cell_summary["count"].apply(int)
    cell_summary["percent"] = cell_summary["count"] / \
        (cell_summary.total_count.astype("float")) * 100
    cell_summary["percent"] = cell_summary["percent"].apply(
        lambda x: round(x, 2))
    cell_summary["count"] = cell_summary["count"].apply(format_number)

    def percent_str_func(row):
        """Return "(<percent>%)" for items containing "Cell with", else ""."""
        need_percent = bool(
            re.search("Cell with", row["item"], flags=re.IGNORECASE))
        if need_percent:
            return "(" + str(row["percent"]) + "%)"
        else:
            return ""

    cell_summary["percent_str"] = cell_summary.apply(
        lambda row: percent_str_func(row), axis=1)

    # stat file
    def gen_stat(summary, stat_file):
        """Write "item:count(percent)" lines to stat_file.

        Adds a "new_count" column to the frame that was passed in.
        """
        stat = summary
        stat["new_count"] = stat["count"].astype(str) + stat["percent_str"]
        stat = stat.loc[:, ["item", "new_count"]]
        stat.to_csv(stat_file, sep=":", header=None, index=False)

    cell_stat_file = "{}/stat.txt".format(outdir)
    gen_stat(cell_summary, cell_stat_file)
    name = type + '_count_vdj'
    t = reporter(
        name=name,
        sample=args.sample,
        stat_file=cell_stat_file,
        outdir=outdir + '/..',
        assay=args.assay,
        parameters={"iUMI": iUMI},
    )
    t.get_report()

    # clonetypes table
    def format_table(df_clonetypes, top10_clonetypes_file):
        """Write the first 10 rows with amino-acid CDR3 columns only.

        Returns the header list used for the report table.
        """
        top10_clonetypes_df = df_clonetypes.head(10)
        top10_clonetypes_df = top10_clonetypes_df.reset_index(drop=True)
        top10_clonetypes_df.index = top10_clonetypes_df.index + 1
        top10_clonetypes_df["percent"] = top10_clonetypes_df["percent"].apply(
            lambda x: str(x) + "%")
        seqs = ["aaSeqCDR3"]
        cols = []
        for chain in chains:
            for seq in seqs:
                cols.append("_".join([seq, chain]))
        top10_cols = ["clonetype_ID"] + cols + ["barcode_count", "percent"]
        top10_clonetypes_df = top10_clonetypes_df[top10_cols]
        top10_clonetypes_df.to_csv(top10_clonetypes_file,
                                   sep="\t",
                                   index=False)
        table_header = ["Clonetype_ID"] + cols + ["Frequency", "Percent"]
        return table_header

    table_header = format_table(df_clonetypes, top10_clonetypes_file)
    use_top10_clonetypes_file = top10_clonetypes_file
    section_header = 'Top10 clonetypes'
    # when matching is enabled the report shows the matched top10 instead
    if match_bool:
        format_table(df_match_clonetypes, match_top10_clonetypes_file)
        use_top10_clonetypes_file = match_top10_clonetypes_file
        section_header = 'Match Top10 clonetypes'

    t = reporter(
        name="clonetypes",
        sample=args.sample,
        table_file=use_top10_clonetypes_file,
        table_header=table_header,
        outdir=outdir + '/..',
        assay=args.assay,
        parameters={'section_header': section_header},
    )
    t.get_report()

    # other_metrics_file
    """
Beispiel #15
0
 def test_Barcode_index(self):
     """Build a Barcode_index from the matched barcodes and write it out."""
     cell_barcodes, _ = read_barcode_file(self.match_dir)
     index_builder = Barcode_index(cell_barcodes)
     index_builder.write_index('test_bi.tsv')
Beispiel #16
0
def count_fusion(args):
    """Count fusion-supporting reads per barcode/tag/UMI and plot them.

    Reads ``outdir``, ``sample``, ``bam``, ``flanking_base``, ``fusion_pos``,
    ``match_dir`` and ``UMI_min`` from ``args``. Reads that map to a
    reference listed in the fusion position file, belong to a matched
    barcode and pass ``is_fusion`` are copied to ``<prefix>_fusion.bam`` and
    counted. Writes the read-count table always; the UMI-count,
    barcode-count and tsne tables and the plot are only produced when at
    least one fusion read was found.
    """
    outdir = args.outdir
    sample = args.sample
    bam = args.bam
    flanking_base = int(args.flanking_base)
    fusion_pos_file = args.fusion_pos
    match_dir = args.match_dir
    UMI_min = int(args.UMI_min)

    # check dir: os.makedirs avoids building a shell command from the path
    os.makedirs(outdir, exist_ok=True)

    fusion_pos = read_pos(fusion_pos_file)
    out_prefix = outdir + "/" + sample
    # barcode
    match_barcode, _n_barcode = read_barcode_file(match_dir)
    # built once so the per-read membership test is O(1) whatever container
    # read_barcode_file returned
    match_barcode_set = set(match_barcode)
    # tsne
    match_tsne_file = parse_match_dir(match_dir)['tsne_coord']
    df_tsne = pd.read_csv(match_tsne_file, sep="\t", index_col=0)
    # out
    out_read_count_file = out_prefix + "_fusion_read_count.tsv"
    out_umi_count_file = out_prefix + "_fusion_UMI_count.tsv"
    out_barcode_count_file = out_prefix + "_fusion_barcode_count.tsv"
    out_tsne_file = out_prefix + "_fusion_tsne.tsv"

    # process bam
    count_dic = genDict(dim=3)
    samfile = pysam.AlignmentFile(bam, "rb")
    new_bam = pysam.AlignmentFile(out_prefix + "_fusion.bam",
                                  "wb",
                                  header=samfile.header)
    try:
        for read in samfile:
            tag = read.reference_name
            # skip reads outside the fusion references before any parsing
            if tag not in fusion_pos:
                continue
            # query name carries barcode and UMI as its first two fields
            attr = read.query_name.split('_')
            barcode = attr[0]
            umi = attr[1]
            if barcode not in match_barcode_set:
                continue
            if is_fusion(pos=fusion_pos[tag],
                         read_start=int(read.reference_start),
                         read_length=len(read.query_sequence),
                         flanking_base=flanking_base):
                new_bam.write(read)
                count_dic[barcode][tag][umi] += 1
    finally:
        # close both handles even when a read fails to parse
        new_bam.close()
        samfile.close()

    # write dic to pandas df
    rows = []
    for barcode in count_dic:
        for tag in count_dic[barcode]:
            for umi in count_dic[barcode][tag]:
                rows.append([barcode, tag, umi, count_dic[barcode][tag][umi]])
    df_read = pd.DataFrame(rows)
    df_read.rename(columns={
        0: "barcode",
        1: "tag",
        2: "UMI",
        3: "read_count"
    },
                   inplace=True)
    df_read.to_csv(out_read_count_file, sep="\t", index=False)

    if not rows:
        count_fusion.logger.error('***** NO FUSION FOUND! *****')
    else:
        # number of distinct UMIs per (barcode, tag), thresholded by UMI_min
        df_umi = df_read.groupby(["barcode", "tag"]).agg({"UMI": "count"})
        df_umi = df_umi[df_umi["UMI"] >= UMI_min]
        df_umi.to_csv(out_umi_count_file, sep="\t")

        df_umi.reset_index(inplace=True)
        df_barcode = df_umi.groupby(["tag"]).agg({"barcode": "count"})
        n_match_barcode = len(match_barcode)
        # add zero count tag: reindex appends the missing tags with count 0
        # (DataFrame.append is no longer available in pandas 2.x)
        missing_tags = [
            tag for tag in fusion_pos if tag not in df_barcode.index
        ]
        if missing_tags:
            df_barcode = df_barcode.reindex(
                df_barcode.index.tolist() + missing_tags, fill_value=0)
        df_barcode["percent"] = df_barcode["barcode"] / n_match_barcode
        df_barcode.to_csv(out_barcode_count_file, sep="\t")

        df_pivot = df_umi.pivot(index="barcode", columns="tag", values="UMI")
        df_pivot.fillna(0, inplace=True)
        df_tsne_fusion = pd.merge(df_tsne,
                                  df_pivot,
                                  right_index=True,
                                  left_index=True,
                                  how="left")
        df_tsne_fusion.fillna(0, inplace=True)
        df_tsne_fusion.to_csv(out_tsne_file, sep="\t")

        # plot
        count_fusion.logger.info("plot fusion...!")
        app = fusionDir + "/plot_fusion.R"
        cmd = f"Rscript {app} --tsne_fusion {out_tsne_file} --outdir {outdir}"
        os.system(cmd)
        count_fusion.logger.info("plot done.")
Beispiel #17
0
def count_smk(args):
    """Assign a sample tag to every matched cell and write tables and report.

    Reads ``read_file``, ``match_dir``, ``UMI_min``, ``SNR_min``, ``dim``,
    ``combine_cluster``, ``outdir``, ``sample`` and ``assay`` from ``args``.
    Writes the UMI/tag table, the tsne/tag table, the cluster count tables
    and plots, and ``stat.txt`` under ``outdir``; the report is generated in
    the parent directory of ``outdir``.

    Raises:
        IndexError: if no ``*analysis/*tsne_coord.tsv`` file is found under
            ``match_dir``.
    """
    read_file = args.read_file
    match_dir = args.match_dir
    tsne_file = glob.glob(f'{match_dir}/*analysis/*tsne_coord.tsv')[0]
    UMI_min = args.UMI_min
    SNR_min = args.SNR_min
    dim = int(args.dim)
    combine_cluster = args.combine_cluster
    outdir = args.outdir
    sample = args.sample
    assay = args.assay

    # os.makedirs avoids building a shell command from the path
    os.makedirs(outdir, exist_ok=True)

    # stat rows are collected as one-element Series and concatenated once at
    # the end (Series.append is no longer available in pandas 2.x)
    stat_parts = []

    def add_stat(name, value):
        """Queue one ``name: value`` line for the stat file."""
        stat_parts.append(pd.Series(value, index=[name]))

    # process
    match_barcode, cell_total = read_barcode_file(match_dir)

    UMI_tag_file = f'{outdir}/{sample}_umi_tag.tsv'
    tsne_tag_file = f'{outdir}/{sample}_tsne_tag.tsv'
    cluster_count_file = f'{outdir}/{sample}_cluster_count.tsv'
    cluster_plot = f'{outdir}/{sample}_cluster_plot.pdf'
    if combine_cluster:
        combine_cluster_count_file = f'{outdir}/{sample}_combine_cluster_count.tsv'
        combine_cluster_plot = f'{outdir}/{sample}_combine_cluster_plot.pdf'

    df_read_count = pd.read_csv(read_file, sep="\t", index_col=0)
    mapped_read = df_read_count['read_count'].sum()

    # in cell
    df_read_count_in_cell = df_read_count[df_read_count.index.isin(
        match_barcode)]
    mapped_read_in_cell = int(df_read_count_in_cell['read_count'].sum())
    add_stat('Mapped Reads in Cells',
             format_stat(mapped_read_in_cell, mapped_read))

    # UMI: count UMIs per (barcode, SMK tag) and pivot to barcode x tag
    df_UMI_in_cell = df_read_count_in_cell.reset_index().groupby(
        ['barcode', 'SMK_barcode_name']).agg({'UMI': 'count'})
    df_UMI_in_cell = df_UMI_in_cell.reset_index()
    df_UMI_in_cell = df_UMI_in_cell.pivot(index='barcode',
                                          columns='SMK_barcode_name',
                                          values='UMI')
    # left merge onto all matched barcodes so cells without reads get a row
    df_cell = pd.DataFrame(index=match_barcode)
    df_UMI_cell = pd.merge(df_cell,
                           df_UMI_in_cell,
                           how="left",
                           left_index=True,
                           right_index=True)

    # fillna
    df_UMI_cell.fillna(0, inplace=True)
    df_UMI_cell = df_UMI_cell.astype(int)

    # UMI per cell over all tags
    UMIs = df_UMI_cell.sum(axis=1)
    median = round(np.median(UMIs), 2)
    mean = round(np.mean(UMIs), 2)
    add_stat('Median UMI per Cell', str(median))
    add_stat('Mean UMI per Cell', str(mean))

    UMI_min = get_UMI_min(df_UMI_cell, UMI_min)
    count_smk.logger.info(f'UMI_min: {UMI_min}')
    SNR_min = get_SNR_min(df_UMI_cell, dim, SNR_min, UMI_min)
    count_smk.logger.info(f'SNR_min: {SNR_min}')
    df_UMI_cell["tag"] = df_UMI_cell.apply(tag_type,
                                           UMI_min=UMI_min,
                                           SNR_min=SNR_min,
                                           dim=dim,
                                           axis=1)
    df_UMI_cell.to_csv(UMI_tag_file, sep="\t")

    df_tsne = pd.read_csv(tsne_file, sep="\t", index_col=0)
    df_tsne_tag = pd.merge(df_tsne,
                           df_UMI_cell,
                           how="left",
                           left_index=True,
                           right_index=True)

    if combine_cluster:
        df_combine_cluster = pd.read_csv(combine_cluster,
                                         sep="\t",
                                         header=None)
        df_combine_cluster.columns = ["cluster", "combine_cluster"]
        # `on` cannot be combined with `left_index` in pandas.merge; the
        # left merge keeps the left row order, so the barcode index is
        # restored afterwards from df_tsne_tag.
        df_tsne_combine_cluster_tag = pd.merge(df_tsne_tag,
                                               df_combine_cluster,
                                               on=["cluster"],
                                               how="left").set_index(
                                                   df_tsne_tag.index)
        df_tsne_combine_cluster_tag.to_csv(tsne_tag_file, sep="\t")
    else:
        df_tsne_tag.to_csv(tsne_tag_file, sep="\t")

    write_and_plot(df=df_tsne_tag,
                   column_name="cluster",
                   count_file=cluster_count_file,
                   plot_file=cluster_plot)

    if combine_cluster:
        write_and_plot(df=df_tsne_combine_cluster_tag,
                       column_name="combine_cluster",
                       count_file=combine_cluster_count_file,
                       plot_file=combine_cluster_plot)

    # one stat line per assigned tag value, most frequent first
    for tag_name, n_cells in df_UMI_cell["tag"].value_counts().items():
        add_stat(tag_name + ' Cells', format_stat(n_cells, cell_total))

    stats = pd.concat(stat_parts)
    stat_file = f'{outdir}/stat.txt'
    stats.to_csv(stat_file, sep=':', header=False)

    t = reporter(name='count_smk',
                 assay=assay,
                 sample=sample,
                 stat_file=stat_file,
                 outdir=outdir + '/..')
    t.get_report()