def __init__(
        self, sample, outdir, assay, read_count_file, match_dir,
        UMI_min, SNR_min, combine_cluster, dim,
):
    self.sample = sample
    self.outdir = outdir
    self.assay = assay
    self.read_count_file = read_count_file
    self.match_dir = match_dir
    self.UMI_min = UMI_min
    self.SNR_min = SNR_min
    self.combine_cluster = combine_cluster
    self.dim = int(dim)
    self.match_barcode, self.cell_total = read_barcode_file(match_dir)
    self.df_read_count = pd.read_csv(read_count_file, sep="\t", index_col=0)
    self.tsne_file = glob.glob(f'{match_dir}/*analysis/*tsne_coord.tsv')[0]
    self.no_noise = False
    if not os.path.exists(outdir):
        os.system('mkdir -p %s' % outdir)
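# Every snippet in this section calls `read_barcode_file(match_dir)`. A minimal
# sketch of its assumed contract: it locates the cell barcode file inside a
# matched scRNA-Seq directory and returns (barcode_list, n_cell). The glob
# pattern and return types are assumptions inferred from how the callers use
# the result, not the actual CeleScope implementation.
import glob

def read_barcode_file_sketch(match_dir):
    # hypothetical: find the cell barcode list written by the count step
    barcode_file = glob.glob(f'{match_dir}/*count/*_matrix_10X/barcodes.tsv')[0]
    with open(barcode_file) as f:
        barcodes = [line.strip() for line in f if line.strip()]
    return barcodes, len(barcodes)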
def __init__(self, args, display_title=None):
    Step.__init__(self, args, display_title=display_title)

    # set
    self.match_barcode_list, self.n_cell = utils.read_barcode_file(args.match_dir)
    self.match_barcode = set(self.match_barcode_list)
    if args.panel:
        self.gene_list = utils.get_gene_region_from_bed(args.panel)[0]
        self.n_gene = len(self.gene_list)
    else:
        self.gene_list, self.n_gene = utils.read_one_col(args.gene_list)
    if not self.gene_list:
        sys.exit("You must provide either --panel or --gene_list!")

    self.count_dict = utils.genDict(dim=3, valType=int)

    self.add_metric(
        name="Number of Target Genes",
        value=self.n_gene,
    )
    self.add_metric(
        name="Number of Cells",
        value=self.n_cell,
    )

    # out file
    self.out_bam_file = f'{self.out_prefix}_filtered.bam'
    self.out_bam_file_sorted = f'{self.out_prefix}_filtered_sorted.bam'
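# `utils.genDict(dim=3, valType=int)` above suggests an arbitrarily nested
# defaultdict. A minimal sketch of such a helper, assuming `dim` counts the
# nesting levels and `valType` is the leaf type; this mirrors the later usage
# `count_dic[barcode][tag][umi] += 1` in this section.
from collections import defaultdict

def genDict_sketch(dim=3, valType=int):
    if dim == 1:
        return defaultdict(valType)
    return defaultdict(lambda: genDict_sketch(dim - 1, valType))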
def __init__(self, args, display_title=None):
    Step.__init__(self, args, display_title=display_title)

    # set
    self.chains = CHAINS[args.type]
    self.cols = []
    for chain in self.chains:
        for seq in SEQUENCES_HEADER:
            self.cols.append("_".join([seq, chain]))

    self.match_bool = False
    if args.match_dir and args.match_dir.strip() != 'None':
        self.match_cell_barcodes, _match_cell_number = utils.read_barcode_file(
            args.match_dir)
        self.match_bool = True
    elif args.matrix_dir and args.matrix_dir.strip() != 'None':
        self.match_cell_barcodes = utils.get_barcodes_from_matrix_dir(
            args.matrix_dir)
        self.match_bool = True
    if self.match_bool:
        self.match_cell_barcodes = set(self.match_cell_barcodes)

    # out files
    self.cell_confident_file = f"{self.out_prefix}_cell_confident.tsv"
    self.cell_confident_count_file = f"{self.out_prefix}_cell_confident_count.tsv"
    self.clonetypes_file = f"{self.out_prefix}_clonetypes.tsv"
    self.match_clonetypes_file = f"{self.out_prefix}_match_clonetypes.tsv"

    # add args data
    self.add_data(iUMI=args.iUMI)
def snpCalling(args):
    sample = args.sample
    outdir = args.outdir
    thread = int(args.thread)
    match_dir = args.match_dir
    bam = args.bam
    genomeDir = args.genomeDir
    gene_list_file = args.gene_list
    min_query_length = args.min_query_length

    # process args
    barcodes, _nCell = read_barcode_file(match_dir)
    # check dir
    if not os.path.exists(outdir):
        os.system('mkdir -p %s' % (outdir))
    # get genome file
    _refFlat, gtf, fasta = glob_genomeDir(genomeDir, fa=True)
    # convert gene
    gene_id_name_dic = convert(gene_list_file, gtf)
    # split bam
    index_file, count_file = split_bam(
        bam, barcodes, outdir, sample, gene_id_name_dic, min_query_length)
    # snp
    call_all_snp(index_file, outdir, thread, fasta)
    # summary
    summary(index_file, count_file, outdir, sample)
def mapping_hla(args):
    sample = args.sample
    outdir = args.outdir
    fq = args.fq
    thread = int(args.thread)
    match_dir = args.match_dir

    # process args
    barcodes, _nCell = read_barcode_file(match_dir)
    # check dir
    if not os.path.exists(outdir):
        os.system('mkdir -p %s' % (outdir))
    # razer
    out_bam = razer(fq, outdir, sample, thread)
    # split bam
    index_file, _count_file = split_bam(out_bam, barcodes, outdir, sample)
    # typing
    hla_typing(index_file, outdir, thread)
    # summary
    summary(index_file, outdir, sample)
def __init__(self, sample, outdir, genomeDir, root_dir):
    self.sample = sample
    self.outdir = outdir

    # set
    match_dir = f'{root_dir}/{sample}'
    self.mt_gene_list_file = Mkref.parse_genomeDir(genomeDir)['mt_gene_list']
    _barcodes, self.ncell = utils.read_barcode_file(match_dir)
    self.bam = None
    try:
        self.bam = glob.glob(
            f'{match_dir}/03*/{sample}*sortedByCoord.out.bam')[0]
    except IndexError:
        print("STAR bam does not exist! Skip coverage summary.")
    self.matrix_dir = glob.glob(f'{match_dir}/*count/{sample}_matrix_10X')[0]

    # out
    if not os.path.exists(outdir):
        os.system(f'mkdir -p {outdir}')
    out_prefix = f'{outdir}/{sample}'
    self.mt_bam = f'{out_prefix}_mt.bam'
    self.mt_depth = f'{out_prefix}_mt_depth.tsv'
    self.coverage_plot = f'{out_prefix}_mt_coverage.png'
def test_split_bam(self):
    bam = './S20070818_TS/04.featureCounts/S20070818_TS_name_sorted.bam'
    barcodes, _nCell = read_barcode_file(self.match_dir)
    gene_id_name_dic = convert(self.gene_list_file, self.gtf)
    min_query_length = 35
    split_bam(bam, barcodes, self.outdir, self.sample,
              gene_id_name_dic, min_query_length)
def count_capture_rna(args):
    # check
    _refFlat, gtf = glob_genomeDir(args.genomeDir)
    id_name = gene_convert(gtf)

    # check and create the output directory
    if not os.path.exists(args.outdir):
        os.system('mkdir -p %s' % (args.outdir))

    # UMI correction; output a table with header: Barcode, geneID, UMI, count
    count_detail_file = args.outdir + '/' + args.sample + '_count_detail.txt'
    df_probe = bam2table(args.bam, count_detail_file, id_name)
    df_probe.to_csv(f'{args.outdir}/{args.sample}_probe_gene_count.tsv',
                    sep='\t', index=False)

    df = pd.read_table(count_detail_file, header=0)

    # call cells
    pdf = args.outdir + '/barcode_filter_magnitude.pdf'
    marked_counts_file = args.outdir + '/' + args.sample + '_counts.txt'
    (validated_barcodes, threshold, cell_num,
     CB_describe) = call_cells(df, args.cells, pdf, marked_counts_file)

    # match barcode
    sc_cell_barcodes, sc_cell_number = read_barcode_file(args.match_dir)

    # output the expression matrix
    (CB_total_Genes, CB_reads_count, reads_mapped_to_transcriptome,
     match_cell_str, match_UMI_median) = expression_matrix(
        df, validated_barcodes, args.outdir, args.sample, id_name,
        sc_cell_barcodes, sc_cell_number)

    # downsampling
    validated_barcodes = set(validated_barcodes)
    downsample_file = args.outdir + '/' + args.sample + '_downsample.txt'
    Saturation = downsample(count_detail_file, validated_barcodes, downsample_file)

    # summary
    stat_file = args.outdir + '/stat.txt'
    get_summary(df, args.sample, Saturation, CB_describe, CB_total_Genes,
                CB_reads_count, reads_mapped_to_transcriptome, match_cell_str,
                match_UMI_median, stat_file, args.outdir + '/../')

    report_prepare(marked_counts_file, downsample_file, args.outdir + '/..')

    t = reporter(assay=args.assay, name='count_capture_rna', sample=args.sample,
                 stat_file=args.outdir + '/stat.txt', outdir=args.outdir + '/..')
    t.get_report()
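# The `downsample` call above returns a saturation value. A toy sketch of the
# usual sequencing-saturation calculation it likely performs: subsample reads
# at a given fraction, then measure what share are duplicates of an already
# seen (barcode, geneID, UMI) triple. The function name and tuple layout are
# illustrative, not the actual `downsample` implementation.
import random

def saturation_at_fraction(read_triples, fraction, seed=0):
    """read_triples: one (barcode, geneID, UMI) tuple per sequenced read."""
    rng = random.Random(seed)
    sampled = [t for t in read_triples if rng.random() < fraction]
    if not sampled:
        return 0.0
    # saturation = 1 - unique triples / sampled reads
    return 1 - len(set(sampled)) / len(sampled)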
def setUp(self):
    os.chdir('/SGRNJ01/RD_dir/pipeline_test/zhouyiqi/0910_panel/')
    self.sample = 'S20071508_D_TS'
    count_detail_file = './S20071508_D_TS/05.count_capture_rna/S20071508_D_TS_count_detail.txt'
    self.df = pd.read_table(count_detail_file, header=0)
    self.match_dir = '/SGRNJ02/RandD4/RD20051303_Panel/20200729/S20071508_D_ZL'
    self.sc_cell_barcodes, self.sc_cell_number = read_barcode_file(self.match_dir)
    self.outdir = f'{self.sample}/05.count_capture_rna/'
    self.genomeDir = '/SGRNJ/Public/Database/genome/homo_sapiens/ensembl_92'
    self.validated_barcodes, _ = read_one_col(
        f'{self.sample}/05.count_capture_rna/{self.sample}_matrix_10X/barcodes.tsv')
    _refFlat, self.gtf = glob_genomeDir(self.genomeDir)
    self.assay = 'capture_rna'
def test_matrix_10X(self):
    match_dir = '/SGRNJ01/RD_dir/pipeline_test/zhouyiqi/unittest/rna/test2/'
    validated_barcodes, _ncell = read_barcode_file(match_dir)
    os.chdir('/SGRNJ01/RD_dir/pipeline_test/zhouyiqi/unittest/rna/')
    df = pd.read_csv(
        '/SGRNJ01/RD_dir/pipeline_test/zhouyiqi/unittest/rna/test2/05.count/test2_count_detail.txt.gz',
        sep='\t')
    outdir = 'test2/05.count'
    sample = 'test2'
    gtf_file = '/SGRNJ/Public/Database/genome/homo_sapiens/ensembl_92/Homo_sapiens.GRCh38.92.chr.gtf'
    matrix_10X(df, outdir, sample, gtf_file, dir_name='matrix_10X_new',
               validated_barcodes=validated_barcodes)
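# `matrix_10X` writes a 10X-style matrix directory. A minimal sketch of that
# output layout (barcodes.tsv, genes.tsv, matrix.mtx) using scipy; the file
# names are assumed from the barcodes.tsv path used elsewhere in this section,
# and the helper name is hypothetical.
import os
import scipy.io
import scipy.sparse

def write_matrix_10X_sketch(matrix, genes, barcodes, outdir):
    os.makedirs(outdir, exist_ok=True)
    with open(f'{outdir}/barcodes.tsv', 'w') as f:
        f.write('\n'.join(barcodes) + '\n')
    with open(f'{outdir}/genes.tsv', 'w') as f:
        f.write('\n'.join(genes) + '\n')
    # genes x barcodes sparse matrix in MatrixMarket format
    scipy.io.mmwrite(f'{outdir}/matrix.mtx', scipy.sparse.csr_matrix(matrix))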
def split_fq(args):
    nCell = args.nCell
    outdir = args.outdir
    sample = args.sample
    match_dir = args.match_dir
    if match_dir and match_dir != 'None':
        barcodes, _nCell = read_barcode_file(args.match_dir)
    else:
        barcodes = ''
    fq_outdir = f'{args.outdir}/fastq'
    if nCell and nCell != 'None':
        nCell = int(nCell)
    bi = split_run(args.fq, fq_outdir, barcodes, nCell)
    index_file = f'{outdir}/{sample}_index.tsv'
    bi.df_index.to_csv(index_file, sep='\t')
def __init__(self, args):
    Step.__init__(self, args)

    # set
    self.barcodes, _num = utils.read_barcode_file(args.match_dir)
    self.fasta = Mkref_rna.parse_genomeDir(args.genomeDir)['fasta']
    self.df_vcf = None
    self.panel = args.panel
    self.bed = utils.get_bed_file_path(self.panel)

    # out
    self.splitN_bam = f'{self.out_prefix}_splitN.bam'
    self.splitN_bam_name_sorted = f'{self.out_prefix}_splitN_name_sorted.bam'
    self.raw_bcf_file = f'{self.out_prefix}_raw.bcf'
    self.raw_vcf_file = f'{self.out_prefix}_raw.vcf'
    self.fixed_header_vcf = f'{self.out_prefix}_fixed.vcf'
    self.norm_vcf_file = f'{self.out_prefix}_norm.vcf'
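# The output names above trace a variant-calling chain: split reads at N CIGAR
# operations, call variants into a raw BCF/VCF, fix the header, then
# left-normalize. A hedged sketch of only the last step using standard
# `bcftools norm -f` usage; the exact commands this class runs are not shown
# here, and the function name is hypothetical.
import subprocess

def norm_vcf_sketch(fixed_header_vcf, fasta, norm_vcf_file):
    # left-align and normalize indels against the reference fasta
    subprocess.check_call(
        f'bcftools norm -f {fasta} {fixed_header_vcf} -o {norm_vcf_file}',
        shell=True)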
def __init__(
        self, sample, outdir, assay, read_count_file, match_dir,
):
    self.sample = sample
    self.outdir = outdir
    self.assay = assay
    self.read_count_file = read_count_file
    self.match_dir = match_dir
    self.match_barcode, self.cell_total = read_barcode_file(match_dir)
    self.df_read_count = pd.read_csv(read_count_file, sep="\t", index_col=0)
    self.tsne_file = glob.glob(f'{match_dir}/*analysis/*tsne_coord.tsv')[0]
    if not os.path.exists(outdir):
        os.system('mkdir -p %s' % outdir)

    # out
    self.mtx = f'{outdir}/{sample}_citeseq.mtx.gz'
def count_vdj(args):
    sample = args.sample
    match_dir = args.match_dir
    UMI_min = args.UMI_min
    outdir = args.outdir
    UMI_count_filter1_file = args.UMI_count_filter1_file
    type = args.type
    debug = args.debug
    iUMI = int(args.iUMI)
    chains = CHAINS[type]

    if not os.path.exists(outdir):
        os.system('mkdir -p %s' % outdir)

    # out file
    cell_confident_file = f"{outdir}/{sample}_cell_confident.tsv"
    cell_confident_count_file = f"{outdir}/{sample}_cell_confident_count.tsv"
    clonetypes_file = f"{outdir}/{sample}_clonetypes.tsv"
    match_clonetypes_file = f"{outdir}/{sample}_match_clonetypes.tsv"
    top10_clonetypes_file = f"{outdir}/{sample}_top10_clonetypes.tsv"
    match_top10_clonetypes_file = f"{outdir}/{sample}_match_top10_clonetypes.tsv"

    # read file
    df_UMI_count_filter1 = pd.read_csv(UMI_count_filter1_file, sep='\t')
    if (not match_dir) or (match_dir == "None"):
        match_bool = False
    else:
        match_bool = True
    if match_bool:
        match_cell_barcodes, match_cell_number = read_barcode_file(match_dir)

    cell_summary_row_list = []

    # cell calling: keep barcodes with UMI sum >= UMI_min
    # (with "auto", UMI_min is a tenth of the UMI count of the rank-20 barcode;
    #  a standalone sketch of this heuristic follows the function)
    df_UMI_sum = df_UMI_count_filter1.groupby(['barcode'], as_index=False).agg(
        {"UMI": "sum"})
    if (UMI_min == "auto"):
        rank = 20
        df_UMI_sum_sorted = df_UMI_sum.sort_values(["UMI"], ascending=False)
        rank_UMI = df_UMI_sum_sorted.iloc[rank, :]["UMI"]
        UMI_min = int(rank_UMI / 10)
    else:
        UMI_min = int(UMI_min)
    df_UMI_cell = df_UMI_sum[df_UMI_sum.UMI >= UMI_min]
    df_UMI_sum["mark"] = df_UMI_sum["UMI"].apply(
        lambda x: "CB" if (x >= UMI_min) else "UB")
    report_prepare(df_UMI_sum, outdir + "/../")

    cell_barcodes = set(df_UMI_cell.barcode)
    cell_number = len(cell_barcodes)
    cell_summary_row_list.append({
        "item": "Estimated Number of Cells",
        "count": cell_number,
        "total_count": cell_number,
    })

    # df_UMI_count_filter1 in cell
    df_cell = df_UMI_count_filter1[df_UMI_count_filter1.barcode.isin(
        cell_barcodes)]
    # filter2: keep cells with UMI >= iUMI for identical receptor type and
    # CDR3 combinations
    df_cell_UMI_count_filter2 = df_cell[df_cell.UMI >= iUMI]

    # cell confident
    df_cell_confident = df_cell_UMI_count_filter2[
        df_cell_UMI_count_filter2["chain"].isin(chains)]
    df_cell_confident = df_cell_confident.sort_values("UMI", ascending=False)
    df_cell_confident = df_cell_confident.groupby(
        ["barcode", "chain"], as_index=False).head(1)

    # count
    df_cell_confident_count = df_cell_confident.set_index(["barcode", "chain"])
    df_cell_confident_count = df_cell_confident_count.unstack()
    df_cell_confident_count.columns = [
        '_'.join(col) for col in df_cell_confident_count]
    df_cell_confident_count = df_cell_confident_count.reset_index()
    df_cell_confident_count.fillna(inplace=True, value="NA")

    # clonetypes
    seqs = ["aaSeqCDR3", "nSeqCDR3"]
    cols = []
    for chain in chains:
        for seq in seqs:
            cols.append("_".join([seq, chain]))

    for col in cols:
        if not (col in list(df_cell_confident_count.columns)):
            df_cell_confident_count[col] = "NA"

    df_clonetypes = df_cell_confident_count.copy()
    df_clonetypes = df_clonetypes.groupby(cols, as_index=False).agg(
        {"barcode": "count"})
    # put NA last
    df_clonetypes.replace('NA', np.nan, inplace=True)
    df_clonetypes.sort_values(["barcode"] + cols, ascending=False,
                              na_position='last', inplace=True)
    df_clonetypes.replace(np.nan, 'NA', inplace=True)

    total_CDR3_barcode_number = sum(df_clonetypes.barcode)
    df_clonetypes["percent"] = df_clonetypes.barcode / \
        total_CDR3_barcode_number * 100
    df_clonetypes["percent"] = df_clonetypes["percent"].apply(
        lambda x: round(x, 2))

    # add clonetype ID
    df_clonetypes = df_clonetypes.reset_index()
    df_clonetypes["clonetype_ID"] = pd.Series(df_clonetypes.index) + 1
    df_clonetypes.drop(columns=["index"], inplace=True)

    # order
    order = ["clonetype_ID"] + cols + ["barcode", "percent"]
    df_clonetypes = df_clonetypes[order]
    df_clonetypes.rename(columns={"barcode": "barcode_count"}, inplace=True)
    # out clonetypes
    df_clonetypes.to_csv(clonetypes_file, sep="\t", index=False)

    if type == "TCR":
        UMI_col_dic = {"TRA": "UMI_TRA", "TRB": "UMI_TRB"}
        for chain in UMI_col_dic:
            UMI_col_name = UMI_col_dic[chain]
            if UMI_col_name in df_cell_confident_count.columns:
                df_cell_confident_count[UMI_col_name].replace(
                    "NA", 0, inplace=True)
                Median_chain_UMIs_per_Cell = np.median(
                    df_cell_confident_count[UMI_col_name])
            else:
                Median_chain_UMIs_per_Cell = 0
            cell_summary_row_list.append({
                "item": "Median {chain} UMIs per Cell".format(chain=chain),
                "count": Median_chain_UMIs_per_Cell,
                "total_count": np.nan
            })

        df_TRA_TRB = df_cell_confident_count[
            (df_cell_confident_count.aaSeqCDR3_TRA != "NA") &
            (df_cell_confident_count.aaSeqCDR3_TRB != "NA")]
        cell_with_confident_TRA_and_TRB = df_TRA_TRB.shape[0]
        cell_summary_row_list.append({
            "item": "Cell with TRA and TRB",
            "count": cell_with_confident_TRA_and_TRB,
            "total_count": cell_number,
        })

        """
        df cell barcode filter
        intersect cell_barcodes from scRNA-Seq with barcode from TCR seq
        """
        if match_bool:
            cell_with_match_barcode = match_cell_barcodes.intersection(
                cell_barcodes)
            cell_with_match_barcode_number = len(cell_with_match_barcode)

            df_match = df_cell_confident_count[
                df_cell_confident_count.barcode.isin(match_cell_barcodes)]

            df_match_TRA_TRB = df_match[
                (df_match.aaSeqCDR3_TRA != "NA") &
                (df_match.aaSeqCDR3_TRB != "NA")]
            match_cell_with_TRA_and_TRB = df_match_TRA_TRB.shape[0]
            cell_summary_row_list.append({
                "item": "Cell with Barcode Match",
                "count": cell_with_match_barcode_number,
                "total_count": cell_number,
            })
            cell_summary_row_list.append({
                "item": "Cell with Barcode Match, TRA and TRB",
                "count": match_cell_with_TRA_and_TRB,
                "total_count": cell_number,
            })

    # BCR
    elif type == "BCR":
        UMI_col_dic = {"IGH": "UMI_IGH", "IGL": "UMI_IGL", "IGK": "UMI_IGK"}
        for chain in UMI_col_dic:
            UMI_col_name = UMI_col_dic[chain]
            if UMI_col_name in df_cell_confident_count.columns:
                df_cell_confident_count[UMI_col_name].replace(
                    "NA", 0, inplace=True)
                df_cell_confident_count_over_zero = df_cell_confident_count[
                    df_cell_confident_count[UMI_col_name] > 0]
                Median_chain_UMIs_per_Cell = np.median(
                    df_cell_confident_count_over_zero[UMI_col_name])
            else:
                Median_chain_UMIs_per_Cell = 0
            cell_summary_row_list.append({
                "item": "Median {chain} UMIs per Cell".format(chain=chain),
                "count": Median_chain_UMIs_per_Cell,
                "total_count": np.nan
            })

        df_heavy_and_light = df_cell_confident_count[
            (df_cell_confident_count.aaSeqCDR3_IGH != "NA") &
            ((df_cell_confident_count.aaSeqCDR3_IGL != "NA") |
             (df_cell_confident_count.aaSeqCDR3_IGK != "NA"))]
        Cell_with_Heavy_and_Light_Chain = df_heavy_and_light.shape[0]
        cell_summary_row_list.append({
            "item": "Cell with Heavy and Light Chain",
            "count": Cell_with_Heavy_and_Light_Chain,
            "total_count": cell_number
        })

        """
        df cell barcode filter
        intersect cell_barcodes from normal scRNA-Seq with barcode from BCR seq
        """
        if match_bool:
            cell_with_match_barcode = match_cell_barcodes.intersection(
                cell_barcodes)
            cell_with_match_barcode_number = len(cell_with_match_barcode)

            df_match = df_cell_confident_count[
                df_cell_confident_count.barcode.isin(match_cell_barcodes)]

            # median match UMI
            df_match_heavy_light = df_match[
                (df_match.aaSeqCDR3_IGH != "NA") &
                ((df_match.aaSeqCDR3_IGL != "NA") |
                 (df_match.aaSeqCDR3_IGK != "NA"))]
            match_cell_with_heavy_and_light = df_match_heavy_light.shape[0]
            cell_summary_row_list.append({
                "item": "Cell with Barcode Match",
                "count": cell_with_match_barcode_number,
                "total_count": cell_number
            })
            cell_summary_row_list.append({
                "item": "Cell with Barcode Match, Heavy and Light Chain",
                "count": match_cell_with_heavy_and_light,
                "total_count": cell_number
            })

    if match_bool:
        """
        df_match_clonetypes
        """
        df_match_clonetypes = df_match.groupby(cols, as_index=False).agg(
            {"barcode": "count"})
        total_match_CDR3_barcode_number = sum(df_match_clonetypes.barcode)
        df_match_clonetypes["percent"] = df_match_clonetypes.barcode / \
            total_match_CDR3_barcode_number * 100
        df_match_clonetypes["percent"] = df_match_clonetypes["percent"].apply(
            lambda x: round(x, 2))
        df_match_clonetypes.rename(
            columns={"barcode": "barcode_count"}, inplace=True)
        df_match_clonetypes = df_match_clonetypes.merge(
            df_clonetypes, on=cols, how='left', suffixes=('', '_y'))
        # order and drop duplicated cols
        order = ["clonetype_ID"] + cols + ["barcode_count", "percent"]
        df_match_clonetypes = df_match_clonetypes[order]
        df_match_clonetypes.sort_values(
            ["barcode_count", "clonetype_ID"], ascending=[False, True],
            inplace=True)
        df_match_clonetypes.to_csv(match_clonetypes_file, sep="\t", index=False)

    df_mergeID = pd.merge(df_cell_confident_count, df_clonetypes,
                          how="left", on=cols)
    df_mergeID.sort_values(["clonetype_ID", "barcode"], inplace=True)
    # output df_cell_confident_count
    df_mergeID.to_csv(cell_confident_count_file, sep="\t", index=False)
    df_mergeID = df_mergeID[["barcode", "clonetype_ID"]]
    df_cell_confident_with_ID = pd.merge(df_cell_confident, df_mergeID,
                                         how="left", on="barcode")
    df_cell_confident_with_ID.sort_values(["clonetype_ID", "barcode", "chain"],
                                          inplace=True)
    # output df_cell_confident
    df_cell_confident_with_ID.to_csv(cell_confident_file, sep="\t", index=False)

    # summary file
    cell_summary = pd.DataFrame(cell_summary_row_list,
                                columns=["item", "count", "total_count"])
    cell_summary["count"] = cell_summary["count"].apply(int)
    cell_summary["percent"] = cell_summary["count"] / \
        (cell_summary.total_count.astype("float")) * 100
    cell_summary["percent"] = cell_summary["percent"].apply(
        lambda x: round(x, 2))
    cell_summary["count"] = cell_summary["count"].apply(format_number)

    def percent_str_func(row):
        need_percent = bool(
            re.search("Cell with", row["item"], flags=re.IGNORECASE))
        if need_percent:
            return "(" + str(row["percent"]) + "%)"
        else:
            return ""

    cell_summary["percent_str"] = cell_summary.apply(
        lambda row: percent_str_func(row), axis=1)

    # stat file
    def gen_stat(summary, stat_file):
        stat = summary
        stat["new_count"] = stat["count"].astype(str) + stat["percent_str"]
        stat = stat.loc[:, ["item", "new_count"]]
        stat.to_csv(stat_file, sep=":", header=None, index=False)

    cell_stat_file = "{}/stat.txt".format(outdir)
    gen_stat(cell_summary, cell_stat_file)

    name = type + '_count_vdj'
    t = reporter(
        name=name,
        sample=args.sample,
        stat_file=cell_stat_file,
        outdir=outdir + '/..',
        assay=args.assay,
        parameters={"iUMI": iUMI},
    )
    t.get_report()

    # clonetypes table
    def format_table(df_clonetypes, top10_clonetypes_file):
        top10_clonetypes_df = df_clonetypes.head(10)
        top10_clonetypes_df = top10_clonetypes_df.reset_index(drop=True)
        top10_clonetypes_df.index = top10_clonetypes_df.index + 1
        top10_clonetypes_df["percent"] = top10_clonetypes_df["percent"].apply(
            lambda x: str(x) + "%")
        seqs = ["aaSeqCDR3"]
        cols = []
        for chain in chains:
            for seq in seqs:
                cols.append("_".join([seq, chain]))
        top10_cols = ["clonetype_ID"] + cols + ["barcode_count", "percent"]
        top10_clonetypes_df = top10_clonetypes_df[top10_cols]
        top10_clonetypes_df.to_csv(top10_clonetypes_file, sep="\t", index=False)
        table_header = ["Clonetype_ID"] + cols + ["Frequency", "Percent"]
        return table_header

    table_header = format_table(df_clonetypes, top10_clonetypes_file)
    use_top10_clonetypes_file = top10_clonetypes_file
    section_header = 'Top10 clonetypes'

    if match_bool:
        format_table(df_match_clonetypes, match_top10_clonetypes_file)
        use_top10_clonetypes_file = match_top10_clonetypes_file
        section_header = 'Match Top10 clonetypes'

    t = reporter(
        name="clonetypes",
        sample=args.sample,
        table_file=use_top10_clonetypes_file,
        table_header=table_header,
        outdir=outdir + '/..',
        assay=args.assay,
        parameters={'section_header': section_header},
    )
    t.get_report()

    # other_metrics_file
def test_Barcode_index(self):
    barcodes, _nCell = read_barcode_file(self.match_dir)
    bi = Barcode_index(barcodes)
    bi.write_index('test_bi.tsv')
def count_fusion(args):
    outdir = args.outdir
    sample = args.sample
    bam = args.bam
    flanking_base = int(args.flanking_base)
    fusion_pos_file = args.fusion_pos
    match_dir = args.match_dir
    UMI_min = int(args.UMI_min)

    # check dir
    if not os.path.exists(outdir):
        os.system('mkdir -p %s' % (outdir))

    fusion_pos = read_pos(fusion_pos_file)
    out_prefix = outdir + "/" + sample
    # barcode
    match_barcode, _n_barcode = read_barcode_file(match_dir)
    # tsne
    match_tsne_file = parse_match_dir(match_dir)['tsne_coord']
    df_tsne = pd.read_csv(match_tsne_file, sep="\t", index_col=0)
    # out
    out_read_count_file = out_prefix + "_fusion_read_count.tsv"
    out_umi_count_file = out_prefix + "_fusion_UMI_count.tsv"
    out_barcode_count_file = out_prefix + "_fusion_barcode_count.tsv"
    out_tsne_file = out_prefix + "_fusion_tsne.tsv"

    # process bam
    samfile = pysam.AlignmentFile(bam, "rb")
    header = samfile.header
    new_bam = pysam.AlignmentFile(out_prefix + "_fusion.bam", "wb",
                                  header=header)
    count_dic = genDict(dim=3)
    for read in samfile:
        tag = read.reference_name
        read_start = int(read.reference_start)
        read_length = len(read.query_sequence)
        attr = read.query_name.split('_')
        barcode = attr[0]
        umi = attr[1]
        if tag in fusion_pos.keys():
            if barcode in match_barcode:
                if is_fusion(pos=fusion_pos[tag], read_start=read_start,
                             read_length=read_length,
                             flanking_base=flanking_base):
                    new_bam.write(read)
                    count_dic[barcode][tag][umi] += 1
    new_bam.close()

    # write dic to pandas df
    rows = []
    for barcode in count_dic:
        for tag in count_dic[barcode]:
            for umi in count_dic[barcode][tag]:
                rows.append([barcode, tag, umi, count_dic[barcode][tag][umi]])
    df_read = pd.DataFrame(rows)
    df_read.rename(columns={0: "barcode", 1: "tag", 2: "UMI", 3: "read_count"},
                   inplace=True)
    df_read.to_csv(out_read_count_file, sep="\t", index=False)

    if not rows:
        count_fusion.logger.error('***** NO FUSION FOUND! *****')
    else:
        df_umi = df_read.groupby(["barcode", "tag"]).agg({"UMI": "count"})
        df_umi = df_umi[df_umi["UMI"] >= UMI_min]
        df_umi.to_csv(out_umi_count_file, sep="\t")
        df_umi.reset_index(inplace=True)

        df_barcode = df_umi.groupby(["tag"]).agg({"barcode": "count"})
        n_match_barcode = len(match_barcode)
        # add zero count tag
        for tag in fusion_pos.keys():
            if tag not in df_barcode.index:
                new_row = pd.Series(data={'barcode': 0}, name=tag)
                df_barcode = df_barcode.append(new_row, ignore_index=False)
        df_barcode["percent"] = df_barcode["barcode"] / n_match_barcode
        df_barcode.to_csv(out_barcode_count_file, sep="\t")

        df_pivot = df_umi.pivot(index="barcode", columns="tag", values="UMI")
        df_pivot.fillna(0, inplace=True)
        df_tsne_fusion = pd.merge(df_tsne, df_pivot, right_index=True,
                                  left_index=True, how="left")
        df_tsne_fusion.fillna(0, inplace=True)
        df_tsne_fusion.to_csv(out_tsne_file, sep="\t")

        # plot
        count_fusion.logger.info("plot fusion...!")
        app = fusionDir + "/plot_fusion.R"
        cmd = f"Rscript {app} --tsne_fusion {out_tsne_file} --outdir {outdir}"
        os.system(cmd)
        count_fusion.logger.info("plot done.")
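# `is_fusion` is not shown above. From its call site (pos, read_start,
# read_length, flanking_base) it plausibly checks that the read fully spans
# the fusion breakpoint with at least `flanking_base` bases on each side.
# A hedged sketch of that test, not the actual implementation:
def is_fusion_sketch(pos, read_start, read_length, flanking_base):
    read_end = read_start + read_length
    return (read_start <= pos - flanking_base) and \
           (read_end >= pos + flanking_base)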
def count_smk(args):
    read_file = args.read_file
    match_dir = args.match_dir
    tsne_file = glob.glob(f'{match_dir}/*analysis/*tsne_coord.tsv')[0]
    UMI_min = args.UMI_min
    SNR_min = args.SNR_min
    dim = int(args.dim)
    combine_cluster = args.combine_cluster
    outdir = args.outdir
    sample = args.sample
    assay = args.assay

    if not os.path.exists(outdir):
        os.system('mkdir -p %s' % (outdir))

    # stat_row
    stats = pd.Series()

    # process
    match_barcode, cell_total = read_barcode_file(match_dir)

    UMI_tag_file = f'{outdir}/{sample}_umi_tag.tsv'
    tsne_tag_file = f'{outdir}/{sample}_tsne_tag.tsv'
    cluster_count_file = f'{outdir}/{sample}_cluster_count.tsv'
    cluster_plot = f'{outdir}/{sample}_cluster_plot.pdf'
    if combine_cluster:
        combine_cluster_count_file = f'{outdir}/{sample}_combine_cluster_count.tsv'
        combine_cluster_plot = f'{outdir}/{sample}_combine_cluster_plot.pdf'

    df_read_count = pd.read_csv(read_file, sep="\t", index_col=0)
    mapped_read = df_read_count['read_count'].sum()

    # in cell
    df_read_count_in_cell = df_read_count[df_read_count.index.isin(
        match_barcode)]
    mapped_read_in_cell = int(df_read_count_in_cell['read_count'].sum())
    stats = stats.append(pd.Series(
        format_stat(mapped_read_in_cell, mapped_read),
        index=['Mapped Reads in Cells']))

    # UMI
    df_UMI_in_cell = df_read_count_in_cell.reset_index().groupby(
        ['barcode', 'SMK_barcode_name']).agg({'UMI': 'count'})
    df_UMI_in_cell = df_UMI_in_cell.reset_index()
    df_UMI_in_cell = df_UMI_in_cell.pivot(
        index='barcode', columns='SMK_barcode_name', values='UMI')
    df_cell = pd.DataFrame(index=match_barcode)
    df_UMI_cell = pd.merge(df_cell, df_UMI_in_cell, how="left",
                           left_index=True, right_index=True)

    # fillna
    df_UMI_cell.fillna(0, inplace=True)
    df_UMI_cell = df_UMI_cell.astype(int)

    # UMI
    UMIs = df_UMI_cell.apply(sum, axis=1)
    median = round(np.median(UMIs), 2)
    mean = round(np.mean(UMIs), 2)
    stats = stats.append(pd.Series(str(median), index=['Median UMI per Cell']))
    stats = stats.append(pd.Series(str(mean), index=['Mean UMI per Cell']))

    UMI_min = get_UMI_min(df_UMI_cell, UMI_min)
    count_smk.logger.info(f'UMI_min: {UMI_min}')
    SNR_min = get_SNR_min(df_UMI_cell, dim, SNR_min, UMI_min)
    count_smk.logger.info(f'SNR_min: {SNR_min}')
    df_UMI_cell["tag"] = df_UMI_cell.apply(
        tag_type, UMI_min=UMI_min, SNR_min=SNR_min, dim=dim, axis=1)
    df_UMI_cell.to_csv(UMI_tag_file, sep="\t")

    df_tsne = pd.read_csv(tsne_file, sep="\t", index_col=0)
    df_tsne_tag = pd.merge(df_tsne, df_UMI_cell, how="left",
                           left_index=True, right_index=True)

    if combine_cluster:
        df_combine_cluster = pd.read_csv(combine_cluster, sep="\t", header=None)
        df_combine_cluster.columns = ["cluster", "combine_cluster"]
        df_tsne_combine_cluster_tag = pd.merge(
            df_tsne_tag, df_combine_cluster,
            on=["cluster"], how="left").set_index(df_tsne_tag.index)
        df_tsne_combine_cluster_tag.to_csv(tsne_tag_file, sep="\t")
    else:
        df_tsne_tag.to_csv(tsne_tag_file, sep="\t")

    write_and_plot(df=df_tsne_tag, column_name="cluster",
                   count_file=cluster_count_file, plot_file=cluster_plot)

    if combine_cluster:
        write_and_plot(df=df_tsne_combine_cluster_tag,
                       column_name="combine_cluster",
                       count_file=combine_cluster_count_file,
                       plot_file=combine_cluster_plot)

    df_tag_count = df_UMI_cell["tag"].value_counts().reset_index()
    df_tag_count.columns = ["item", "count"]
    for index, row in df_tag_count.iterrows():
        stats = stats.append(pd.Series(
            format_stat(row['count'], cell_total),
            index=[row['item'] + ' Cells']))

    stat_file = f'{outdir}/stat.txt'
    stats.to_csv(stat_file, sep=':', header=False)

    t = reporter(name='count_smk', assay=assay, sample=sample,
                 stat_file=stat_file, outdir=outdir + '/..')
    t.get_report()
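# `tag_type` above assigns each cell a tag from its per-tag UMI counts.
# A plausible sketch given the UMI_min/SNR_min/dim parameters: a cell is
# "Undetermined" if the weakest of its top `dim` tags falls below UMI_min,
# "Multiplet" if the signal-to-noise ratio against the next tag falls below
# SNR_min, otherwise it gets the top tag name(s). Assumed logic, not the
# actual CeleScope implementation:
def tag_type_sketch(row, UMI_min, SNR_min, dim):
    # row: per-cell UMI counts, one column per SMK tag
    umis = sorted(row.items(), key=lambda x: x[1], reverse=True)
    signal = umis[dim - 1][1]  # weakest of the top `dim` tags
    noise = umis[dim][1] if len(umis) > dim else 0
    if signal < UMI_min:
        return "Undetermined"
    snr = signal / noise if noise > 0 else float("inf")
    if snr < SNR_min:
        return "Multiplet"
    return "_".join(name for name, _ in umis[:dim])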