def matrix_10X(df, outdir, sample, gtf_file, dir_name='matrix_10X', validated_barcodes=None):
    """Write a 10X-style matrix directory (genes.tsv, barcodes.tsv, matrix).

    Args:
        df: DataFrame with at least the columns 'geneID', 'Barcode' and 'UMI'.
        outdir: Parent directory for the output.
        sample: Sample name, used as prefix of the output directory.
        gtf_file: GTF path handed to ``gene_convert`` to obtain the
            gene id -> gene name mapping.
        dir_name: Suffix of the output directory (``{sample}_{dir_name}``).
        validated_barcodes: Optional collection of barcodes; when truthy,
            only rows whose 'Barcode' is in it are kept.

    Raises:
        KeyError: if a gene id present in ``df`` is missing from the mapping
            returned by ``gene_convert``.
    """
    matrix_10X_dir = f"{outdir}/{sample}_{dir_name}/"
    # makedirs also creates missing parents and tolerates an existing directory.
    os.makedirs(matrix_10X_dir, exist_ok=True)

    id_name = gene_convert(gtf_file)

    if validated_barcodes:
        df = df.loc[df['Barcode'].isin(validated_barcodes), :]

    # One row per (geneID, Barcode) pair holding the number of UMI entries.
    df_UMI = df.groupby(['geneID', 'Barcode']).agg({'UMI': 'count'})

    # MultiIndex.labels was renamed to .codes (and later removed); support both.
    umi_index = df_UMI.index
    codes = umi_index.codes if hasattr(umi_index, 'codes') else umi_index.labels
    mtx = coo_matrix((df_UMI.UMI, (codes[0], codes[1])))

    gene_ids = umi_index.levels[0].to_series()
    # add gene symbol
    gene_names = gene_ids.apply(lambda x: id_name[x])
    genes = pd.concat([gene_ids, gene_names], axis=1)
    genes.columns = ['gene_id', 'gene_name']

    barcodes = umi_index.levels[1].to_series()

    genes.to_csv(f'{matrix_10X_dir}/genes.tsv', index=False, sep='\t', header=False)
    # header=False is explicit: recent pandas writes a header line for a
    # Series by default, which would shift barcodes against matrix columns.
    barcodes.to_csv(f'{matrix_10X_dir}/barcodes.tsv', index=False, sep='\t', header=False)
    mmwrite(f'{matrix_10X_dir}/matrix', mtx)
def count_capture_rna(args):
    """Run the capture-RNA counting step and build its report.

    Reads ``args.genomeDir``, ``args.outdir``, ``args.sample``, ``args.bam``,
    ``args.cells``, ``args.match_dir`` and ``args.assay``. All result files
    are written under ``args.outdir``; the summary and report helpers are
    pointed at its parent directory.
    """
    # check
    _refFlat, gtf = glob_genomeDir(args.genomeDir)
    id_name = gene_convert(gtf)

    # Check for and create the output directory. os.makedirs avoids building
    # a shell command from the path and raises if creation fails.
    os.makedirs(args.outdir, exist_ok=True)

    # UMI correction; output a table whose header is Barcode, geneID, UMI, count.
    # NOTE(review): count_detail_file is read back right below, so it is
    # presumably written by bam2table - confirm.
    count_detail_file = args.outdir + '/' + args.sample + '_count_detail.txt'
    df_probe = bam2table(args.bam, count_detail_file, id_name)
    df_probe.to_csv(f'{args.outdir}/{args.sample}_probe_gene_count.tsv', sep='\t', index=False)

    df = pd.read_table(count_detail_file, header=0)

    # call cells
    pdf = args.outdir + '/barcode_filter_magnitude.pdf'
    marked_counts_file = args.outdir + '/' + args.sample + '_counts.txt'
    (validated_barcodes, _threshold, _cell_num, CB_describe) = call_cells(
        df, args.cells, pdf, marked_counts_file)

    # match barcode
    sc_cell_barcodes, sc_cell_number = read_barcode_file(args.match_dir)

    # Output the expression matrix.
    (CB_total_Genes, CB_reads_count, reads_mapped_to_transcriptome,
     match_cell_str, match_UMI_median) = expression_matrix(
        df, validated_barcodes, args.outdir, args.sample, id_name,
        sc_cell_barcodes, sc_cell_number)

    # downsampling
    validated_barcodes = set(validated_barcodes)
    downsample_file = args.outdir + '/' + args.sample + '_downsample.txt'
    Saturation = downsample(count_detail_file, validated_barcodes, downsample_file)

    # summary
    stat_file = args.outdir + '/stat.txt'
    get_summary(df, args.sample, Saturation, CB_describe, CB_total_Genes,
                CB_reads_count, reads_mapped_to_transcriptome, match_cell_str,
                match_UMI_median, stat_file, args.outdir + '/../')

    report_prepare(marked_counts_file, downsample_file, args.outdir + '/..')

    report = reporter(assay=args.assay, name='count_capture_rna', sample=args.sample,
                      stat_file=stat_file, outdir=args.outdir + '/..')
    report.get_report()
def generate_matrix(gtf_file, matrix_file):
    """Load a tab-separated matrix and replace its gene ids with gene names.

    The 'geneID' column is translated through the mapping returned by
    ``gene_convert(gtf_file)`` (a missing id raises ``KeyError``). Rows with
    a repeated gene name keep only their first occurrence, rows containing
    NA values are dropped, and the 'geneID' column is renamed to an empty
    string before the frame is returned.
    """
    id_to_name = gene_convert(gtf_file)

    def to_name(gene_id):
        return id_to_name[gene_id]

    table = pd.read_csv(matrix_file, sep="\t")
    table["geneID"] = table["geneID"].apply(to_name)
    table = table.drop_duplicates(subset=["geneID"], keep="first").dropna()
    return table.rename({"geneID": ""}, axis='columns')
def convert(gene_list_file, gtf):
    """Map the gene names listed in a file back to their gene ids.

    Gene names are read with ``read_one_col`` and the id -> name mapping
    comes from ``gene_convert(gtf)``. That mapping is inverted (when several
    ids share a name, the last one iterated wins) and used to look up each
    listed name.

    Returns:
        dict mapping gene id -> gene name for the listed genes.

    Raises:
        KeyError: if a listed gene name is not present in the mapping.
    """
    wanted_names, _count = read_one_col(gene_list_file)
    id_to_name = gene_convert(gtf)

    # Invert id -> name into name -> id.
    name_to_id = {id_to_name[gene_id]: gene_id for gene_id in id_to_name}

    return {name_to_id[symbol]: symbol for symbol in wanted_names}
def test_gtf(self):
    """Print the ``gene_convert`` entry for 'tRNA-Asp' from a hard-coded GTF."""
    # Earlier manual checks, kept for reference (disabled):
    #   gtf_file = '/SGRNJ/Database/script/genome/hs/gtf/Homo_sapiens.GRCh38.99.gtf'
    #   id_name = gene_convert(gtf_file)
    #   print(f"ENSG00000001629: {id_name['ENSG00000001629']}")
    #   gtf_file = '/SGRNJ01/RD_dir/pipeline_test/litao/genomes/Cricetulus_griseus/Cricetulus_griseus_crigri.CriGri_1.0.101.gtf'
    #   id_name = gene_convert(gtf_file)
    #   print(id_name)
    pig_gtf = '/Public/Database/genome/Sus_scrofa/ncbi/GCF_000003025.6_Sscrofa11.1_genomic_new.gtf'
    gene_map = gene_convert(pig_gtf)
    print(gene_map['tRNA-Asp'])
def expression_matrix(df, validated_barcodes, outdir, sample, gtf_file):
    """Write the cell expression matrix in 10X and table form; return totals.

    Side effect: a 'mark' column is added to (or overwritten on) the
    caller's ``df`` - 'CB' for rows whose 'Barcode' is in
    ``validated_barcodes``, 'UB' otherwise.

    Args:
        df: DataFrame with columns 'Barcode', 'geneID', 'UMI' and 'count'.
        validated_barcodes: Collection of barcodes to mark as 'CB'.
        outdir: Output directory.
        sample: Sample name, used as file/directory prefix.
        gtf_file: GTF path handed to ``gene_convert`` to obtain the
            gene id -> gene name mapping.

    Returns:
        Tuple ``(CB_total_Genes, CB_reads_count, reads_mapped_to_transcriptome)``:
        number of distinct gene ids in 'CB' rows, sum of 'count' over 'CB'
        rows, and sum of 'count' over all rows.

    Raises:
        KeyError: if a gene id in the matrix is missing from the mapping.
    """
    matrix_10X_dir = f"{outdir}/{sample}_matrix_10X/"
    matrix_table_file = f"{outdir}/{sample}_matrix.tsv.gz"
    # makedirs also creates missing parents and tolerates an existing directory.
    os.makedirs(matrix_10X_dir, exist_ok=True)

    df.loc[:, 'mark'] = 'UB'
    df.loc[df['Barcode'].isin(validated_barcodes), 'mark'] = 'CB'
    is_cell = df['mark'] == 'CB'

    CB_total_Genes = df.loc[is_cell, 'geneID'].nunique()
    CB_reads_count = df.loc[is_cell, 'count'].sum()
    reads_mapped_to_transcriptome = df['count'].sum()

    # genes x barcodes table; each cell is the number of rows for that pair.
    table = df.loc[is_cell, :].pivot_table(
        index='geneID', columns='Barcode', values='UMI',
        aggfunc=len).fillna(0).astype(int)

    id_name = gene_convert(gtf_file)
    gene_ids = table.index.to_series()
    gene_names = gene_ids.apply(lambda x: id_name[x])
    genes = pd.concat([gene_ids, gene_names], axis=1)
    genes.columns = ['gene_id', 'gene_name']

    # write 10X matrix
    # header=False is explicit: recent pandas writes a header line for a
    # Series by default, which would shift barcodes against matrix columns.
    table.columns.to_series().to_csv(
        f'{matrix_10X_dir}/barcodes.tsv', index=False, sep='\t', header=False)
    genes.to_csv(
        f'{matrix_10X_dir}/genes.tsv', index=False, header=False, sep='\t')
    mmwrite(f'{matrix_10X_dir}/matrix', csr_matrix(table))

    # convert id to name; write table matrix
    table.index = gene_names
    table.index.name = ""
    table.to_csv(matrix_table_file, sep="\t", compression='gzip')

    return (CB_total_Genes, CB_reads_count, reads_mapped_to_transcriptome)
def expression_matrix(df, validated_barcodes, outdir, sample, gtf_file):
    """Write the gene-name x barcode count table for cells; return totals.

    Side effect: a 'mark' column is added to (or overwritten on) the
    caller's ``df`` - 'CB' for rows whose 'Barcode' is in
    ``validated_barcodes``, 'UB' otherwise.

    The table (rows: gene names, columns: barcodes, values: number of rows
    per pair among 'CB' rows) is written gzip-compressed to
    ``{outdir}/{sample}_matrix.tsv.gz``.

    Returns:
        Tuple ``(CB_total_Genes, CB_reads_count, reads_mapped_to_transcriptome)``.
    """
    gene_symbols = gene_convert(gtf_file)

    def symbol_of(gene_id):
        return gene_symbols[gene_id]

    # Mark every row as background first, then flag validated barcodes.
    df.loc[:, 'mark'] = 'UB'
    in_cells = df['Barcode'].isin(validated_barcodes)
    df.loc[in_cells, 'mark'] = 'CB'

    cell_rows = df['mark'] == 'CB'
    CB_total_Genes = df.loc[cell_rows, 'geneID'].nunique()
    CB_reads_count = df.loc[cell_rows, 'count'].sum()
    reads_mapped_to_transcriptome = df['count'].sum()

    counts = df.loc[cell_rows, :].pivot_table(
        index='geneID',
        columns='Barcode',
        values='UMI',
        aggfunc=len,
    )
    counts = counts.fillna(0).astype(int)

    # convert id to name; write table matrix
    counts.index = counts.index.to_series().apply(symbol_of)
    counts.index.name = ""
    counts.to_csv(f"{outdir}/{sample}_matrix.tsv.gz", sep="\t", compression='gzip')

    return (CB_total_Genes, CB_reads_count, reads_mapped_to_transcriptome)