Ejemplo n.º 1
0
def matrix_10X(df,
               outdir,
               sample,
               gtf_file,
               dir_name='matrix_10X',
               validated_barcodes=None):
    """Write a sparse gene x barcode UMI matrix plus its gene/barcode lists.

    Rows of ``df`` are grouped by ``geneID`` and ``Barcode`` and the number
    of ``UMI`` entries in each group becomes the matrix value.  Three files
    are written into ``{outdir}/{sample}_{dir_name}/``:

    * ``genes.tsv``    -- gene ID and gene name, tab separated, no header
    * ``barcodes.tsv`` -- one barcode per line, no header
    * ``matrix``       -- Matrix Market file written with ``mmwrite``
      (scipy is expected to append the ``.mtx`` extension)

    Args:
        df: DataFrame with at least the columns ``Barcode``, ``geneID``
            and ``UMI``.
        outdir: Directory under which the matrix directory is created.
        sample: Sample name, used as prefix of the matrix directory.
        gtf_file: Annotation file handed to ``gene_convert``; the result is
            indexed by gene ID to obtain the gene name.
        dir_name: Suffix of the matrix directory name.
        validated_barcodes: Optional collection of barcodes to keep.  A
            falsy value (``None`` or an empty collection) disables the
            filter, as in the original implementation.

    Raises:
        KeyError: Presumably, if a gene ID is missing from the mapping
            returned by ``gene_convert`` (depends on that helper's return
            type -- verify).
    """
    matrix_10X_dir = f"{outdir}/{sample}_{dir_name}/"
    # makedirs also creates missing parents and does not race with a
    # concurrent creation of the same directory.
    os.makedirs(matrix_10X_dir, exist_ok=True)
    id_name = gene_convert(gtf_file)

    if validated_barcodes:
        df = df.loc[df['Barcode'].isin(validated_barcodes), :]

    df_UMI = df.groupby(['geneID', 'Barcode']).agg({'UMI': 'count'})
    umi_index = df_UMI.index
    # MultiIndex.labels was renamed to .codes in pandas 0.24 and removed in
    # 1.0; fall back to the old name only on legacy installations.
    try:
        codes = umi_index.codes
    except AttributeError:
        codes = umi_index.labels
    gene_level = umi_index.levels[0]
    barcode_level = umi_index.levels[1]
    # Pin the shape to the level sizes so the matrix always has exactly as
    # many rows/columns as genes.tsv/barcodes.tsv have lines.
    mtx = coo_matrix((df_UMI.UMI, (codes[0], codes[1])),
                     shape=(len(gene_level), len(barcode_level)))

    gene_ids = gene_level.to_series()
    # add gene symbol
    gene_names = gene_ids.apply(lambda x: id_name[x])
    genes = pd.concat([gene_ids, gene_names], axis=1)
    genes.columns = ['gene_id', 'gene_name']

    barcodes = barcode_level.to_series()
    genes.to_csv(f'{matrix_10X_dir}/genes.tsv',
                 index=False,
                 sep='\t',
                 header=False)
    # header=False is explicit because Series.to_csv writes a header row by
    # default since pandas 0.24, which would add a bogus first barcode.
    barcodes.to_csv(f'{matrix_10X_dir}/barcodes.tsv',
                    index=False,
                    sep='\t',
                    header=False)
    mmwrite(f'{matrix_10X_dir}/matrix', mtx)
Ejemplo n.º 2
0
def count_capture_rna(args):
    """Run the count step for capture-RNA data and build its report.

    The function chains the project helpers in this order: build the gene
    ID -> name mapping, convert the BAM into a count-detail table, call
    cells, read the matched single-cell barcodes, write the expression
    matrix, compute saturation by downsampling, and finally write the
    summary and the report.

    Args:
        args: Namespace-like object.  The attributes read here are
            ``genomeDir``, ``outdir``, ``sample``, ``bam``, ``cells``,
            ``match_dir`` and ``assay``.

    Files written directly by this function live in ``args.outdir``; the
    summary/report helpers are pointed at its parent directory.
    """
    # Locate the annotation and build the gene ID -> gene name mapping.
    _refFlat, gtf = glob_genomeDir(args.genomeDir)
    id_name = gene_convert(gtf)

    # Check for and create the output directory.  os.makedirs replaces the
    # former shell call (`mkdir -p`), which broke on paths containing spaces
    # or shell metacharacters and silently ignored failures.
    os.makedirs(args.outdir, exist_ok=True)

    # UMI correction; writes a table whose header is
    # Barcode / geneID / UMI / count.
    count_detail_file = args.outdir + '/' + args.sample + '_count_detail.txt'
    df_probe = bam2table(args.bam, count_detail_file, id_name)
    df_probe.to_csv(f'{args.outdir}/{args.sample}_probe_gene_count.tsv',
                    sep='\t',
                    index=False)

    df = pd.read_table(count_detail_file, header=0)

    # call cells
    pdf = args.outdir + '/barcode_filter_magnitude.pdf'
    marked_counts_file = args.outdir + '/' + args.sample + '_counts.txt'
    (validated_barcodes, threshold, cell_num,
     CB_describe) = call_cells(df, args.cells, pdf, marked_counts_file)

    # match barcode
    sc_cell_barcodes, sc_cell_number = read_barcode_file(args.match_dir)

    # write the expression matrix
    (CB_total_Genes, CB_reads_count, reads_mapped_to_transcriptome,
     match_cell_str,
     match_UMI_median) = expression_matrix(df, validated_barcodes, args.outdir,
                                           args.sample, id_name,
                                           sc_cell_barcodes, sc_cell_number)

    # downsampling
    validated_barcodes = set(validated_barcodes)
    downsample_file = args.outdir + '/' + args.sample + '_downsample.txt'
    Saturation = downsample(count_detail_file, validated_barcodes,
                            downsample_file)

    # summary
    stat_file = args.outdir + '/stat.txt'
    get_summary(df, args.sample, Saturation, CB_describe, CB_total_Genes,
                CB_reads_count, reads_mapped_to_transcriptome, match_cell_str,
                match_UMI_median, stat_file, args.outdir + '/../')

    report_prepare(marked_counts_file, downsample_file, args.outdir + '/..')

    # Reuse stat_file instead of rebuilding the same path a second time.
    t = reporter(assay=args.assay,
                 name='count_capture_rna',
                 sample=args.sample,
                 stat_file=stat_file,
                 outdir=args.outdir + '/..')
    t.get_report()
Ejemplo n.º 3
0
def generate_matrix(gtf_file, matrix_file):
    """Load a tab-separated matrix and replace its gene IDs by gene names.

    Args:
        gtf_file: Annotation file handed to ``gene_convert``; the result is
            indexed by the values of the ``geneID`` column.
        matrix_file: Tab-separated file with a ``geneID`` column.

    Returns:
        DataFrame in which ``geneID`` holds the looked-up names, only the
        first row per name is kept, rows containing missing values are
        dropped, and the ``geneID`` column is renamed to the empty string.
    """
    gene_names = gene_convert(gtf_file)
    table = pd.read_csv(matrix_file, sep="\t")

    # Translate every gene ID through the mapping.
    table["geneID"] = table.geneID.apply(lambda gene_id: gene_names[gene_id])

    # Several IDs may share one name; keep the first occurrence only.
    unique_rows = table.drop_duplicates(subset=["geneID"], keep="first")
    complete_rows = unique_rows.dropna()
    return complete_rows.rename({"geneID": ""}, axis='columns')
Ejemplo n.º 4
0
def convert(gene_list_file, gtf):
    """Map the gene names listed in a file back to their gene IDs.

    Args:
        gene_list_file: File passed to ``read_one_col``; the first returned
            value is iterated as the list of gene names.
        gtf: Annotation file passed to ``gene_convert``, whose result is
            iterated as gene ID -> gene name.

    Returns:
        dict mapping gene ID -> gene name for every listed gene name.

    Raises:
        KeyError: If a listed gene name does not occur in the annotation.
    """
    wanted_names, _count = read_one_col(gene_list_file)
    id_to_name = gene_convert(gtf)

    # Invert the mapping; when several IDs share a name the last one wins.
    name_to_id = {id_to_name[gene_id]: gene_id for gene_id in id_to_name}

    return {name_to_id[gene_name]: gene_name for gene_name in wanted_names}
Ejemplo n.º 5
0
    def test_gtf(self):
        """Print the entry for 'tRNA-Asp' from the Sus_scrofa NCBI GTF mapping.

        Relies on a hard-coded annotation path and makes no assertions; it
        only prints the looked-up value.
        """
        # Earlier manual checks, kept for reference (disabled):
        #
        # gtf_file = '/SGRNJ/Database/script/genome/hs/gtf/Homo_sapiens.GRCh38.99.gtf'
        # id_name = gene_convert(gtf_file)
        # print(f"ENSG00000001629: {id_name['ENSG00000001629']}")
        #
        # gtf_file = '/SGRNJ01/RD_dir/pipeline_test/litao/genomes/Cricetulus_griseus/Cricetulus_griseus_crigri.CriGri_1.0.101.gtf'
        # id_name = gene_convert(gtf_file)
        # print(id_name)

        annotation_path = '/Public/Database/genome/Sus_scrofa/ncbi/GCF_000003025.6_Sscrofa11.1_genomic_new.gtf'
        gene_names = gene_convert(annotation_path)
        print(gene_names['tRNA-Asp'])
Ejemplo n.º 6
0
def expression_matrix(df, validated_barcodes, outdir, sample, gtf_file):
    """Write the cell expression matrix as sparse files and as a gzipped table.

    Rows of ``df`` whose ``Barcode`` is in ``validated_barcodes`` are marked
    as cell barcodes (``'CB'``), all others as ``'UB'``; the marks are stored
    in a ``mark`` column that is added to ``df`` in place.  For the ``'CB'``
    rows a gene x barcode table is built whose values are the number of
    ``UMI`` entries per pair.

    Files written:

    * ``{outdir}/{sample}_matrix_10X/barcodes.tsv`` -- one barcode per line
    * ``{outdir}/{sample}_matrix_10X/genes.tsv`` -- gene ID and gene name
    * ``{outdir}/{sample}_matrix_10X/matrix`` -- Matrix Market file written
      with ``mmwrite`` (scipy is expected to append ``.mtx``)
    * ``{outdir}/{sample}_matrix.tsv.gz`` -- dense table indexed by gene name

    Args:
        df: DataFrame with the columns ``Barcode``, ``geneID``, ``UMI`` and
            ``count``.  Modified in place (``mark`` column).
        validated_barcodes: Collection of barcodes regarded as cells.
        outdir: Output directory.
        sample: Sample name used as file/directory prefix.
        gtf_file: Annotation file handed to ``gene_convert``; the result is
            indexed by gene ID to obtain the gene name.

    Returns:
        Tuple ``(CB_total_Genes, CB_reads_count,
        reads_mapped_to_transcriptome)``: number of distinct genes in cell
        rows, sum of ``count`` over cell rows, and sum of ``count`` over all
        rows.
    """
    matrix_10X_dir = f"{outdir}/{sample}_matrix_10X/"
    matrix_table_file = f"{outdir}/{sample}_matrix.tsv.gz"
    # makedirs also creates missing parents and does not race with a
    # concurrent creation of the same directory.
    os.makedirs(matrix_10X_dir, exist_ok=True)

    df.loc[:, 'mark'] = 'UB'
    df.loc[df['Barcode'].isin(validated_barcodes), 'mark'] = 'CB'

    CB_total_Genes = df.loc[df['mark'] == 'CB', 'geneID'].nunique()
    CB_reads_count = df.loc[df['mark'] == 'CB', 'count'].sum()
    reads_mapped_to_transcriptome = df['count'].sum()

    # aggfunc=len counts the UMI entries per (gene, barcode) pair; pairs
    # that never occur become 0.
    table = df.loc[df['mark'] == 'CB', :].pivot_table(
        index='geneID', columns='Barcode', values='UMI',
        aggfunc=len).fillna(0).astype(int)

    id_name = gene_convert(gtf_file)
    gene_ids = table.index.to_series()
    gene_names = gene_ids.apply(lambda x: id_name[x])
    genes = pd.concat([gene_ids, gene_names], axis=1)
    genes.columns = ['gene_id', 'gene_name']

    # write 10X matrix
    # header=False is explicit because Series.to_csv writes a header row by
    # default since pandas 0.24, which would add a bogus first barcode.
    table.columns.to_series().to_csv(f'{matrix_10X_dir}/barcodes.tsv',
                                     index=False,
                                     header=False,
                                     sep='\t')
    genes.to_csv(f'{matrix_10X_dir}/genes.tsv',
                 index=False,
                 header=False,
                 sep='\t')
    mmwrite(f'{matrix_10X_dir}/matrix', csr_matrix(table))

    # convert id to name; write table matrix
    table.index = gene_names
    table.index.name = ""
    table.to_csv(matrix_table_file, sep="\t", compression='gzip')

    return (CB_total_Genes, CB_reads_count, reads_mapped_to_transcriptome)
Ejemplo n.º 7
0
def expression_matrix(df, validated_barcodes, outdir, sample, gtf_file):
    """Write the gzipped gene-name x barcode UMI table for called cells.

    Rows of ``df`` whose ``Barcode`` is in ``validated_barcodes`` are marked
    ``'CB'``, all others ``'UB'``; the marks go into a ``mark`` column that
    is added to ``df`` in place.  The table counts ``UMI`` entries per
    gene/barcode pair over the ``'CB'`` rows and is written to
    ``{outdir}/{sample}_matrix.tsv.gz``.

    Args:
        df: DataFrame with the columns ``Barcode``, ``geneID``, ``UMI`` and
            ``count``.  Modified in place (``mark`` column).
        validated_barcodes: Collection of barcodes regarded as cells.
        outdir: Output directory.
        sample: Sample name used as file prefix.
        gtf_file: Annotation file handed to ``gene_convert``; the result is
            indexed by gene ID to obtain the gene name.

    Returns:
        Tuple ``(CB_total_Genes, CB_reads_count,
        reads_mapped_to_transcriptome)``: number of distinct genes in cell
        rows, sum of ``count`` over cell rows, and sum of ``count`` over all
        rows.
    """
    gene_names = gene_convert(gtf_file)

    # Tag every row as background first, then promote the cell barcodes.
    df.loc[:, 'mark'] = 'UB'
    df.loc[df['Barcode'].isin(validated_barcodes), 'mark'] = 'CB'

    cell_rows = df.loc[df['mark'] == 'CB', :]
    CB_total_Genes = cell_rows['geneID'].nunique()
    CB_reads_count = cell_rows['count'].sum()
    reads_mapped_to_transcriptome = df['count'].sum()

    # One row per gene, one column per barcode, value = number of UMI rows.
    umi_counts = cell_rows.pivot_table(index='geneID',
                                       columns='Barcode',
                                       values='UMI',
                                       aggfunc=len)
    umi_counts = umi_counts.fillna(0).astype(int)

    # Swap the gene IDs in the index for gene names before writing.
    umi_counts.index = umi_counts.index.to_series().apply(
        lambda gene_id: gene_names[gene_id])
    umi_counts.index.name = ""
    umi_counts.to_csv(f"{outdir}/{sample}_matrix.tsv.gz",
                      sep="\t",
                      compression='gzip')

    return (CB_total_Genes, CB_reads_count, reads_mapped_to_transcriptome)