Example #1
0
    def run(self):
        """Summarise read/UMI counts for matched cells and write the UMI matrix.

        Reads ``self.df_read_count`` (must provide a ``read_count`` column and,
        after ``reset_index()``, ``barcode``, ``barcode_name`` and ``UMI``
        columns) and ``self.match_barcode``.

        Side effects:
            * writes the transposed per-cell UMI table to ``self.mtx`` as a
              gzip-compressed, tab-separated file;
            * stores the summary ``pd.Series`` in ``self.stats``.
        """
        mapped_read = self.df_read_count['read_count'].sum()

        # Keep only rows whose index value is one of the matched barcodes.
        df_read_count_in_cell = self.df_read_count[
            self.df_read_count.index.isin(self.match_barcode)]
        mapped_read_in_cell = int(df_read_count_in_cell['read_count'].sum())

        # Count UMI rows per (barcode, barcode_name) and spread the
        # barcode_name values out into columns.
        df_UMI_in_cell = (
            df_read_count_in_cell.reset_index()
            .groupby(['barcode', 'barcode_name'])
            .agg({'UMI': 'count'})
            .reset_index()
            .pivot(index='barcode', columns='barcode_name', values='UMI')
        )

        # Left-join onto the full barcode list so that barcodes without any
        # counted UMI still get a row (filled with 0 below).
        df_cell = pd.DataFrame(index=self.match_barcode)
        df_UMI_cell = pd.merge(df_cell,
                               df_UMI_in_cell,
                               how="left",
                               left_index=True,
                               right_index=True)
        df_UMI_cell = df_UMI_cell.fillna(0).astype(int)
        df_UMI_cell.T.to_csv(self.mtx, sep='\t', compression='gzip')

        # Total UMI per cell across all barcode_name columns.
        UMIs = df_UMI_cell.sum(axis=1)
        median = round(np.median(UMIs), 2)
        mean = round(np.mean(UMIs), 2)

        # Series.append was removed in pandas 2.0; pd.concat is available in
        # every pandas version and yields the same labelled Series.
        self.stats = pd.concat([
            pd.Series(format_stat(mapped_read_in_cell, mapped_read),
                      index=['Mapped Reads in Cells']),
            pd.Series(str(median), index=['Median UMI per Cell']),
            pd.Series(str(mean), index=['Mean UMI per Cell']),
        ])
Example #2
0
def summary(index_file, count_file, outdir, sample):
    """Merge per-cell variant calls, annotate them with genes and report stats.

    Args:
        index_file: passed to ``read_index``, which returns
            ``(df_index, df_valid)``; the index of ``df_valid`` lists the cells
            whose files are merged, ``len(df_index.index)`` is used as the
            number of matched cells.
        count_file: tab-separated file with at least ``barcode``,
            ``read_count`` and ``UMI`` columns.
        outdir: directory holding ``cells/cell{i}/cell{i}_norm.vcf`` and
            ``cells/cell{i}/cell{i}_sorted.bam``; all outputs are written here.
        sample: sample name used as the output file prefix.

    Outputs (all under ``outdir``):
        ``{sample}.vcf``, ``{sample}.bam``, ``{sample}_sorted.bam`` (plus its
        index), ``{sample}_anno.vcf`` and ``stat.txt``; finally a report is
        generated through ``reporter``.

    Raises:
        KeyError: if a read has no ``GN`` tag, or if a merged variant position
            was not covered by any written read (no gene to annotate with).
    """
    # init
    Number_of_Match_Cells_with_SNP = 0
    SNP_count_dict = defaultdict(int)
    coord_gene_dict = defaultdict(dict)

    # read index
    df_index, df_valid = read_index(index_file)

    merged_vcf = f'{outdir}/{sample}.vcf'
    merged_bam = f'{outdir}/{sample}.bam'
    sorted_bam = f'{outdir}/{sample}_sorted.bam'

    # The merged BAM is opened lazily because its header is copied from the
    # first cell; try/finally guarantees it is closed even on failure.
    out_bam = None
    try:
        with open(merged_vcf, 'wt') as out_vcf:
            for number, index in enumerate(df_valid.index, start=1):
                vcf_coords_dict = {}
                cell_prefix = f'{outdir}/cells/cell{index}/cell{index}'
                cell_vcf_file = f'{cell_prefix}_norm.vcf'
                cell_bam_file = f'{cell_prefix}_sorted.bam'

                # vcf coords
                with open(cell_vcf_file, 'rt') as f:
                    for line in f:
                        if line.startswith("#"):
                            # Only the first cell contributes the VCF header.
                            if number == 1:
                                new_line = process_vcf_header(line, sample)
                                if new_line:
                                    out_vcf.write(new_line)
                            continue
                        if not line.strip():
                            # Blank lines carry no record.
                            continue
                        items = line.split('\t')
                        items[7] += f';CELL={index}'
                        out_vcf.write('\t'.join(items))
                        chrom = str(items[0])
                        pos = int(items[1])
                        vcf_coords_dict.setdefault(chrom, set()).add(pos)
                        SNP_count_dict[index] += 1

                # add bam header (taken from the first cell's BAM)
                if out_bam is None:
                    with pysam.AlignmentFile(cell_bam_file, "rb") as cell_bam:
                        out_bam = pysam.AlignmentFile(merged_bam,
                                                      "wb",
                                                      header=cell_bam.header)

                # add bam: only cells that have at least one variant
                if not vcf_coords_dict:
                    continue
                Number_of_Match_Cells_with_SNP += 1
                with pysam.AlignmentFile(cell_bam_file, "rb") as cell_bam:
                    for read in cell_bam:
                        bam_ref = str(read.reference_name)
                        gene_name = read.get_tag('GN')
                        if bam_ref not in vcf_coords_dict:
                            continue
                        # `is not None` (rather than truthiness) keeps
                        # reference position 0, which is a valid coordinate.
                        # NOTE(review): pysam reference positions are 0-based
                        # while VCF POS is 1-based; the comparison below keeps
                        # the original (unshifted) behaviour -- confirm whether
                        # `pos - 1` was intended.
                        ref_positions = {
                            ref_pos
                            for _, ref_pos in read.get_aligned_pairs()
                            if ref_pos is not None
                        }
                        covered = vcf_coords_dict[bam_ref] & ref_positions
                        for pos in covered:
                            coord_gene_dict[bam_ref][pos] = gene_name
                        if covered:
                            out_bam.write(read)
    finally:
        if out_bam is not None:
            out_bam.close()

    # With no valid cell there is no merged BAM to sort or index.
    if out_bam is not None:
        pysam.sort("-o", sorted_bam, merged_bam)
        # pysam's samtools wrapper avoids building a shell command string.
        pysam.index(sorted_bam)

    # annotate vcf
    with open(merged_vcf, 'rt') as vcf, \
            open(f'{outdir}/{sample}_anno.vcf', 'wt') as anno_vcf:
        for line in vcf:
            if line.startswith('#'):
                anno_vcf.write(line)
                continue
            items = line.split('\t')
            chrom = str(items[0])
            pos = int(items[1])
            gene_name = coord_gene_dict[chrom][pos]
            items[7] += f';GENE={gene_name}'
            anno_vcf.write('\t'.join(items))

    # The intermediate {sample}.vcf and {sample}.bam are intentionally kept.

    # stat
    n_match_cell = len(df_index.index)

    df_count = pd.read_csv(count_file, sep='\t')
    df_count_read = df_count.groupby('barcode').agg({'read_count': 'sum'})
    read_total = df_count_read['read_count'].sum()
    Mean_Reads_per_Cell = round((read_total / n_match_cell), 2)

    df_count_UMI = df_count.groupby('barcode').agg({'UMI': 'count'})
    UMI_total = df_count_UMI['UMI'].sum()
    Mean_UMIs_per_Cell = round((UMI_total / n_match_cell), 2)

    SNP_counts = list(SNP_count_dict.values())
    Mean_SNP_per_Cell = round(np.mean(SNP_counts), 3)

    # Series.append was removed in pandas 2.0; build the result with concat.
    stats = pd.concat([
        pd.Series(Mean_Reads_per_Cell, index=['Mean Reads per Cell']),
        pd.Series(Mean_UMIs_per_Cell, index=['Mean UMIs per Cell']),
        pd.Series(format_stat(Number_of_Match_Cells_with_SNP, n_match_cell),
                  index=['Number of Cells with Variants']),
        pd.Series(Mean_SNP_per_Cell,
                  index=['Mean Variants per Cell with Variants']),
    ])

    stat_file = f'{outdir}/stat.txt'
    stats.to_csv(stat_file, sep=':', header=False)

    t = reporter(name='snpCalling',
                 assay='snp',
                 sample=sample,
                 stat_file=stat_file,
                 outdir=outdir + '/..')
    t.get_report()
Example #3
0
    def format_stat(self):
        """Collect mapping statistics from the STAR and picard logs.

        Reads ``self.STAR_map_log`` and ``self.picard_region_log`` (and
        ``self.ribo_log`` when ``self.debug`` is truthy), appends the derived
        entries to ``self.stats``, stores the region plot data in ``self.plot``
        and writes ``self.stats`` to ``self.stats_file`` (``:``-separated, no
        header).

        Raises:
            KeyError: if the picard log has no ``## METRICS CLASS`` section or
                lacks one of the expected base-count columns.
            IndexError: if the STAR log lacks the unique / multi-mapping lines.
            ValueError: in debug mode, if the ribo log has no ``#Matched`` or
                ``#Total`` line.
        """
        # Each phrase is expected on two lines of the STAR log (a count and a
        # percentage); the last whitespace-separated token of each is kept.
        UNIQUE_READS = []
        MULTI_MAPPING_READS = []
        with open(self.STAR_map_log, 'r') as star_log:
            for line in star_log:
                if 'Uniquely mapped reads' in line:
                    UNIQUE_READS.append(line.split()[-1])
                if 'of reads mapped to too many loci' in line:
                    MULTI_MAPPING_READS.append(line.split()[-1])

        # The two lines following '## METRICS CLASS' are the tab-separated
        # column names and their values.
        region_dict = {}
        with open(self.picard_region_log, 'r') as picard_log:
            for line in picard_log:
                if line.startswith('## METRICS CLASS'):
                    header = next(picard_log, '').strip().split('\t')
                    data = next(picard_log, '').strip().split('\t')
                    region_dict = dict(zip(header, data))
                    break

        Total = float(region_dict['PF_ALIGNED_BASES'])
        Exonic_Regions = int(region_dict['UTR_BASES']) + \
            int(region_dict['CODING_BASES'])
        Intronic_Regions = int(region_dict['INTRONIC_BASES'])
        Intergenic_Regions = int(region_dict['INTERGENIC_BASES'])

        region_dict['Exonic_Regions'] = "{}({:.2%})".format(
            format_number(Exonic_Regions), Exonic_Regions / Total)
        region_dict['Intronic_Regions'] = "{}({:.2%})".format(
            format_number(Intronic_Regions), Intronic_Regions / Total)
        region_dict['Intergenic_Regions'] = "{}({:.2%})".format(
            format_number(Intergenic_Regions), Intergenic_Regions / Total)

        new_stats = [
            pd.Series(
                f'{format_number(int(UNIQUE_READS[0]))}({UNIQUE_READS[1]})',
                index=['Uniquely Mapped Reads']),
            pd.Series(
                f'{format_number(int(MULTI_MAPPING_READS[0]))}({MULTI_MAPPING_READS[1]})',
                index=['Multi-Mapped Reads']),
        ]

        # ribo
        if self.debug:
            Reads_Mapped_to_rRNA = None
            Reads_Total = None
            with open(self.ribo_log, 'r') as ribo_log:
                for line in ribo_log:
                    if '#Matched' in line:
                        Reads_Mapped_to_rRNA = int(line.split()[1])
                    if '#Total' in line:
                        Reads_Total = int(line.split()[1])
            if Reads_Mapped_to_rRNA is None or Reads_Total is None:
                raise ValueError(
                    f'{self.ribo_log}: missing #Matched or #Total line')
            # The bare name resolves to the module-level format_stat helper,
            # not to this method.
            new_stats.append(
                pd.Series(format_stat(Reads_Mapped_to_rRNA, Reads_Total),
                          index=['Reads Mapped to rRNA']))

        new_stats.extend([
            pd.Series(region_dict['Exonic_Regions'],
                      index=['Base Pairs Mapped to Exonic Regions']),
            pd.Series(region_dict['Intronic_Regions'],
                      index=['Base Pairs Mapped to Intronic Regions']),
            pd.Series(region_dict['Intergenic_Regions'],
                      index=['Base Pairs Mapped to Intergenic Regions']),
        ])

        # Series.append was removed in pandas 2.0; concat keeps the existing
        # entries of self.stats first, followed by the new ones in order.
        self.stats = pd.concat([self.stats, *new_stats])

        self.plot = {
            'region_labels':
            ['Exonic Regions', 'Intronic Regions', 'Intergenic Regions'],
            'region_values':
            [Exonic_Regions, Intronic_Regions, Intergenic_Regions]
        }

        self.stats.to_csv(self.stats_file, sep=':', header=False)
Example #4
0
    def run(self):
        """Assign a tag to every matched cell and write tables, plots and stats.

        Reads ``self.df_read_count``, ``self.match_barcode``, ``self.tsne_file``
        and, when set, the two-column ``self.combine_cluster`` file.

        Side effects:
            * writes ``{sample}_umi_tag.tsv`` and ``{sample}_tsne_tag.tsv``
              under ``self.outdir``;
            * calls ``self.write_and_plot`` for the ``cluster`` column and, if
              ``self.combine_cluster`` is set, for ``combine_cluster``;
            * stores the summary ``pd.Series`` in ``self.stats``.
        """
        outdir = self.outdir
        sample = self.sample
        UMI_tag_file = f'{outdir}/{sample}_umi_tag.tsv'
        tsne_tag_file = f'{outdir}/{sample}_tsne_tag.tsv'
        cluster_count_file = f'{outdir}/{sample}_cluster_count.tsv'
        cluster_plot = f'{outdir}/{sample}_cluster_plot.pdf'
        if self.combine_cluster:
            combine_cluster_count_file = f'{outdir}/{sample}_combine_cluster_count.tsv'
            combine_cluster_plot = f'{outdir}/{sample}_combine_cluster_plot.pdf'

        mapped_read = self.df_read_count['read_count'].sum()

        # in cell
        df_read_count_in_cell = self.df_read_count[
            self.df_read_count.index.isin(self.match_barcode)]
        mapped_read_in_cell = int(df_read_count_in_cell['read_count'].sum())

        # Collected pieces of the final stats Series (Series.append was removed
        # in pandas 2.0, so they are concatenated once at the end).
        stat_parts = [pd.Series(
            format_stat(mapped_read_in_cell, mapped_read),
            index=['Mapped Reads in Cells']
        )]

        # UMI: the first column of the read-count table holds the tag name.
        tag_name = df_read_count_in_cell.columns[0]
        df_UMI_in_cell = (
            df_read_count_in_cell.reset_index()
            .groupby(['barcode', tag_name])
            .agg({'UMI': 'count'})
            .reset_index()
            .pivot(index='barcode', columns=tag_name, values='UMI')
        )
        # Left-join onto all matched barcodes so that cells without any
        # counted UMI get a row of zeros.
        df_cell = pd.DataFrame(index=self.match_barcode)
        df_UMI_cell = pd.merge(
            df_cell,
            df_UMI_in_cell,
            how="left",
            left_index=True,
            right_index=True)
        df_UMI_cell = df_UMI_cell.fillna(0).astype(int)

        # Per-cell UMI totals, computed before the "tag" column is added.
        UMIs = df_UMI_cell.sum(axis=1)
        median = round(np.median(UMIs), 2)
        mean = round(np.mean(UMIs), 2)
        stat_parts.append(pd.Series(str(median), index=['Median UMI per Cell']))
        stat_parts.append(pd.Series(str(mean), index=['Mean UMI per Cell']))

        UMI_min = Count_tag.get_UMI_min(df_UMI_cell, self.UMI_min)
        Count_tag.run.logger.info(f'UMI_min: {UMI_min}')
        SNR_min = self.get_SNR_min(df_UMI_cell, self.dim, self.SNR_min, UMI_min)
        Count_tag.run.logger.info(f'SNR_min: {SNR_min}')
        df_UMI_cell["tag"] = df_UMI_cell.apply(
            Count_tag.tag_type, UMI_min=UMI_min, SNR_min=SNR_min,
            dim=self.dim, no_noise=self.no_noise, axis=1)
        df_UMI_cell.to_csv(UMI_tag_file, sep="\t")

        df_tsne = pd.read_csv(self.tsne_file, sep="\t", index_col=0)
        df_tsne_tag = pd.merge(
            df_tsne,
            df_UMI_cell,
            how="left",
            left_index=True,
            right_index=True)

        if self.combine_cluster:
            df_combine_cluster = pd.read_csv(
                self.combine_cluster, sep="\t", header=None)
            df_combine_cluster.columns = ["cluster", "combine_cluster"]
            # `on` cannot be combined with `left_index` (current pandas raises
            # MergeError).  A plain left merge keeps the left row order, so the
            # original index is restored right after.
            df_tsne_combine_cluster_tag = pd.merge(
                df_tsne_tag, df_combine_cluster,
                on=["cluster"], how="left").set_index(df_tsne_tag.index)
            df_tsne_combine_cluster_tag.to_csv(tsne_tag_file, sep="\t")
        else:
            df_tsne_tag.to_csv(tsne_tag_file, sep="\t")

        self.write_and_plot(
            df=df_tsne_tag, column_name="cluster", count_file=cluster_count_file,
            plot_file=cluster_plot
        )

        if self.combine_cluster:
            self.write_and_plot(
                df=df_tsne_combine_cluster_tag,
                column_name="combine_cluster",
                count_file=combine_cluster_count_file,
                plot_file=combine_cluster_plot
            )

        # One stats entry per distinct tag, most frequent first.
        for item, count in df_UMI_cell["tag"].value_counts().items():
            stat_parts.append(pd.Series(
                format_stat(count, self.cell_total),
                index=[item + ' Cells']
            ))
        self.stats = pd.concat(stat_parts)
Example #5
0
def count_smk(args):
    """Assign an SMK tag to every matched cell and write tables, stats, report.

    Args:
        args: namespace providing ``read_file``, ``match_dir``, ``UMI_min``,
            ``SNR_min``, ``dim``, ``combine_cluster``, ``outdir``, ``sample``
            and ``assay``.

    Side effects:
        Creates ``outdir`` if needed and writes ``{sample}_umi_tag.tsv``,
        ``{sample}_tsne_tag.tsv`` and ``stat.txt`` into it; calls
        ``write_and_plot`` for the ``cluster`` column (and for
        ``combine_cluster`` when ``args.combine_cluster`` is set) and finally
        generates a report through ``reporter``.

    Raises:
        IndexError: if no ``*analysis/*tsne_coord.tsv`` file exists under
            ``args.match_dir``.
    """
    read_file = args.read_file
    match_dir = args.match_dir
    tsne_file = glob.glob(f'{match_dir}/*analysis/*tsne_coord.tsv')[0]
    UMI_min = args.UMI_min
    SNR_min = args.SNR_min
    dim = int(args.dim)
    combine_cluster = args.combine_cluster
    outdir = args.outdir
    sample = args.sample
    assay = args.assay

    # Equivalent of `mkdir -p` without spawning a shell.
    os.makedirs(outdir, exist_ok=True)

    # process
    match_barcode, cell_total = read_barcode_file(match_dir)

    UMI_tag_file = f'{outdir}/{sample}_umi_tag.tsv'
    tsne_tag_file = f'{outdir}/{sample}_tsne_tag.tsv'
    cluster_count_file = f'{outdir}/{sample}_cluster_count.tsv'
    cluster_plot = f'{outdir}/{sample}_cluster_plot.pdf'
    if combine_cluster:
        combine_cluster_count_file = f'{outdir}/{sample}_combine_cluster_count.tsv'
        combine_cluster_plot = f'{outdir}/{sample}_combine_cluster_plot.pdf'

    df_read_count = pd.read_csv(read_file, sep="\t", index_col=0)
    mapped_read = df_read_count['read_count'].sum()

    # in cell
    df_read_count_in_cell = df_read_count[df_read_count.index.isin(
        match_barcode)]
    mapped_read_in_cell = int(df_read_count_in_cell['read_count'].sum())

    # Collected pieces of the final stats Series (Series.append was removed in
    # pandas 2.0, so they are concatenated once at the end).
    stat_parts = [
        pd.Series(format_stat(mapped_read_in_cell, mapped_read),
                  index=['Mapped Reads in Cells'])
    ]

    # UMI: count rows per (barcode, SMK_barcode_name), one column per SMK name.
    df_UMI_in_cell = (
        df_read_count_in_cell.reset_index()
        .groupby(['barcode', 'SMK_barcode_name'])
        .agg({'UMI': 'count'})
        .reset_index()
        .pivot(index='barcode', columns='SMK_barcode_name', values='UMI')
    )
    # Left-join onto all matched barcodes so that cells without any counted
    # UMI get a row of zeros.
    df_cell = pd.DataFrame(index=match_barcode)
    df_UMI_cell = pd.merge(df_cell,
                           df_UMI_in_cell,
                           how="left",
                           left_index=True,
                           right_index=True)
    df_UMI_cell = df_UMI_cell.fillna(0).astype(int)

    # Per-cell UMI totals, computed before the "tag" column is added.
    UMIs = df_UMI_cell.sum(axis=1)
    median = round(np.median(UMIs), 2)
    mean = round(np.mean(UMIs), 2)
    stat_parts.append(pd.Series(str(median), index=['Median UMI per Cell']))
    stat_parts.append(pd.Series(str(mean), index=['Mean UMI per Cell']))

    UMI_min = get_UMI_min(df_UMI_cell, UMI_min)
    count_smk.logger.info(f'UMI_min: {UMI_min}')
    SNR_min = get_SNR_min(df_UMI_cell, dim, SNR_min, UMI_min)
    count_smk.logger.info(f'SNR_min: {SNR_min}')
    df_UMI_cell["tag"] = df_UMI_cell.apply(tag_type,
                                           UMI_min=UMI_min,
                                           SNR_min=SNR_min,
                                           dim=dim,
                                           axis=1)
    df_UMI_cell.to_csv(UMI_tag_file, sep="\t")

    df_tsne = pd.read_csv(tsne_file, sep="\t", index_col=0)
    df_tsne_tag = pd.merge(df_tsne,
                           df_UMI_cell,
                           how="left",
                           left_index=True,
                           right_index=True)

    if combine_cluster:
        df_combine_cluster = pd.read_csv(combine_cluster,
                                         sep="\t",
                                         header=None)
        df_combine_cluster.columns = ["cluster", "combine_cluster"]
        # `on` cannot be combined with `left_index` (current pandas raises
        # MergeError).  A plain left merge keeps the left row order, so the
        # original index is restored right after.
        df_tsne_combine_cluster_tag = pd.merge(
            df_tsne_tag,
            df_combine_cluster,
            on=["cluster"],
            how="left").set_index(df_tsne_tag.index)
        df_tsne_combine_cluster_tag.to_csv(tsne_tag_file, sep="\t")
    else:
        df_tsne_tag.to_csv(tsne_tag_file, sep="\t")

    write_and_plot(df=df_tsne_tag,
                   column_name="cluster",
                   count_file=cluster_count_file,
                   plot_file=cluster_plot)

    if combine_cluster:
        write_and_plot(df=df_tsne_combine_cluster_tag,
                       column_name="combine_cluster",
                       count_file=combine_cluster_count_file,
                       plot_file=combine_cluster_plot)

    # One stats entry per distinct tag, most frequent first.
    for item, count in df_UMI_cell["tag"].value_counts().items():
        stat_parts.append(
            pd.Series(format_stat(count, cell_total),
                      index=[item + ' Cells']))

    stats = pd.concat(stat_parts)
    stat_file = f'{outdir}/stat.txt'
    stats.to_csv(stat_file, sep=':', header=False)

    t = reporter(name='count_smk',
                 assay=assay,
                 sample=sample,
                 stat_file=stat_file,
                 outdir=outdir + '/..')
    t.get_report()