def run(self):
    """Collect read/UMI metrics for matched cells and write the UMI matrix.

    Reads ``self.df_read_count`` (the columns ``read_count``,
    ``barcode_name`` and ``UMI`` are used; its index is filtered against
    ``self.match_barcode``), writes the transposed per-cell UMI count table
    to ``self.mtx`` as gzip-compressed TSV, and stores the collected metrics
    in ``self.stats`` as a ``pd.Series`` of strings keyed by metric name.
    """
    df_read_count = self.df_read_count
    mapped_read = df_read_count['read_count'].sum()

    # in cell: keep only rows whose index value is a matched barcode
    df_read_count_in_cell = df_read_count[
        df_read_count.index.isin(self.match_barcode)]
    mapped_read_in_cell = int(df_read_count_in_cell['read_count'].sum())

    # UMI: count UMI rows per (barcode, barcode_name) and pivot to wide form.
    # NOTE(review): reset_index() must yield a 'barcode' column, i.e. the
    # index is presumably named 'barcode' -- confirm against the loader.
    df_UMI_in_cell = (
        df_read_count_in_cell.reset_index()
        .groupby(['barcode', 'barcode_name'])
        .agg({'UMI': 'count'})
        .reset_index()
        .pivot(index='barcode', columns='barcode_name', values='UMI')
    )

    # Left-merge onto the full barcode list so barcodes without any UMI row
    # are still present; their missing counts become 0.
    df_cell = pd.DataFrame(index=self.match_barcode)
    df_UMI_cell = pd.merge(df_cell, df_UMI_in_cell, how="left",
                           left_index=True, right_index=True)
    df_UMI_cell = df_UMI_cell.fillna(0).astype(int)

    # Written transposed: one row per barcode_name, one column per barcode.
    df_UMI_cell.T.to_csv(self.mtx, sep='\t', compression='gzip')

    # UMI totals per cell
    UMIs = df_UMI_cell.sum(axis=1)
    median = round(np.median(UMIs), 2)
    mean = round(np.mean(UMIs), 2)

    # Series.append was removed in pandas 2.0; build the pieces and
    # concatenate once (Series.append was itself a thin wrapper on concat).
    self.stats = pd.concat([
        pd.Series(format_stat(mapped_read_in_cell, mapped_read),
                  index=['Mapped Reads in Cells']),
        pd.Series(str(median), index=['Median UMI per Cell']),
        pd.Series(str(mean), index=['Mean UMI per Cell']),
    ])
def _append_cell_vcf(cell_vcf_file, index, sample, out_vcf, write_header):
    """Append one cell's VCF records to *out_vcf*, tagging INFO with ``CELL=<index>``.

    Header lines are passed through ``process_vcf_header`` and written only
    when *write_header* is true. Returns ``(coords, n_records)`` where
    ``coords`` maps chromosome name to the set of POS values seen and
    ``n_records`` is the number of record lines copied.
    """
    coords = defaultdict(set)
    n_records = 0
    with open(cell_vcf_file, 'rt') as f:
        for line in f:
            if line.startswith("#"):
                if write_header:
                    new_line = process_vcf_header(line, sample)
                    if new_line:
                        out_vcf.write(new_line)
                continue
            # Lines yielded by file iteration are never empty strings, so
            # test the stripped text to actually skip blank lines.
            if not line.strip():
                continue
            items = line.split('\t')
            items[7] += f';CELL={index}'
            out_vcf.write('\t'.join(items))
            coords[str(items[0])].add(int(items[1]))
            n_records += 1
    return coords, n_records


def _write_variant_reads(cell_bam_file, vcf_coords_dict, out_bam, coord_gene_dict):
    """Copy reads of one cell that cover a recorded variant position to *out_bam*.

    For every matching position the read's ``GN`` tag is stored in
    ``coord_gene_dict[chrom][pos]`` (later reads overwrite earlier ones).
    """
    with pysam.AlignmentFile(cell_bam_file, "rb") as cell_bam:
        for read in cell_bam:
            bam_ref = str(read.reference_name)
            # Read the tag for every record, as before, so a record lacking
            # GN still fails loudly regardless of its chromosome.
            gene_name = read.get_tag('GN')
            site_positions = vcf_coords_dict.get(bam_ref)
            if not site_positions:
                continue
            # Pairs are (read_pos, ref_pos); ref_pos is None where the read
            # has no reference base. Compare with None explicitly so that
            # reference position 0 is not discarded as falsy.
            covered = {
                ref_pos
                for _, ref_pos in read.get_aligned_pairs()
                if ref_pos is not None
            }
            # NOTE(review): VCF POS is 1-based while pysam reference
            # positions are 0-based; the direct comparison is kept exactly
            # as in the original code -- confirm whether `pos - 1` was meant.
            hits = site_positions & covered
            if not hits:
                continue
            for pos in hits:
                coord_gene_dict[bam_ref][pos] = gene_name
            out_bam.write(read)


def summary(index_file, count_file, outdir, sample):
    """Merge per-cell variant calls, collect supporting reads, annotate and report.

    Steps, all relative to *outdir*:

    1. Concatenate ``cells/cell<i>/cell<i>_norm.vcf`` for every index in the
       valid table returned by ``read_index(index_file)`` into
       ``<sample>.vcf`` (header taken from the first cell only).
    2. Write reads covering a recorded position to ``<sample>.bam``, then
       sort to ``<sample>_sorted.bam`` and index it.
    3. Write ``<sample>_anno.vcf`` with ``GENE=<GN tag>`` appended to INFO.
    4. Write ``stat.txt`` and call the ``reporter``.

    Args:
        index_file: path handed to ``read_index``.
        count_file: tab-separated table with ``barcode``, ``read_count`` and
            ``UMI`` columns.
        outdir: working/output directory.
        sample: sample name used as output file prefix.

    Raises:
        ValueError: if the valid-cell table is empty (no BAM header source).
        KeyError: if a merged VCF position was not covered by any kept read.
    """
    # init
    Number_of_Match_Cells_with_SNP = 0
    SNP_count_dict = defaultdict(int)
    coord_gene_dict = defaultdict(dict)

    # read index
    df_index, df_valid = read_index(index_file)

    merged_vcf = f'{outdir}/{sample}.vcf'
    merged_bam = f'{outdir}/{sample}.bam'
    sorted_bam = f'{outdir}/{sample}_sorted.bam'

    out_bam = None
    try:
        with open(merged_vcf, 'wt') as out_vcf:
            for number, index in enumerate(df_valid.index, start=1):
                cell_prefix = f'{outdir}/cells/cell{index}/cell{index}'
                cell_bam_file = f'{cell_prefix}_sorted.bam'

                vcf_coords_dict, n_records = _append_cell_vcf(
                    f'{cell_prefix}_norm.vcf', index, sample, out_vcf,
                    write_header=(number == 1))
                # Only cells with at least one record get an entry, so the
                # mean below is taken over cells that have variants.
                if n_records:
                    SNP_count_dict[index] += n_records

                # The merged BAM borrows its header from the first cell.
                if out_bam is None:
                    with pysam.AlignmentFile(cell_bam_file, "rb") as first_bam:
                        out_bam = pysam.AlignmentFile(
                            merged_bam, "wb", header=first_bam.header)

                if vcf_coords_dict:
                    Number_of_Match_Cells_with_SNP += 1
                    _write_variant_reads(
                        cell_bam_file, vcf_coords_dict, out_bam,
                        coord_gene_dict)
    finally:
        if out_bam is not None:
            out_bam.close()

    if out_bam is None:
        raise ValueError(f'no valid cells found in {index_file}')

    pysam.sort("-o", sorted_bam, merged_bam)
    # Same samtools dispatch as the sort above; avoids building a shell
    # command from path strings and surfaces indexing failures.
    pysam.index(sorted_bam)

    # annotate vcf
    with open(merged_vcf, 'rt') as vcf, \
            open(f'{outdir}/{sample}_anno.vcf', 'wt') as anno_vcf:
        for line in vcf:
            if line.startswith('#'):
                anno_vcf.write(line)
                continue
            items = line.split('\t')
            chrom = str(items[0])
            pos = int(items[1])
            # NOTE(review): raises KeyError when no kept read covered this
            # position; decide whether a placeholder gene is acceptable.
            gene_name = coord_gene_dict[chrom][pos]
            items[7] += f';GENE={gene_name}'
            anno_vcf.write('\t'.join(items))

    # The merged VCF and the unsorted BAM are not removed here.

    # stat
    n_match_cell = len(df_index.index)
    df_count = pd.read_csv(count_file, sep='\t')
    by_barcode = df_count.groupby('barcode')

    read_total = by_barcode.agg({'read_count': 'sum'})['read_count'].sum()
    Mean_Reads_per_Cell = round((read_total / n_match_cell), 2)

    UMI_total = by_barcode.agg({'UMI': 'count'})['UMI'].sum()
    Mean_UMIs_per_Cell = round((UMI_total / n_match_cell), 2)

    SNP_counts = list(SNP_count_dict.values())
    Mean_SNP_per_Cell = round(np.mean(SNP_counts), 3)

    # Series.append was removed in pandas 2.0; concatenate the pieces once.
    stats = pd.concat([
        pd.Series(Mean_Reads_per_Cell, index=['Mean Reads per Cell']),
        pd.Series(Mean_UMIs_per_Cell, index=['Mean UMIs per Cell']),
        pd.Series(format_stat(Number_of_Match_Cells_with_SNP, n_match_cell),
                  index=['Number of Cells with Variants']),
        pd.Series(Mean_SNP_per_Cell,
                  index=['Mean Variants per Cell with Variants']),
    ])

    stat_file = f'{outdir}/stat.txt'
    stats.to_csv(stat_file, sep=':', header=False)

    t = reporter(name='snpCalling',
                 assay='snp',
                 sample=sample,
                 stat_file=stat_file,
                 outdir=outdir + '/..')
    t.get_report()
def format_stat(self):
    """Parse the STAR and Picard logs into mapping statistics.

    Reads ``self.STAR_map_log`` and ``self.picard_region_log`` (plus
    ``self.ribo_log`` when ``self.debug`` is truthy), appends the formatted
    metrics to ``self.stats``, stores the raw region base counts for plotting
    in ``self.plot`` and writes ``self.stats`` to ``self.stats_file`` as
    ``name:value`` lines.

    Raises:
        IndexError: if the STAR log lacks the expected count/percentage pair.
        KeyError: if the Picard metrics row lacks an expected column.
        ValueError: in debug mode, if the rRNA log lacks ``#Matched`` or
            ``#Total``.
    """
    # STAR log: for each metric, every matching line contributes its last
    # whitespace-separated token; element 0 is passed to int() and element 1
    # is printed verbatim inside the parentheses below.
    UNIQUE_READS = []
    MULTI_MAPPING_READS = []
    with open(self.STAR_map_log, 'r') as star_log:
        for line in star_log:
            if 'Uniquely mapped reads' in line:
                UNIQUE_READS.append(line.strip().split()[-1])
            if 'of reads mapped to too many loci' in line:
                MULTI_MAPPING_READS.append(line.strip().split()[-1])

    # Picard log: the two lines after the '## METRICS CLASS' marker are the
    # tab-separated column names and the matching values.
    region_dict = {}
    with open(self.picard_region_log, 'r') as picard_log:
        for line in picard_log:
            if line.startswith('## METRICS CLASS'):
                header = next(picard_log, '').strip().split('\t')
                data = next(picard_log, '').strip().split('\t')
                region_dict = dict(zip(header, data))
                break

    Total = float(region_dict['PF_ALIGNED_BASES'])
    Exonic_Regions = int(region_dict['UTR_BASES']) + \
        int(region_dict['CODING_BASES'])
    Intronic_Regions = int(region_dict['INTRONIC_BASES'])
    Intergenic_Regions = int(region_dict['INTERGENIC_BASES'])

    for key, bases in (('Exonic_Regions', Exonic_Regions),
                       ('Intronic_Regions', Intronic_Regions),
                       ('Intergenic_Regions', Intergenic_Regions)):
        region_dict[key] = "{}({:.2%})".format(
            format_number(bases), bases / Total)

    new_stats = [
        pd.Series(
            f'{format_number(int(UNIQUE_READS[0]))}({UNIQUE_READS[1]})',
            index=['Uniquely Mapped Reads']),
        pd.Series(
            f'{format_number(int(MULTI_MAPPING_READS[0]))}({MULTI_MAPPING_READS[1]})',
            index=['Multi-Mapped Reads']),
    ]

    # ribo
    if self.debug:
        Reads_Mapped_to_rRNA = None
        Reads_Total = None
        with open(self.ribo_log, 'r') as ribo_log:
            for line in ribo_log:
                if '#Matched' in line:
                    Reads_Mapped_to_rRNA = int(line.split()[1])
                if '#Total' in line:
                    Reads_Total = int(line.split()[1])
        if Reads_Mapped_to_rRNA is None or Reads_Total is None:
            raise ValueError(
                f'#Matched/#Total not found in {self.ribo_log}')
        # This name resolves to the module-level two-argument helper, not
        # to this method.
        new_stats.append(
            pd.Series(format_stat(Reads_Mapped_to_rRNA, Reads_Total),
                      index=['Reads Mapped to rRNA']))

    new_stats.extend([
        pd.Series(region_dict['Exonic_Regions'],
                  index=['Base Pairs Mapped to Exonic Regions']),
        pd.Series(region_dict['Intronic_Regions'],
                  index=['Base Pairs Mapped to Intronic Regions']),
        pd.Series(region_dict['Intergenic_Regions'],
                  index=['Base Pairs Mapped to Intergenic Regions']),
    ])

    # Series.append was removed in pandas 2.0; concat keeps the same order.
    self.stats = pd.concat([self.stats, *new_stats])

    self.plot = {
        'region_labels': ['Exonic Regions', 'Intronic Regions', 'Intergenic Regions'],
        'region_values': [Exonic_Regions, Intronic_Regions, Intergenic_Regions]
    }

    self.stats.to_csv(self.stats_file, sep=':', header=False)
def run(self):
    """Assign a tag to every matched cell and summarise tags per cluster.

    Uses ``self.df_read_count``, ``self.match_barcode``, ``self.tsne_file``
    and, when set, ``self.combine_cluster``. Writes, under ``self.outdir``
    with the ``self.sample`` prefix: the per-cell UMI/tag table, the t-SNE
    table joined with tags, and cluster count tables/plots (via
    ``self.write_and_plot``). The collected metrics are stored in
    ``self.stats``.
    """
    outdir = self.outdir
    sample = self.sample

    UMI_tag_file = f'{outdir}/{sample}_umi_tag.tsv'
    tsne_tag_file = f'{outdir}/{sample}_tsne_tag.tsv'
    cluster_count_file = f'{outdir}/{sample}_cluster_count.tsv'
    cluster_plot = f'{outdir}/{sample}_cluster_plot.pdf'
    if self.combine_cluster:
        combine_cluster_count_file = f'{outdir}/{sample}_combine_cluster_count.tsv'
        combine_cluster_plot = f'{outdir}/{sample}_combine_cluster_plot.pdf'

    mapped_read = self.df_read_count['read_count'].sum()

    # in cell: keep only rows whose index value is a matched barcode
    df_read_count_in_cell = self.df_read_count[
        self.df_read_count.index.isin(self.match_barcode)]
    mapped_read_in_cell = int(df_read_count_in_cell['read_count'].sum())
    stat_parts = [
        pd.Series(format_stat(mapped_read_in_cell, mapped_read),
                  index=['Mapped Reads in Cells']),
    ]

    # UMI: the first column of the table names the tag; count UMI rows per
    # (barcode, tag) and pivot to one column per tag.
    tag_name = df_read_count_in_cell.columns[0]
    df_UMI_in_cell = (
        df_read_count_in_cell.reset_index()
        .groupby(['barcode', tag_name])
        .agg({'UMI': 'count'})
        .reset_index()
        .pivot(index='barcode', columns=tag_name, values='UMI')
    )

    # Left-merge onto the full barcode list so barcodes without any UMI row
    # are still present; their missing counts become 0.
    df_cell = pd.DataFrame(index=self.match_barcode)
    df_UMI_cell = pd.merge(df_cell, df_UMI_in_cell, how="left",
                           left_index=True, right_index=True)
    df_UMI_cell = df_UMI_cell.fillna(0).astype(int)

    # UMI totals per cell
    UMIs = df_UMI_cell.sum(axis=1)
    median = round(np.median(UMIs), 2)
    mean = round(np.mean(UMIs), 2)
    stat_parts.append(pd.Series(str(median), index=['Median UMI per Cell']))
    stat_parts.append(pd.Series(str(mean), index=['Mean UMI per Cell']))

    UMI_min = Count_tag.get_UMI_min(df_UMI_cell, self.UMI_min)
    Count_tag.run.logger.info(f'UMI_min: {UMI_min}')
    SNR_min = self.get_SNR_min(df_UMI_cell, self.dim, self.SNR_min, UMI_min)
    Count_tag.run.logger.info(f'SNR_min: {SNR_min}')

    df_UMI_cell["tag"] = df_UMI_cell.apply(
        Count_tag.tag_type, UMI_min=UMI_min, SNR_min=SNR_min,
        dim=self.dim, no_noise=self.no_noise, axis=1)
    df_UMI_cell.to_csv(UMI_tag_file, sep="\t")

    df_tsne = pd.read_csv(self.tsne_file, sep="\t", index_col=0)
    df_tsne_tag = pd.merge(df_tsne, df_UMI_cell, how="left",
                           left_index=True, right_index=True)

    if self.combine_cluster:
        df_combine_cluster = pd.read_csv(
            self.combine_cluster, sep="\t", header=None)
        df_combine_cluster.columns = ["cluster", "combine_cluster"]
        # Current pandas raises MergeError when `on` is combined with
        # `left_index`, so merge on the column only. A left join keeps the
        # left row order, and the original index is restored right after.
        # NOTE(review): set_index needs one output row per input row, i.e.
        # unique "cluster" values in the mapping file -- confirm.
        df_tsne_combine_cluster_tag = pd.merge(
            df_tsne_tag, df_combine_cluster,
            on=["cluster"], how="left").set_index(df_tsne_tag.index)
        df_tsne_combine_cluster_tag.to_csv(tsne_tag_file, sep="\t")
    else:
        df_tsne_tag.to_csv(tsne_tag_file, sep="\t")

    self.write_and_plot(
        df=df_tsne_tag, column_name="cluster", count_file=cluster_count_file,
        plot_file=cluster_plot
    )

    if self.combine_cluster:
        self.write_and_plot(
            df=df_tsne_combine_cluster_tag,
            column_name="combine_cluster",
            count_file=combine_cluster_count_file,
            plot_file=combine_cluster_plot
        )

    # One "<tag> Cells" metric per distinct tag, most frequent first.
    for item, count in df_UMI_cell["tag"].value_counts().items():
        stat_parts.append(pd.Series(
            format_stat(count, self.cell_total),
            index=[item + ' Cells']
        ))

    # Series.append was removed in pandas 2.0; concatenate the pieces once.
    self.stats = pd.concat(stat_parts)
def count_smk(args):
    """Assign an SMK tag to every matched cell, summarise per cluster, report.

    Attributes read from *args*: ``read_file``, ``match_dir``, ``UMI_min``,
    ``SNR_min``, ``dim``, ``combine_cluster``, ``outdir``, ``sample`` and
    ``assay``. Writes the per-cell UMI/tag table, the t-SNE table joined with
    tags, cluster count tables/plots and ``stat.txt`` under ``args.outdir``,
    then calls the ``reporter``.

    Raises:
        IndexError: if no ``*analysis/*tsne_coord.tsv`` exists under
            ``args.match_dir``.
    """
    read_file = args.read_file
    match_dir = args.match_dir
    tsne_file = glob.glob(f'{match_dir}/*analysis/*tsne_coord.tsv')[0]
    UMI_min = args.UMI_min
    SNR_min = args.SNR_min
    dim = int(args.dim)
    combine_cluster = args.combine_cluster
    outdir = args.outdir
    sample = args.sample
    assay = args.assay

    # Create the directory directly rather than shelling out to `mkdir -p`,
    # which breaks on paths containing spaces or shell metacharacters.
    os.makedirs(outdir, exist_ok=True)

    # process
    match_barcode, cell_total = read_barcode_file(match_dir)

    UMI_tag_file = f'{outdir}/{sample}_umi_tag.tsv'
    tsne_tag_file = f'{outdir}/{sample}_tsne_tag.tsv'
    cluster_count_file = f'{outdir}/{sample}_cluster_count.tsv'
    cluster_plot = f'{outdir}/{sample}_cluster_plot.pdf'
    if combine_cluster:
        combine_cluster_count_file = f'{outdir}/{sample}_combine_cluster_count.tsv'
        combine_cluster_plot = f'{outdir}/{sample}_combine_cluster_plot.pdf'

    df_read_count = pd.read_csv(read_file, sep="\t", index_col=0)
    mapped_read = df_read_count['read_count'].sum()

    # in cell: keep only rows whose index value is a matched barcode
    df_read_count_in_cell = df_read_count[
        df_read_count.index.isin(match_barcode)]
    mapped_read_in_cell = int(df_read_count_in_cell['read_count'].sum())
    stat_parts = [
        pd.Series(format_stat(mapped_read_in_cell, mapped_read),
                  index=['Mapped Reads in Cells']),
    ]

    # UMI: count UMI rows per (barcode, SMK_barcode_name) and pivot to one
    # column per SMK barcode name.
    df_UMI_in_cell = (
        df_read_count_in_cell.reset_index()
        .groupby(['barcode', 'SMK_barcode_name'])
        .agg({'UMI': 'count'})
        .reset_index()
        .pivot(index='barcode', columns='SMK_barcode_name', values='UMI')
    )

    # Left-merge onto the full barcode list so barcodes without any UMI row
    # are still present; their missing counts become 0.
    df_cell = pd.DataFrame(index=match_barcode)
    df_UMI_cell = pd.merge(df_cell, df_UMI_in_cell, how="left",
                           left_index=True, right_index=True)
    df_UMI_cell = df_UMI_cell.fillna(0).astype(int)

    # UMI totals per cell
    UMIs = df_UMI_cell.sum(axis=1)
    median = round(np.median(UMIs), 2)
    mean = round(np.mean(UMIs), 2)
    stat_parts.append(pd.Series(str(median), index=['Median UMI per Cell']))
    stat_parts.append(pd.Series(str(mean), index=['Mean UMI per Cell']))

    UMI_min = get_UMI_min(df_UMI_cell, UMI_min)
    count_smk.logger.info(f'UMI_min: {UMI_min}')
    SNR_min = get_SNR_min(df_UMI_cell, dim, SNR_min, UMI_min)
    count_smk.logger.info(f'SNR_min: {SNR_min}')

    df_UMI_cell["tag"] = df_UMI_cell.apply(tag_type,
                                           UMI_min=UMI_min,
                                           SNR_min=SNR_min,
                                           dim=dim,
                                           axis=1)
    df_UMI_cell.to_csv(UMI_tag_file, sep="\t")

    df_tsne = pd.read_csv(tsne_file, sep="\t", index_col=0)
    df_tsne_tag = pd.merge(df_tsne, df_UMI_cell, how="left",
                           left_index=True, right_index=True)

    if combine_cluster:
        df_combine_cluster = pd.read_csv(combine_cluster, sep="\t",
                                         header=None)
        df_combine_cluster.columns = ["cluster", "combine_cluster"]
        # Current pandas raises MergeError when `on` is combined with
        # `left_index`, so merge on the column only. A left join keeps the
        # left row order, and the original index is restored right after.
        # NOTE(review): set_index needs one output row per input row, i.e.
        # unique "cluster" values in the mapping file -- confirm.
        df_tsne_combine_cluster_tag = pd.merge(
            df_tsne_tag, df_combine_cluster,
            on=["cluster"], how="left").set_index(df_tsne_tag.index)
        df_tsne_combine_cluster_tag.to_csv(tsne_tag_file, sep="\t")
    else:
        df_tsne_tag.to_csv(tsne_tag_file, sep="\t")

    write_and_plot(df=df_tsne_tag,
                   column_name="cluster",
                   count_file=cluster_count_file,
                   plot_file=cluster_plot)

    if combine_cluster:
        write_and_plot(df=df_tsne_combine_cluster_tag,
                       column_name="combine_cluster",
                       count_file=combine_cluster_count_file,
                       plot_file=combine_cluster_plot)

    # One "<tag> Cells" metric per distinct tag, most frequent first.
    for item, count in df_UMI_cell["tag"].value_counts().items():
        stat_parts.append(
            pd.Series(format_stat(count, cell_total),
                      index=[item + ' Cells']))

    # Series.append was removed in pandas 2.0; concatenate the pieces once.
    stats = pd.concat(stat_parts)

    stat_file = f'{outdir}/stat.txt'
    stats.to_csv(stat_file, sep=':', header=False)

    t = reporter(name='count_smk',
                 assay=assay,
                 sample=sample,
                 stat_file=stat_file,
                 outdir=outdir + '/..')
    t.get_report()