def __cir_mols(self, exon_level_MinDepth, min_depth=0):
    """Compute circRNA RPKM values and store one vector per sample.

    A ``CountInfo`` object is created over the ``04.CIRC_info`` directory
    below ``self.statInfo`` (used as both its input and output location),
    loaded from ``exon_level_MinDepth`` and asked to compute RPKM with a
    fixed ``max_len`` of 200 for every gene.  The resulting
    ``merge.circ_mols.RPKM.xls`` matrix is then read back and, for each
    sample brief name, one slice of it is stored in
    ``self.l_cirRNA_FPKM``.  Finally ``self.__get_cirRNA_MOLs()`` is
    called.

    Args:
        exon_level_MinDepth: Forwarded unchanged to ``CountInfo.load_mat``
            (presumably the exon-level MinDepth table -- confirm against
            the caller).
        min_depth: Accepted for interface compatibility; this method does
            not read it.
    """
    sample_briefs = [
        self['sam_info']['samp_brief'][samp] for samp in self['sample']
    ]
    stat_type = "circ_mols"

    # CountInfo reads from and writes to the same directory.
    in_dir = "%s/04.CIRC_info" % (self.statInfo)
    out_dir = "%s/04.CIRC_info" % (self.statInfo)

    circ_counts = m_cnt.CountInfo(in_dir, sample_briefs, stat_type, out_dir)
    circ_counts.load_mat(exon_level_MinDepth, gen_col=2)
    circ_counts.sam_tot_reads()

    # Every gene is given the same fixed length entry.
    gene_lengths = dict(
        (gene, {'max_len': 200}) for gene in circ_counts.gene
    )

    # NOTE (original author): please ONLY USE samples with less index,
    # that sequenced by illumina X10.
    circ_counts.cal_RPKM(gene_lengths, self.tophat)

    rpkm_path = "%s/merge.%s.RPKM.xls" % (out_dir, stat_type)
    rpkm_table = m_mat.Matrix_info(rpkm_path, inf_column=1, in_dtype="float")
    rpkm_table.load_mat()

    # Read but not used further in this method (kept from the original).
    circ_gene_names = rpkm_table.colname

    for brief in sample_briefs:
        # The sample's position in ``rowname`` selects the matrix column.
        sample_pos = rpkm_table.rowname.index(brief)
        self.l_cirRNA_FPKM[brief] = rpkm_table.matrix[:, sample_pos]

    # Get cirRNA mols using ERCC_FPKM, ERCC_MOLs and cirRNA_FPKM.
    self.__get_cirRNA_MOLs()
def __load_Count(self):
    """Load the ERCC/RGC spike-in count matrix and compute its RPKM.

    Used for the ERCC count statistics.  The annotation named by
    ``self['infile']['anno_file']`` is parsed into a ``GTFFeature``; its
    ``gene`` attribute is handed to ``CountInfo.cal_RPKM`` together with
    ``self.tophat``.  Nothing is returned.
    """
    sample_briefs = [
        self['sam_info']['samp_brief'][samp] for samp in self['sample']
    ]

    annotation_path = self['infile']['anno_file']
    annotation = m_gtf.GTFFeature(annotation_path)

    spike_in_counts = m_cnt.CountInfo(
        self.HTS_k,
        sample_briefs,
        "dexseq_ERCC_RGCPloyA",
        self.HTS,
    )
    spike_in_counts.load_mat()
    spike_in_counts.cal_RPKM(annotation.gene, self.tophat)
def RPKM_novo_trans(self):
    """Compute RPKM for the raw novel-transcript GTF and write it out.

    Steps, in order:

    1. Parse ``<self.data_dir>/novo_lnc_raw.combined.gtf`` and call
       ``get_intergenic`` with ``self['infile']['intragenic_bed']``.
    2. Generate and load the ``dexseq_NeoRaw`` count matrix, then compute
       RPKM from the parsed genes and ``self.tophat``.
    3. Load ``<self.HTS>/merge.dexseq_NeoRaw.RPKM.xls`` back into the GTF
       object, then call ``output_GTF`` and ``get_gene_info`` on it.

    NOTE(review): this method reads ``self['samp_info']`` / ``self['samp']``;
    confirm these are the intended keys for the enclosing class.
    """
    count_tag = "dexseq_NeoRaw"

    sample_briefs = [
        "%s" % (self['samp_info']['samp_brief'][samp])
        for samp in self['samp']
    ]

    novel_gtf_path = "%s/novo_lnc_raw.combined.gtf" % (self.data_dir)
    novel_gtf = m_gtf.GTFFeature(novel_gtf_path)
    novel_gtf.get_intergenic(self['infile']['intragenic_bed'])

    novel_counts = m_cnt.CountInfo(
        self.HTS_u, sample_briefs, "dexseq_NeoRaw", self.HTS
    )
    novel_counts.generate_mat()
    novel_counts.load_mat()
    novel_counts.cal_RPKM(novel_gtf.gene, self.tophat)

    # Feed the freshly written RPKM table back into the GTF object.
    rpkm_path = "%s/merge.%s.RPKM.xls" % (self.HTS, count_tag)
    novel_gtf.load_gene_RPKM(rpkm_path)
    novel_gtf.output_GTF()
    novel_gtf.get_gene_info()
def Basic_Stat(self):
    """Write the per-sample QC / Tophat mapping / ERCC-RGC summary table.

    The table goes to ``<self.statInfo>/01.BasicInfo_QC_map_SpikeIn.xls``:
    one header line followed by one tab-separated line per entry of
    ``self['sample']``.  Each line combines

    * raw / clean read counts from the QC log under ``self.cln``,
    * total / mapped reads from the Tophat ``align_summary.txt``,
    * per-sample read totals from the ``dexseq_clean`` count matrices
      (all, refseq, NONCODE, NSMB),
    * RGC / ERCC spike-in read counts, and
    * spike-in molecule numbers taken from ``self['sam_info']``.

    Read counts coming from the count matrices and the spike-in file are
    doubled for samples whose ``self['sam_info']['data_type']`` is "PE".

    The output file is managed by a ``with`` block, so the handle is closed
    even if one of the loaders raises part-way through.
    """
    out_file = "%s/01.BasicInfo_QC_map_SpikeIn.xls" % (self.statInfo)

    header = "\t".join((
        "Sample", "Brief_samp", "Raw_Reads", "Clean_Reads",
        "Pre_Map_Reads", "Aligned_Reads", "HTSseq_Known_Reads",
        "HTSeq_Refseq_Reads", "HTSeq_NONCODE_V4_Reads", "HTSeq_Nsmb_Reads",
        "RFP_Reads", "GFP_Reads", "CRE_Reads", "ERCC_Reads",
        "RFP_Mols", "GFP_Mols", "CRE_Mols", "ERCC_Mols",
    ))
    # Two names, twelve integer read counts, four molecule numbers.
    row_fmt = "%s\t%s" + "\t%d" * 12 + "\t%1.2e" * 4

    with open(out_file, "w") as f_out_file:
        f_out_file.write(header + "\n")

        l_breif_samp = [
            self['sam_info']['samp_brief'][samp] for samp in self['sample']
        ]

        def load_totals(ltype):
            # Build one count matrix and compute its per-sample totals.
            info = m_cnt.CountInfo(self.HTS_k, l_breif_samp, ltype, self.HTS)
            info.load_mat()
            info.sam_tot_reads()
            return info

        # Load HTSeq read totals.  The clean matrix is split before the
        # refseq / NONCODE / NSMB subsets are loaded.
        HTS_info = load_totals("dexseq_clean")
        self.__get_HTS_clean_split()
        Refseq_info = load_totals("dexseq_clean_refseq")
        NONCODE_info = load_totals("dexseq_clean_NONCODE")
        NSMB_info = load_totals("dexseq_clean_NSMB")

        # Load the remaining per-sample information and emit one row each.
        for samp in self['sample']:
            brief_name = self['sam_info']['samp_brief'][samp]
            QC_log = "%s/%s/log" % (self.cln, samp)
            Tophat_log = "%s/%s/align_summary.txt" % (self.tophat, brief_name)
            HTSeq_SpikeIn = "%s/%s/%s.dexseq_ERCC_RGCPloyA.txt" % (
                self.HTS_k, brief_name, brief_name)

            QcStat_info = Stat.QcStat(QC_log)
            MapStat_info = Stat.TophatStat(Tophat_log)
            SpikeIn_info = Stat.SpikeIn(HTSeq_SpikeIn,
                                        self['infile']['ercc_info'])

            QcStat_info.read_infile()
            MapStat_info.read_infile()
            SpikeIn_info.load_HTS_file()

            pre_map_read = MapStat_info['statInfo']['totalRead']
            aligned_read = MapStat_info['statInfo']['mappedRead']

            # Paired-end samples have their counts doubled; everything else
            # is reported as-is.
            if self['sam_info']['data_type'][samp] == "PE":
                read_factor = 2
            else:
                read_factor = 1

            HTSseq_read = self.__get_HTS_reads(HTS_info, samp) * read_factor
            Refseq_read = self.__get_HTS_reads(Refseq_info, samp) * read_factor
            NONCODE_read = (
                self.__get_HTS_reads(NONCODE_info, samp) * read_factor)
            NSMB_read = self.__get_HTS_reads(NSMB_info, samp) * read_factor

            read_RFP = SpikeIn_info.RGC_count['RGC-mRFP'] * read_factor
            read_GFP = SpikeIn_info.RGC_count['RGC-GFP'] * read_factor
            read_CRE = SpikeIn_info.RGC_count['RGC-CRE'] * read_factor
            read_ERCC = SpikeIn_info.ERCC_total * read_factor

            mol_RFP = self['sam_info']['RFP_mols'][samp]
            mol_GFP = self['sam_info']['GFP_mols'][samp]
            mol_CRE = self['sam_info']['CRE_mols'][samp]
            # NOTE(review): 6.023 * 10**10 presumably converts the dilution
            # factor into an ERCC molecule count -- confirm the constant.
            mol_ERCC = self['sam_info']['dilute'][samp] * 6.023 * 10**10

            out_info = row_fmt % (
                samp, brief_name,
                QcStat_info.raw_reads, QcStat_info.cln_reads,
                pre_map_read, aligned_read, HTSseq_read,
                Refseq_read, NONCODE_read, NSMB_read,
                read_RFP, read_GFP, read_CRE, read_ERCC,
                mol_RFP, mol_GFP, mol_CRE, mol_ERCC,
            )
            f_out_file.write(out_info + "\n")
def Basic_Stat(self):
    """Write the per-sample QC / Tophat mapping / circRNA / spike-in table.

    The table goes to ``<self.statInfo>/01.BasicInfo_QC_map_SpikeIn.xls``:
    one header line followed by one tab-separated line per entry of
    ``self['sample']``.  Each line combines

    * raw / clean read counts from the QC log under ``self.cln``,
    * total / mapped reads from the Tophat ``align_summary.txt``,
    * the refseq read total from the ``dexseq_clean`` count matrix,
    * circular-RNA gene, exon and read counts derived from the
      ``04.CIRC_info`` MinDepth matrices,
    * RGC / ERCC spike-in read counts, and
    * spike-in molecule numbers taken from ``self['sam_info']``.

    Run this step after CIRC_Stat: the two MinDepth matrices under
    ``<self.statInfo>/04.CIRC_info`` are loaded here, not produced.

    The output file is managed by a ``with`` block, so the handle is closed
    even if one of the loaders raises part-way through.
    """
    out_file = "%s/01.BasicInfo_QC_map_SpikeIn.xls" % (self.statInfo)

    header = "\t".join((
        "Sample", "Brief_samp", "Raw_Reads", "Clean_Reads",
        "Pre_Map_Reads", "Aligned_Reads", "Refseq_Reads",
        "Circular_genes", "Circular_exons", "Circular_reads",
        "RFP_Reads", "GFP_Reads", "CRE_Reads", "ERCC_Reads",
        "RFP_Mols", "GFP_Mols", "CRE_Mols", "ERCC_Mols",
    ))
    # Two names, twelve integer counts, four molecule numbers.
    row_fmt = "%s\t%s" + "\t%d" * 12 + "\t%1.2e" * 4

    with open(out_file, "w") as f_out_file:
        f_out_file.write(header + "\n")

        l_breif_samp = [
            self['sam_info']['samp_brief'][samp] for samp in self['sample']
        ]

        # Load refseq reads.
        Refseq_info = m_cnt.CountInfo(self.HTS_k, l_breif_samp,
                                      "dexseq_clean", self.HTS)
        Refseq_info.load_mat()
        Refseq_info.sam_tot_reads()

        # Load circular reads.  Run this step after CIRC_Stat.
        exon_level_MinDepth = (
            "%s/04.CIRC_info/CIRC_PE_result.merge_exon_level.MinDepth_%d.xls"
            % (self.statInfo, self.min_depth))
        gene_level_MinDepth = (
            "%s/04.CIRC_info/CIRC_PE_result.merge_gene_level.MinDepth_%d.xls"
            % (self.statInfo, self.min_depth))

        exon_level_mat = m_mat.Matrix_info(exon_level_MinDepth, 2)
        gene_level_mat = m_mat.Matrix_info(gene_level_MinDepth, 1)
        exon_level_mat.load_mat()
        gene_level_mat.load_mat()

        # Column-wise summaries (axis=0):
        #   - number of gene-level entries with value >= self.min_depth
        #   - number of exon-level entries with value >= self.min_depth
        #   - sum of all exon-level values
        # NOTE(review): assumes the matrix columns follow the order of
        # self['sample'], since they are indexed by position below --
        # confirm against Matrix_info.
        np_gene_cirs_samp = np.sum(
            gene_level_mat.matrix >= self.min_depth, axis=0)
        np_exon_cirs_samp = np.sum(
            exon_level_mat.matrix >= self.min_depth, axis=0)
        np_exon_read_samp = np.sum(exon_level_mat.matrix, axis=0)

        # Load the remaining per-sample information and emit one row each.
        for idx, samp in enumerate(self['sample']):
            brief_name = self['sam_info']['samp_brief'][samp]
            QC_log = "%s/%s/log" % (self.cln, samp)
            Tophat_log = "%s/%s/align_summary.txt" % (self.tophat, brief_name)
            HTSeq_SpikeIn = "%s/%s/%s.dexseq_ERCC_RGCPloyA.txt" % (
                self.HTS_k, brief_name, brief_name)

            QcStat_info = Stat.QcStat(QC_log)
            MapStat_info = Stat.TophatStat(Tophat_log)
            SpikeIn_info = Stat.SpikeIn(HTSeq_SpikeIn,
                                        self['infile']['ercc_info'])

            QcStat_info.read_infile()
            MapStat_info.read_infile()
            SpikeIn_info.load_HTS_file()

            pre_map_read = MapStat_info['statInfo']['totalRead']
            aligned_read = MapStat_info['statInfo']['mappedRead']
            refseq_read = self.__get_refseq_reads(Refseq_info, samp)

            cir_genes = np_gene_cirs_samp[idx]
            cir_exons = np_exon_cirs_samp[idx]
            cir_reads = np_exon_read_samp[idx]

            read_RFP = SpikeIn_info.RGC_count['RGC-mRFP']
            read_GFP = SpikeIn_info.RGC_count['RGC-GFP']
            read_CRE = SpikeIn_info.RGC_count['RGC-CRE']
            read_ERCC = SpikeIn_info.ERCC_total

            mol_RFP = self['sam_info']['RFP_mols'][samp]
            mol_GFP = self['sam_info']['GFP_mols'][samp]
            mol_CRE = self['sam_info']['CRE_mols'][samp]
            # NOTE(review): 6.023 * 10**10 presumably converts the dilution
            # factor into an ERCC molecule count -- confirm the constant.
            mol_ERCC = self['sam_info']['dilute'][samp] * 6.023 * 10**10

            out_info = row_fmt % (
                samp, brief_name,
                QcStat_info.raw_reads, QcStat_info.cln_reads,
                pre_map_read, aligned_read, refseq_read,
                cir_genes, cir_exons, cir_reads,
                read_RFP, read_GFP, read_CRE, read_ERCC,
                mol_RFP, mol_GFP, mol_CRE, mol_ERCC,
            )
            f_out_file.write(out_info + "\n")