Exemple #1
0
    def __cir_mols(self, exon_level_MinDepth, min_depth=0):
        """Compute circRNA RPKM values and cache one vector per sample.

        Steps, in order:
          1. Build a ``CountInfo`` of type ``circ_mols`` rooted at
             ``<statInfo>/04.CIRC_info`` (used as both input and output
             location) and load ``exon_level_MinDepth`` into it.
          2. Call ``cal_RPKM`` with a fixed ``max_len`` of 200 for every gene.
          3. Read ``merge.circ_mols.RPKM.xls`` from the same directory
             (presumably written by ``cal_RPKM`` -- TODO confirm).
          4. Store one matrix column per sample in ``self.l_cirRNA_FPKM``.
          5. Delegate to ``__get_cirRNA_MOLs``.

        Args:
            exon_level_MinDepth: forwarded to ``CountInfo.load_mat``;
                presumably the exon-level MinDepth matrix path -- TODO confirm.
            min_depth: accepted for interface compatibility; not read here.
        """
        sample_labels = [
            self['sam_info']['samp_brief'][samp] for samp in self['sample']
        ]
        ltype = "circ_mols"
        # Input and output of the counting step live in the same directory.
        circ_dir = "%s/04.CIRC_info" % (self.statInfo)

        counts = m_cnt.CountInfo(circ_dir, sample_labels, ltype, circ_dir)
        counts.load_mat(exon_level_MinDepth, gen_col=2)
        counts.sam_tot_reads()

        # Every gene is given the same nominal length of 200.
        # Original author's warning, kept verbatim in spirit: please ONLY use
        # samples "with less index", sequenced by Illumina X10.
        fixed_lengths = dict(
            (gene, {'max_len': 200}) for gene in counts.gene)
        counts.cal_RPKM(fixed_lengths, self.tophat)

        rpkm_path = "%s/merge.%s.RPKM.xls" % (circ_dir, ltype)
        rpkm_table = m_mat.Matrix_info(rpkm_path,
                                       inf_column=1,
                                       in_dtype="float")
        rpkm_table.load_mat()
        # NOTE(review): this value is read but never used below.
        l_cir_genes = rpkm_table.colname

        for label in sample_labels:
            # The position of the sample in ``rowname`` selects a matrix column.
            column = rpkm_table.rowname.index(label)
            self.l_cirRNA_FPKM[label] = rpkm_table.matrix[:, column]

        # Get cirRNA mols using ERCC_FPKM, ERCC_MOLs and cirRNA_FPKM.
        self.__get_cirRNA_MOLs()
 def __load_Count(self):  # for ERCC count Stat
    """Load the ``dexseq_ERCC_RGCPloyA`` count matrix and compute its RPKM.

    Gene information for ``cal_RPKM`` comes from the annotation GTF named by
    ``self['infile']['anno_file']``. Nothing is returned.
    """
    sample_labels = [
        self['sam_info']['samp_brief'][samp] for samp in self['sample']
    ]

    annotation = m_gtf.GTFFeature(self['infile']['anno_file'])

    spike_counts = m_cnt.CountInfo(
        self.HTS_k, sample_labels, "dexseq_ERCC_RGCPloyA", self.HTS)
    spike_counts.load_mat()
    spike_counts.cal_RPKM(annotation.gene, self.tophat)
   def RPKM_novo_trans(self):
      """Compute RPKM for ``novo_lnc_raw.combined.gtf`` and annotate the GTF.

      Sequence of calls:
        * load the GTF from ``self.data_dir`` and call ``get_intergenic``
          with ``self['infile']['intragenic_bed']``;
        * generate and load the ``dexseq_NeoRaw`` count matrix, then call
          ``cal_RPKM``;
        * feed ``merge.dexseq_NeoRaw.RPKM.xls`` (under ``self.HTS``) back into
          the GTF object, then call ``output_GTF`` and ``get_gene_info``.
      """
      count_type = "dexseq_NeoRaw"

      sample_labels = [
         "%s" % ( self['samp_info']['samp_brief'][samp] )
         for samp in self['samp']
      ]
      novo_gtf_path = "%s/novo_lnc_raw.combined.gtf" % ( self.data_dir )

      novo_gtf = m_gtf.GTFFeature( novo_gtf_path )
      novo_gtf.get_intergenic( self['infile']['intragenic_bed'] )

      novo_counts = m_cnt.CountInfo(
         self.HTS_u, sample_labels, count_type, self.HTS )
      novo_counts.generate_mat()
      novo_counts.load_mat()
      novo_counts.cal_RPKM( novo_gtf.gene, self.tophat )

      rpkm_path = "%s/merge.%s.RPKM.xls" % ( self.HTS, count_type )

      novo_gtf.load_gene_RPKM( rpkm_path )
      novo_gtf.output_GTF()
      novo_gtf.get_gene_info()
Exemple #4
0
    def Basic_Stat(self):
        """Write the per-sample QC / Tophat mapping / spike-in summary table.

        Output is ``<statInfo>/01.BasicInfo_QC_map_SpikeIn.xls``: a
        tab-separated header line followed by one line per entry of
        ``self['sample']``. For samples whose ``data_type`` is ``"PE"`` the
        HTSeq and spike-in read counts are doubled before being written.

        The output file is held in a ``with`` block so the handle is closed
        even when a log file or count matrix fails to load part-way through.
        """
        out_file = "%s/01.BasicInfo_QC_map_SpikeIn.xls" % (self.statInfo)

        header = "\t".join([
            "Sample", "Brief_samp", "Raw_Reads", "Clean_Reads",
            "Pre_Map_Reads", "Aligned_Reads", "HTSseq_Known_Reads",
            "HTSeq_Refseq_Reads", "HTSeq_NONCODE_V4_Reads", "HTSeq_Nsmb_Reads",
            "RFP_Reads", "GFP_Reads", "CRE_Reads", "ERCC_Reads",
            "RFP_Mols", "GFP_Mols", "CRE_Mols", "ERCC_Mols",
        ])
        # 2 names, 12 integer read counts, 4 molecule counts.
        row_fmt = ("%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t"
                   "%1.2e\t%1.2e\t%1.2e\t%1.2e")

        with open(out_file, "w") as f_out_file:
            f_out_file.write(header + "\n")

            l_breif_samp = [
                self['sam_info']['samp_brief'][samp] for samp in self['sample']
            ]

            def load_counts(ltype):
                # Build, load and total one HTSeq count matrix of type `ltype`.
                info = m_cnt.CountInfo(self.HTS_k, l_breif_samp, ltype,
                                       self.HTS)
                info.load_mat()
                info.sam_tot_reads()
                return info

            # Load the HTSeq count matrices.
            HTS_info = load_counts("dexseq_clean")

            # Kept between the full matrix and the per-database matrices, as in
            # the original order; presumably it produces the split inputs
            # loaded below -- TODO confirm.
            self.__get_HTS_clean_split()

            Refseq_info = load_counts("dexseq_clean_refseq")
            NONCODE_info = load_counts("dexseq_clean_NONCODE")
            NSMB_info = load_counts("dexseq_clean_NSMB")

            # Load other information, one sample at a time.
            for samp in self['sample']:
                brief_name = self['sam_info']['samp_brief'][samp]
                QC_log = "%s/%s/log" % (self.cln, samp)
                Tophat_log = "%s/%s/align_summary.txt" % (self.tophat,
                                                          brief_name)
                HTSeq_SpikeIn = "%s/%s/%s.dexseq_ERCC_RGCPloyA.txt" % (
                    self.HTS_k, brief_name, brief_name)

                QcStat_info = Stat.QcStat(QC_log)
                MapStat_info = Stat.TophatStat(Tophat_log)
                SpikeIn_info = Stat.SpikeIn(HTSeq_SpikeIn,
                                            self['infile']['ercc_info'])

                QcStat_info.read_infile()
                MapStat_info.read_infile()
                SpikeIn_info.load_HTS_file()

                pre_map_read = MapStat_info['statInfo']['totalRead']
                aligned_read = MapStat_info['statInfo']['mappedRead']

                # "PE" samples have their counts doubled; everything else is
                # written as counted.
                if self['sam_info']['data_type'][samp] == "PE":
                    scale = 2
                else:
                    scale = 1

                HTSseq_read = self.__get_HTS_reads(HTS_info, samp) * scale
                Refseq_read = self.__get_HTS_reads(Refseq_info, samp) * scale
                NONCODE_read = self.__get_HTS_reads(NONCODE_info, samp) * scale
                NSMB_read = self.__get_HTS_reads(NSMB_info, samp) * scale
                read_RFP = SpikeIn_info.RGC_count['RGC-mRFP'] * scale
                read_GFP = SpikeIn_info.RGC_count['RGC-GFP'] * scale
                read_CRE = SpikeIn_info.RGC_count['RGC-CRE'] * scale
                read_ERCC = SpikeIn_info.ERCC_total * scale

                mol_RFP = self['sam_info']['RFP_mols'][samp]
                mol_GFP = self['sam_info']['GFP_mols'][samp]
                mol_CRE = self['sam_info']['CRE_mols'][samp]
                # NOTE(review): 6.023 * 10**10 is an unexplained constant in
                # the original; its derivation should be confirmed.
                mol_ERCC = self['sam_info']['dilute'][samp] * 6.023 * 10**10

                out_info = row_fmt % (
                    samp, brief_name,
                    QcStat_info.raw_reads, QcStat_info.cln_reads,
                    pre_map_read, aligned_read, HTSseq_read,
                    Refseq_read, NONCODE_read, NSMB_read,
                    read_RFP, read_GFP, read_CRE, read_ERCC,
                    mol_RFP, mol_GFP, mol_CRE, mol_ERCC)
                f_out_file.write(out_info + "\n")
Exemple #5
0
    def Basic_Stat(self):
        """Write the per-sample QC / Tophat mapping / circRNA / spike-in table.

        Output is ``<statInfo>/01.BasicInfo_QC_map_SpikeIn.xls``: a
        tab-separated header line followed by one line per entry of
        ``self['sample']``. The circular-RNA columns are read from the
        ``04.CIRC_info`` MinDepth matrices, so this must run after CIRC_Stat
        (per the original author's note).

        The output file is held in a ``with`` block so the handle is closed
        even when a log file or matrix fails to load part-way through.
        """
        out_file = "%s/01.BasicInfo_QC_map_SpikeIn.xls" % (self.statInfo)

        header = "\t".join([
            "Sample", "Brief_samp", "Raw_Reads", "Clean_Reads",
            "Pre_Map_Reads", "Aligned_Reads", "Refseq_Reads",
            "Circular_genes", "Circular_exons", "Circular_reads",
            "RFP_Reads", "GFP_Reads", "CRE_Reads", "ERCC_Reads",
            "RFP_Mols", "GFP_Mols", "CRE_Mols", "ERCC_Mols",
        ])
        # 2 names, 12 integer counts, 4 molecule counts.
        row_fmt = ("%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t"
                   "%1.2e\t%1.2e\t%1.2e\t%1.2e")

        with open(out_file, "w") as f_out_file:
            f_out_file.write(header + "\n")

            l_breif_samp = [
                self['sam_info']['samp_brief'][samp] for samp in self['sample']
            ]

            # Load refseq reads.
            Refseq_info = m_cnt.CountInfo(self.HTS_k, l_breif_samp,
                                          "dexseq_clean", self.HTS)
            Refseq_info.load_mat()
            Refseq_info.sam_tot_reads()

            # Load circular reads. Run this step after CIRC_Stat.
            circ_prefix = "%s/04.CIRC_info/CIRC_PE_result.merge_" % (
                self.statInfo)
            exon_level_MinDepth = "%sexon_level.MinDepth_%d.xls" % (
                circ_prefix, self.min_depth)
            gene_level_MinDepth = "%sgene_level.MinDepth_%d.xls" % (
                circ_prefix, self.min_depth)

            exon_level_mat = m_mat.Matrix_info(exon_level_MinDepth, 2)
            gene_level_mat = m_mat.Matrix_info(gene_level_MinDepth, 1)

            exon_level_mat.load_mat()
            gene_level_mat.load_mat()

            # Reduced along axis 0 and indexed by sample position below:
            #   - number of gene-level entries with value >= self.min_depth
            #   - number of exon-level entries with value >= self.min_depth
            #   - sum of all exon-level values
            np_gene_cirs_samp = np.sum(
                gene_level_mat.matrix >= self.min_depth, axis=0)
            np_exon_cirs_samp = np.sum(
                exon_level_mat.matrix >= self.min_depth, axis=0)
            np_exon_read_samp = np.sum(exon_level_mat.matrix, axis=0)

            # Load other information, one sample at a time.
            for idx, samp in enumerate(self['sample']):
                brief_name = self['sam_info']['samp_brief'][samp]
                QC_log = "%s/%s/log" % (self.cln, samp)
                Tophat_log = "%s/%s/align_summary.txt" % (self.tophat,
                                                          brief_name)
                HTSeq_SpikeIn = "%s/%s/%s.dexseq_ERCC_RGCPloyA.txt" % (
                    self.HTS_k, brief_name, brief_name)

                QcStat_info = Stat.QcStat(QC_log)
                MapStat_info = Stat.TophatStat(Tophat_log)
                SpikeIn_info = Stat.SpikeIn(HTSeq_SpikeIn,
                                            self['infile']['ercc_info'])

                QcStat_info.read_infile()
                MapStat_info.read_infile()
                SpikeIn_info.load_HTS_file()

                pre_map_read = MapStat_info['statInfo']['totalRead']
                aligned_read = MapStat_info['statInfo']['mappedRead']
                refseq_read = self.__get_refseq_reads(Refseq_info, samp)

                # assumes matrix columns follow the order of self['sample']
                # -- TODO confirm against Matrix_info.load_mat
                cir_genes = np_gene_cirs_samp[idx]
                cir_exons = np_exon_cirs_samp[idx]
                cir_reads = np_exon_read_samp[idx]

                read_RFP = SpikeIn_info.RGC_count['RGC-mRFP']
                read_GFP = SpikeIn_info.RGC_count['RGC-GFP']
                read_CRE = SpikeIn_info.RGC_count['RGC-CRE']
                read_ERCC = SpikeIn_info.ERCC_total

                mol_RFP = self['sam_info']['RFP_mols'][samp]
                mol_GFP = self['sam_info']['GFP_mols'][samp]
                mol_CRE = self['sam_info']['CRE_mols'][samp]
                # NOTE(review): 6.023 * 10**10 is an unexplained constant in
                # the original; its derivation should be confirmed.
                mol_ERCC = self['sam_info']['dilute'][samp] * 6.023 * 10**10

                out_info = row_fmt % (
                    samp, brief_name,
                    QcStat_info.raw_reads, QcStat_info.cln_reads,
                    pre_map_read, aligned_read, refseq_read,
                    cir_genes, cir_exons, cir_reads,
                    read_RFP, read_GFP, read_CRE, read_ERCC,
                    mol_RFP, mol_GFP, mol_CRE, mol_ERCC)
                f_out_file.write(out_info + "\n")