コード例 #1
0
class Annotator(object):
    def __init__(self,args):
        """Remember the parsed run options; every other method reads them via ``self.args``."""
        self.args = args

    @staticmethod
    def do_fisher_test(x,num1,num2):
        """do fisher-exact test for go annotation"""
        fy = x['gene_num']
        by = x['othergene_num']
        o,p = fisher_exact([[fy,num1 - fy],[by,num2 - by]],alternative="greater")
        return p
        pass

    @staticmethod
    def do_sig_tag(x):
        if x <= 0.001:
            return "***"
        elif x <= 0.01:
            return "**"
        elif x <= 0.05:
            return "*"
        else:
            return "-"

    @staticmethod
    def p_adjust_bh(p):
        """Benjamini-Hochberg p-value correction for multiple hypothesis testing."""
        p = asfarray(p)
        by_descend = p.argsort()[::-1]
        by_orig = by_descend.argsort()
        steps = float(len(p)) / arange(len(p), 0, -1)
        q = minimum(1, minimum.accumulate(steps * p[by_descend]))
        return q[by_orig]

    @staticmethod
    def to_output(h_values,wb,outtag,cname,title):
        """Write one result table to the open output target.

        With ``outtag`` equal to "ms-excel" (case-insensitive) the table goes
        to a sheet named "Cluster <cname> <title>" of the writer ``wb``;
        for any other value it is written to ``wb`` as tab-separated rows
        without index and without header.
        """
        wants_excel = outtag.lower() == "ms-excel"
        if not wants_excel:
            h_values.to_csv(wb, index=False, header=False, sep="\t", quotechar="\t")
            return
        sheet = "Cluster " + cname + " " + title
        h_values.to_excel(wb, sheet_name=sheet, index=False)

    @staticmethod
    def translate_go(name="go.obo"):
        """extract go annotation dicts from obo"""
        ids = []
        names = []
        cls = []
        infile = None
        if name.endswith(".gz"):
            infile = gzip.open(name,"rt")
        else:
            infile = open(name,"rt")
        for line in infile:
            if line.strip().startswith("id:"):
                ids.append(line.strip()[4:])
            if line.strip().startswith("name:"):
                names.append(line.strip()[6:])
            if line.strip().startswith("namespace:"):
                cls.append(line.strip()[11:])
        infile.close()
        return dict(zip(ids,names))


    def do_go_annotation(self,gof,fore,back,cname,gtype):
        """return go annotation with significance tag"""
        fil = gof[2].map(lambda value: len(set([value]) & fore) > 0)
        fgnames = gof[fil].groupby(by=4)[2].unique()
        bfil = gof[2].map(lambda value: len(set([value]) & back) > 0)
        bgnames = gof[bfil].groupby(by=4)[2].unique()
        dat = DataFrame({"genes":fgnames,"othergenes":bgnames})
        num1 = len(fore)
        num2 = len(back)
        dat = dat.dropna(axis=0)
        dat['gene_num'] = dat['genes'].map(lambda x:x.size)
        dat['othergene_num'] = dat['othergenes'].map(lambda x:x.size)
        if dat.size ==0:
            return dat

        dat['p-value'] =  dat.apply(Annotator.do_fisher_test,axis=1,args=(num1,num2))
        used = dat.loc[:,dat.columns]   #.head(5)
        used['ids'] = used.index
        used['q-value'] = Annotator.p_adjust_bh(used['p-value'])
        used['sig'] = used['q-value'].map(Annotator.do_sig_tag)
        used['go_name'] = used['ids'].map(lambda x:self.gos[x])
        outs = used.sort_values(by='p-value')[['ids','gene_num','othergene_num','p-value','q-value',"sig",'go_name']]
        if self.args.noprint == False:
            print(outs.head(5).to_string(index=False))
        if self.args.output:
            outs['cluster'] = cname
            outs['go_class'] = gtype
        return outs
        pass

    def print_class(self,h_values,cname):
        """print cell predictions with scores."""
        o = ""
        titlebar = "-" * 60 + "\n"
        #print(h_values)
        #print(h_values.size)
        #if h_values is not None:
        #    print(h_values.size)
        if h_values is None:
            if self.args.noprint:
                return "E",None,"-","-","-"
            o += titlebar
            o += "{0:<10}{1:^30}{2:<10}".format("Type","Cell Type","Score")
            o += "\n" + "-"*60 + "\n"
            o += "{0:<10}{1:^30}{2:<10}".format("-","-","-")
            o += "\n" + titlebar
            return "E",None,"-","-","-"
        elif h_values.size == 0:
            if self.args.noprint:
                return "N",None,"-","-","-"
            o += titlebar
            o += "{0:<10}{1:^30}{2:<10}".format("Type","Cell Type","Score")
            o += "\n" + "-"*60 + "\n"
            o += "{0:<10}{1:^30}{2:<10}".format("-","-","-")
            o += "\n" + titlebar
            return "N",None,"-","-","-"
        elif h_values.size == 3:
            if self.args.noprint:
                return "Good",o,h_values.values[0][0],h_values.values[0][1],"-"
            o += titlebar
            o += "{0:<10}{1:^30}{2:<10}{3:<5}".format("Type","Cell Type","Score","Times")
            o += "\n" + "-"*60 + "\n"
            o += "{0:<10}{1:^30}{2:<10.4f}".format("Good",h_values.values[0][0],h_values.values[0][1])
            o += "\n" + titlebar
            return "Good",o,h_values.values[0][0],h_values.values[0][1],"-"
            pass
        elif h_values.size == 2:
            if self.args.noprint:
                return "Good",o,h_values.values[0][0],h_values.values[0][1],"-"
            o += titlebar
            o += "{0:<10}{1:^30}{2:<10}{3:<5}".format("Type","Cell Type","Score","Times")
            o += "\n" + "-"*60 + "\n"
            o += "{0:<10}{1:^30}{2:<10.4f}".format("Good",h_values.values[0][0],h_values.values[0][1])
            o += "\n" + titlebar
            return "Good",o,h_values.values[0][0],h_values.values[0][1],"-"
            pass
        elif float(h_values.iloc[0,1])/float(h_values.iloc[1,1]) >= 2 or float(h_values.iloc[1,1] < 0):
            times = np.abs(float(h_values.iloc[0,1])/float(h_values.iloc[1,1]))
            if self.args.noprint:
                return "Good",o,h_values.values[0][0],h_values.values[0][1],times
            o += titlebar
            o += "{0:<10}{1:^30}{2:<10}{3:<5}".format("Type","Cell Type","Score","Times")
            o += "\n" + titlebar
            o += "{0:<10}{1:^30}{2:<10.4f}{3:<5.1f}".format("Good",h_values['Cell Type'].values[0],h_values['Z-score'].values[0],times)
            o += "\n" + titlebar
            return "Good",o,h_values['Cell Type'].values[0],h_values['Z-score'].values[0],times
            pass
        else:
            times = np.abs(float(h_values.iloc[0,1])/float(h_values.iloc[1,1]))
            if self.args.noprint:
                return "?",o,str(h_values['Cell Type'].values[0]) + "|" + str(h_values['Cell Type'].values[1]),str(h_values['Z-score'].values[0]) + "|" + str(h_values['Z-score'].values[1]),times
            o += titlebar
            o += "{0:<10}{1:^30}{2:<10}{3:<5}".format("Type","Cell Type","Score","Times")
            o += "\n" + titlebar
            o += "{0:<10}{1:^30}{2:<10.4f}{3:<5.1f}".format("?",h_values['Cell Type'].values[0],h_values['Z-score'].values[0],times)
            o += "\n" + titlebar
            o += "{0:<10}{1:^29}({2:<.4f})".format("","("+h_values['Cell Type'].values[1]+")",h_values['Z-score'].values[1])
            o += "\n" + titlebar
            return "?",o,str(h_values['Cell Type'].values[0]) + "|" + str(h_values['Cell Type'].values[1]),str(h_values['Z-score'].values[0]) + "|" + str(h_values['Z-score'].values[1]),times
        pass

    def deal_with_badtype(self,cname,other_gene_names,colnames):
        """go annotation need to be performed"""
        if len(self.human_gofs) != 0:
            fset = set()
            bset = set()
            if colnames is None:
                print("!WARNING(go processing):Zero gene sets found for the cluster",cname)
                print("!WARNING(go processing):Change the threshold and try again?")
                return 
            for c in colnames:
                if c in self.ensem_hgncs:
                    fset.add(self.ensem_hgncs[c])
                else:
                    fset.add(c)
            for c in other_gene_names:
                if c in self.ensem_hgncs:
                    bset.add(self.ensem_hgncs[c])
                else:
                    bset.add(c)
            if len(fset) == 0:
                print("!WARNING(go processing):Zero gene sets found for the cluster",cname)
                print("!WARNING(go processing):Change the threshold and try again?")
                return 
            if len(bset) == 0:
                print("!WARNING(go processing):Zero gene sets found for other clusters")
                print("!WARNING(go processing):Change the threshold and try again?")
                return 
            names = ["Function","Component","Process"]
            if self.args.noprint == False:
                print("Go Enrichment analysis:","Group1:",len(fset),"Group2:",len(bset))
            if len(fset) > 0 and len(bset) > 0:
                all_outs = DataFrame()
                for i,f in enumerate(self.human_gofs):
                    o = " ".join([">"*30,names[i], "<"*30])
                    if self.args.noprint == False:
                        print(o)
                    outs = self.do_go_annotation(f,fset,bset,cname,names[i][0])
                    if outs.size == 0:continue
                    if all_outs.size == 0:
                        all_outs = outs
                    else:
                        all_outs = all_outs.append(outs)
                    if self.args.noprint == False:
                        print()
                if self.args.output:
                    Annotator.to_output(all_outs,self.wbgo,self.args.outfmt,cname,"GO")



    def calcu_cellranger_group(self,expfile,hgvc=False):
        """Annotate the clusters of a Cell Ranger differential-expression CSV.

        The table layout is detected from the column names:

        * a "Feature ID" column selects the V3 layout,
        * a "Cluster 1 Mean UMI Counts" column selects the V2 layout,
        * anything else is treated as V1 ("Cluster N Weight" columns).

        For V2/V3 ``self.args.weight`` is overwritten with
        ``self.args.foldchange`` and rows are additionally filtered by the
        per-cluster " Adjusted p value" column.

        Parameters
        ----------
        expfile : str or file-like
            Input handed to ``read_csv``.
        hgvc : bool
            When true, genes are matched by name instead of by id.

        Returns
        -------
        list
            One ``[cluster, tag, cell_type, score, times]`` entry (see
            ``print_class``) per processed cluster.

        Side effects: sets ``self.wb`` / ``self.wbgo`` and writes the result
        tables when ``self.args.output`` is set; calls ``sys.exit(0)`` on an
        unknown target, an unknown output format or a missing cluster column.
        """
        exps = read_csv(expfile)
        columns = exps.columns

        pre,suf1 = "Cluster "," Weight"
        fid = "Gene Name" if hgvc == True else "Gene ID"
        gcol = "gene" if hgvc == True else "ensemblID"
        ccol = "cellName"

        target = self.args.target.lower()
        if target not in ["cancersea","cellmarker"]:
            print("Error target : -t, --target,(cellmarker,[cancersea])")
            sys.exit(0)

        if target == "cancersea":
            ccol = "name"

        abs_tag = False

        # V1 default; the V2/V3 branches below recompute the cluster count.
        cnum = int(len(exps.columns) / 2 - 1)
        ver_tag = "V1"
        pname = ""

        if "Feature ID" in columns: # v3
            fid = "Feature Name" if hgvc == True else "Feature ID"
            pre,suf1 = "Cluster "," Log2 fold change"
            cnum = int((len(exps.columns)-2) / 3)
            pname = " Adjusted p value"
            self.args.weight = self.args.foldchange
            ver_tag = "V3"
        elif "Cluster 1 Mean UMI Counts" in columns: # v2
            fid = "Gene Name" if hgvc == True else "Gene ID"
            pre,suf1 = "Cluster "," Log2 fold change"
            cnum = int((len(exps.columns)-2) / 3)
            self.args.weight = self.args.foldchange
            pname = " Adjusted p value"
            ver_tag = "V2"
        outs = []

        self.wb = self.wbgo = None
        if self.args.output:
            if self.args.outfmt.lower() == "ms-excel":
                if not self.args.output.endswith(".xlsx") and (not self.args.output.endswith(".xls")):
                    self.args.output += ".xlsx"
                self.wb = ExcelWriter(self.args.output)
                # Cell types and GO results share one workbook.
                self.wbgo = self.wb
            elif self.args.outfmt.lower() == "txt":
                self.wb = open(self.args.output,"w")
                self.wb.write("Cell Type\tZ-score\tCluster\n")
                self.wbgo = open(self.args.output + ".go","w")
                self.wbgo.write('ids\tgene_num\tothergene_num\tp-value\tq-value\tsig\tname\tcluster\tgo_class\n')
            else:
                print("Error output format: -m, --outfmt,(ms-excel,[txt])")
                sys.exit(0)

        for i in range(1,cnum+1):
            cname = str(i)
            # "all" processes everything; otherwise a single name or a
            # comma-separated list of cluster names is accepted.
            if self.args.cluster != "all" and cname not in self.args.cluster.split(","):
                continue
            o = " ".join(["#"*30,"Cluster",cname, "#"*30]) + "\n"
            if self.args.noprint == False:
                print(o)
            ptitle = pre + cname + pname
            ltitle = pre + cname + suf1
            if ltitle not in exps.columns:
                print(ltitle,"column not in the input table!")
                sys.exit(0)

            if ver_tag == "V1":
                newexps = exps[exps[ltitle]>=self.args.weight]
            else:
                newexps = exps[(exps[ltitle]>=self.args.weight) & (exps[ptitle] <= self.args.pvalue)]
            h_values,colnames = self.get_cell_matrix(newexps,ltitle,fid,gcol,ccol,abs_tag)
            # V3 tables have no "Gene ID" column; fall back to the id column
            # chosen for this layout instead of failing with a KeyError.
            count_col = 'Gene ID' if 'Gene ID' in newexps.columns else fid
            print("Cluster " + cname + " Gene number:",newexps[count_col].unique().shape[0])
            if h_values is None:
                t,o_str,c,v,times = self.print_class(h_values,cname)
                outs.append([cname,t,c,v,times])
                if self.args.noprint == False:
                    print(o_str)
                continue
            h_values['Cluster'] = cname
            if self.args.output:
                Annotator.to_output(h_values,self.wb,self.args.outfmt,cname,"Cell Type")

            t,o_str,c,v,times = self.print_class(h_values,cname)
            outs.append([cname,t,c,v,times])
            if self.args.noprint == False:
                print(o_str)

            # Collect the marker genes of every other cluster as the GO background.
            other_gene_names = set()
            for j in range(1,cnum + 1):
                if i == j:
                    continue
                jtitle = pre + str(j) + suf1
                if ver_tag == "V1":
                    otherexps = exps[exps[jtitle]>=self.args.weight]
                else:
                    # Filter cluster j by its own adjusted p-value column,
                    # not by the one of the cluster being annotated.
                    jptitle = pre + str(j) + pname
                    otherexps = exps[(exps[jtitle]>=self.args.weight) & (exps[jptitle] <= self.args.pvalue)]
                if target == "cancersea":
                    tfc,trownames,trownum,tcolnames,tcolnum = self.get_cell_gene_names(otherexps,self.smarkers,fid,gcol,ccol,"other")
                    other_gene_names |= set(tcolnames)
                elif target == "cellmarker":
                    tfc,trownames,trownum,tcolnames,tcolnum = self.get_cell_gene_names(otherexps,self.cmarkers,fid,gcol,ccol,"other")
                    if not trownames:
                        continue
                    other_gene_names |= set(tcolnames)
            self.deal_with_badtype(cname,other_gene_names,colnames)
        if self.args.output:
            self.wb.close()
            # In Excel mode both attributes are the same writer; close it once.
            if self.wbgo is not self.wb:
                self.wbgo.close()
        if self.args.noprint == False:
            print("#"*80 + "\n")
        return outs

    def calcu_seurat_group(self,expfile,hgvc=False):
        """Annotate the clusters of a Seurat marker table.

        The CSV must provide the columns "gene", "cluster", "avg_logFC" and
        "p_val_adj".  Every "." followed by digits is removed from the gene
        column before matching.  For each selected cluster the rows passing
        ``self.args.foldchange`` and ``self.args.pvalue`` are scored with
        ``get_cell_matrix``; the rows of all other clusters passing the same
        thresholds form the background for the GO analysis.

        Parameters
        ----------
        expfile : str or file-like
            Input handed to ``read_csv``.
        hgvc : bool
            When true, genes are matched by name instead of by id.

        Returns
        -------
        list
            One ``[cluster, tag, cell_type, score, times]`` entry (see
            ``print_class``) per processed cluster.

        Raises
        ------
        AssertionError
            When the table has no "gene" column.

        Side effects: sets ``self.wb`` / ``self.wbgo`` and writes the result
        tables when ``self.args.output`` is set; calls ``sys.exit(0)`` on an
        unknown target, an unknown output format or a missing "avg_logFC"
        column.
        """
        exps = read_csv(expfile)
        pre = "avg_logFC"
        fid = "gene"
        pname = "p_val_adj"
        assert fid in exps.columns, 'No "gene" column. Wrong format? Seurat, Scanpy or Cellranger?'
        # regex=True is required: since pandas 2.0 str.replace treats the
        # pattern as a literal string by default, which would strip nothing.
        exps[fid] = exps[fid].str.replace(r"\.\d+","",regex=True)
        cluster = "cluster"
        gcol = "gene" if hgvc == True else "ensemblID"
        ccol = "cellName"

        target = self.args.target.lower()
        if target not in ["cancersea","cellmarker"]:
            print("Error target : -t, --target,(cellmarker,[cancersea])")
            sys.exit(0)

        if target == "cancersea":
            ccol = "name"

        cnum = list(exps[cluster].unique())
        abs_tag = True
        outs = []
        self.wb = self.wbgo = None
        if self.args.output:
            if self.args.outfmt.lower() == "ms-excel":
                if not self.args.output.endswith(".xlsx") and (not self.args.output.endswith(".xls")):
                    self.args.output += ".xlsx"
                self.wb = ExcelWriter(self.args.output)
                # Cell types and GO results share one workbook.
                self.wbgo = self.wb
            elif self.args.outfmt.lower() == "txt":
                self.wb = open(self.args.output,"w")
                # Compare the lower-cased target, like every other check in
                # this method, so "CancerSEA" also gets the Note column.
                if target == "cancersea":
                    self.wb.write("Cell Type\tZ-score\tNote\tCluster\n")
                else:
                    self.wb.write("Cell Type\tZ-score\tCluster\n")
                self.wbgo = open(self.args.output + ".go","w")
                self.wbgo.write('ids\tgene_num\tothergene_num\tp-value\tq-value\tsig\tname\tcluster\tgo_class\n')
            else:
                print("Error output format: -m, -outfmt,(ms-excel,[txt])")
                sys.exit(0)

        for i in cnum:
            cname = str(i)
            # "all" processes everything; otherwise a single name or a
            # comma-separated list of cluster names is accepted.
            if self.args.cluster != "all" and cname not in self.args.cluster.split(","):
                continue
            o = " ".join(["#"*30,"Cluster",cname, "#"*30]) + "\n"
            if self.args.noprint == False:
                print(o)
            ltitle = pre
            ptitle = pname
            if ltitle not in exps.columns:
                print(ltitle,"column not in the input table!")
                sys.exit(0)
            # Rows passing both thresholds, regardless of cluster.
            passed = (exps[ltitle]>=self.args.foldchange) & (exps[ptitle] <= self.args.pvalue)
            newexps = exps[(exps[cluster] == i) & passed]

            h_values,colnames = self.get_cell_matrix(newexps,ltitle,fid,gcol,ccol,abs_tag)
            print("Cluster " + cname + " Gene number:",newexps['gene'].unique().shape[0])
            # get_cell_matrix may yield None (print_class reports that as
            # "E"); there is nothing to write in that case.
            if self.args.output and h_values is not None:
                h_values['Cluster'] = cname
                Annotator.to_output(h_values,self.wb,self.args.outfmt,cname,"Cell Type")

            t,o_str,c,v,times = self.print_class(h_values,cname)
            outs.append([cname,t,c,v,times])
            if self.args.noprint == False:
                print(o_str)

            # Background for the GO analysis: all other clusters.
            otherexps = exps[(exps[cluster] != i) & passed]
            markers = self.cmarkers if target == "cellmarker" else self.smarkers
            tfc,trownames,trownum,tcolnames,tcolnum = self.get_cell_gene_names(otherexps,markers,fid,gcol,ccol,'other')
            if not trownames:
                continue
            other_gene_names = set(tcolnames)
            self.deal_with_badtype(cname,other_gene_names,colnames)
            print("Other Gene number:",len(other_gene_names))
        if self.args.output:
            self.wb.close()
            # In Excel mode both attributes are the same writer; close it once.
            if self.wbgo is not self.wb:
                self.wbgo.close()
        if self.args.noprint == False:
            print("#"*80 + "\n")
        return outs

    def calcu_scanpy_group(self,expfile,hgvc=False):
        """Annotate the clusters of a Scanpy marker table.

        Column names are expected as "<cluster>_<field>".  The field whose
        name starts with "n" supplies the gene identifiers, the one starting
        with "l" the fold change and the one starting with "p" the p-value;
        "n", "l" and "p" are used when no such column is seen.  For each
        selected cluster the rows passing ``self.args.foldchange`` and
        ``self.args.pvalue`` are scored with ``get_cell_matrix``; the passing
        rows of all other clusters form the background for the GO analysis.

        Parameters
        ----------
        expfile : str or file-like
            Input handed to ``read_csv`` (first column is the index).
        hgvc : bool
            When true, genes are matched by name instead of by id.

        Returns
        -------
        list
            One ``[cluster, tag, cell_type, score, times]`` entry (see
            ``print_class``) per processed cluster.

        Raises
        ------
        ValueError
            When a column name contains no underscore.

        Side effects: sets ``self.wb`` / ``self.wbgo`` and writes the result
        tables when ``self.args.output`` is set; calls ``sys.exit(0)`` on an
        unknown target, an unknown output format or a missing fold-change
        column.
        """
        exps = read_csv(expfile,index_col=0)
        cnum = set()
        pname = "p"
        pre = "l"
        # Default suffix of the gene-name columns; it must exist even when the
        # loop below never meets an "n..." field.
        rfid = "n"
        for c in exps.columns:
            # Split on the last underscore so cluster names may contain "_".
            k,v = c.rsplit("_",1)
            cnum.add(k)
            if v.startswith("p"):
                pname = v
            elif v.startswith("n"):
                rfid = v
            elif v.startswith("l"):
                pre = v

        ###MarkerBase
        gcol = "gene" if hgvc == True else "ensemblID"
        ccol = "cellName"

        target = self.args.target.lower()
        if target not in ["cancersea","cellmarker"]:
            print("Error target : -t, --target,(cellmarker,[cancersea])")
            sys.exit(0)

        if target == "cancersea":
            ccol = "name"

        abs_tag = True
        outs = []
        self.wb = self.wbgo = None
        if self.args.output:
            if self.args.outfmt.lower() == "ms-excel":
                if not self.args.output.endswith(".xlsx") and (not self.args.output.endswith(".xls")):
                    self.args.output += ".xlsx"
                self.wb = ExcelWriter(self.args.output)
                # Cell types and GO results share one workbook.
                self.wbgo = self.wb
            elif self.args.outfmt.lower() == "txt":
                self.wb = open(self.args.output,"w")
                # Compare the lower-cased target, like every other check in
                # this method, so "CancerSEA" also gets the Note column.
                if target == "cancersea":
                    self.wb.write("Cell Type\tZ-score\tNote\tCluster\n")
                else:
                    self.wb.write("Cell Type\tZ-score\tCluster\n")
                self.wbgo = open(self.args.output + ".go","w")
                self.wbgo.write('ids\tgene_num\tothergene_num\tp-value\tq-value\tsig\tname\tcluster\tgo_class\n')
            else:
                print("Error output format: -m, -outfmt,(ms-excel,[txt])")
                sys.exit(0)

        clusters = sorted(cnum)
        # Uniform column names for the stacked tables of the other clusters.
        ofid = 'o_n'
        oltitle = 'o_l'
        optitle = 'o_p'

        for cname in clusters:
            # "all" processes everything; otherwise a single name or a
            # comma-separated list of cluster names is accepted.
            if self.args.cluster != "all" and cname not in self.args.cluster.split(","):
                continue
            o = " ".join(["#"*30,"Cluster",cname, "#"*30]) + "\n"
            if self.args.noprint == False:
                print(o)
            ltitle = cname + "_" + pre
            fid = cname + "_" + rfid
            ptitle = cname + "_" + pname
            if ltitle not in exps.columns:
                print(ltitle,"column not in the input table!")
                sys.exit(0)
            newexps = exps[[fid,ltitle,ptitle]][(exps[ltitle]>=self.args.foldchange) & (exps[ptitle] <= self.args.pvalue)]

            h_values,colnames = self.get_cell_matrix(newexps,ltitle,fid,gcol,ccol,abs_tag)
            print("Cluster " + cname + " Gene number:",newexps[fid].unique().shape[0])
            # get_cell_matrix may yield None (print_class reports that as
            # "E"); there is nothing to write in that case.
            if self.args.output and h_values is not None:
                h_values['Cluster'] = cname
                Annotator.to_output(h_values,self.wb,self.args.outfmt,cname,"Cell Type")

            t,o_str,c,v,times = self.print_class(h_values,cname)
            outs.append([cname,t,c,v,times])
            if self.args.noprint == False:
                print(o_str)

            # Stack the filtered rows of every other cluster as GO background.
            pieces = []
            for oname in clusters:
                if oname == cname:
                    continue
                tltitle = oname + "_" + pre
                tfid = oname + "_" + rfid
                tptitle = oname + "_" + pname
                tempexps = exps[[tfid,tltitle,tptitle]][(exps[tltitle]>=self.args.foldchange) & (exps[tptitle] <= self.args.pvalue)]
                tempexps.columns = [ofid,oltitle,optitle]
                pieces.append(tempexps)
            if not pieces:
                # Only one cluster in the table: no background to compare with.
                continue
            otherexps = pd.concat(pieces)

            markers = self.cmarkers if target == "cellmarker" else self.smarkers
            tfc,trownames,trownum,tcolnames,tcolnum = self.get_cell_gene_names(otherexps,markers,ofid,gcol,ccol,'other')
            if not trownames:
                continue
            other_gene_names = set(tcolnames)
            self.deal_with_badtype(cname,other_gene_names,colnames)
            print("Other Gene number:",len(other_gene_names))
        if self.args.output:
            self.wb.close()
            # In Excel mode both attributes are the same writer; close it once.
            if self.wbgo is not self.wb:
                self.wbgo.close()
        if self.args.noprint == False:
            print("#"*80 + "\n")
        return outs

    def get_exp_matrix_loop(self,exps,ltitle,fid,colnames,rownames,cell_matrix,usertag,abs_tag = True):
        """Score every cell type from the marker weights and the expression values.

        Parameters:
            exps: expression table; must provide the columns ``fid`` and ``ltitle``.
            ltitle: name of the column holding the per-gene expression value.
            fid: name of the column holding the gene identifier.
            colnames: gene identifiers to keep (rows of ``exps`` are filtered to these).
            rownames: labels used as index of the returned table, one per row
                of ``cell_matrix``.
            cell_matrix: cell-type x gene weight matrix; its column count must
                equal the number of selected genes.
                NOTE(review): the gene values are ordered by sorting on ``fid``,
                so the columns presumably follow the sorted gene order -
                confirm with the caller.
            usertag: when truthy the raw product is used as score without any
                further weighting.
            abs_tag: when truthy scores are replaced by their absolute value,
                otherwise only rows with a positive score are kept.

        Returns:
            A DataFrame with a single 'Z-score' column indexed by ``rownames``
            and sorted in descending order of the raw score, or None when the
            number of selected genes does not match ``cell_matrix``.
        """

        #filter gene expressed matrix according to the markers
        gene_exps = exps.loc[:,[fid,ltitle]][exps[fid].isin(colnames)]

        # Column vector of expression values, ordered by gene identifier.
        gene_matrix = mat(gene_exps.sort_values(fid)[ltitle]).T
        # Scale every value by the mean of all selected values.
        gene_matrix = gene_matrix * np.mean(gene_matrix) ### / np.min(gene_matrix))

        if gene_matrix.shape[0] != cell_matrix.shape[1]:
            print("Error for inconsistent gene numbers, please check your expression csv for '" + fid + "'")
            return None
        
        # Number of non-zero weights per row of cell_matrix, as a column vector.
        nonzero = np.matrix(np.count_nonzero(cell_matrix,axis=1)).T
        #gene_matrix = np.ones_like(gene_matrix)
        # NOTE(review): presumably a matrix product (gene_matrix comes from
        # mat(), which looks like numpy.mat) - confirm.
        cell_deg_matrix = cell_matrix * gene_matrix

        #print("cell",cell_matrix)
        #print("gene",gene_matrix)
        #print(colnames)
        #print(rownames)

        #print(rownames)
        #exit()
        #print(gene_matrix)
        #print(cell_deg_matrix)
        #print(type(rownames))
        #a1 = "Natural killer T (NKT) cell"
        #b1 = "T cell"
        #a1 = "Macrophage"
        #b1 = "Monocyte"

        #a1 = "Mesenchymal stem cell"
        #b1 = "Fibroblast"
        #mar = cell_matrix[np.array(rownames) == a1]
        #mon = cell_matrix[np.array(rownames) == b1]
        #marz = nonzero[np.array(rownames) == a1]
        #monz = nonzero[np.array(rownames) == b1]
        #print(len(mar[np.nonzero(mar)]),len(mon[np.nonzero(mon)]))

        #print(log2(marz),log2(monz))

        #print(mar)
        #print(mon)
        #print(marz,monz)
        #print(cell_matrix,cell_matrix.shape,gene_matrix.shape)

        #print(np.std(cell_matrix,axis=1))
        #print(cell_matrix.shape,cell_deg_matrix.shape)
        # Standard deviation of the weights per row of cell_matrix, as a column vector.
        wstd = np.matrix(np.std(cell_matrix,axis=1)).T
        #print(wstd.shape,wstd,nonzero)
        if usertag:
            cell_deg_matrix = np.matrix(np.array(cell_deg_matrix))
        else:
            # NOTE(review): `.all` is referenced here but never called, so the
            # bound method is always truthy.  As written, wstd is therefore
            # always replaced by [[1]] and the first branch below always runs,
            # i.e. the log2(nonzero) weighting in the else branch is never
            # applied.  The comparison also uses wstd.shape rather than wstd.
            # Calling .all() would change every score - confirm the intended
            # weighting before touching this.
            if (wstd.shape == np.ones_like(wstd)).all:
                wstd = [[1]]
            if (nonzero == np.ones_like(nonzero)).all:
                cell_deg_matrix = np.matrix(np.array(cell_deg_matrix) * np.array(wstd))
            else:
                cell_deg_matrix = np.matrix(np.array(cell_deg_matrix) * np.array(log2(nonzero)) * np.array(wstd))

        # One score per cell type, best first.
        out = DataFrame({"Z-score":cell_deg_matrix.A1},index=rownames)
        out.sort_values(['Z-score'],inplace=True,ascending=False)
        #out.to_csv("wei.sco",sep="\t")
        #print(cell_deg_matrix,wstd,log2(nonzero))

        # The row order from the sort above is kept; only the values change.
        if abs_tag:
            out['Z-score'] = abs(out['Z-score'])
        else:
            out = out[out['Z-score'] > 0]

        #print(out)
        # Standardise the scores (ddof=1) once there are at least two rows;
        # a single row keeps its raw score.
        if (out.shape[0] > 1):
            out['Z-score'] = (out['Z-score'] - mean(out['Z-score']))/std(out['Z-score'],ddof=1)
        #print(out)

        return out


    def get_cell_gene_names(self,exps,markers,fid,gcol,ccol,tag):
        """find expressed markers according to the markers and expressed matrix."""
        #print(fid)
        whole_gsets = set(exps[fid])
        if self.args.target.lower() == "cancersea":
            #whole_fil = markers['EnsembleID'].isin(whole_gsets)
            markers['weight'] = 1
            if self.args.Gensymbol == True:
                markers[gcol] = markers['GeneName']
            else:
                markers[gcol] = markers['EnsembleID']
        whole_fil = markers[gcol].isin(whole_gsets)

        fc = markers[[ccol,gcol,'weight']][whole_fil]
        #print(whole_gsets,exps[fid],exps)
        #print(markers[gcol])
        #print(fc)
        #print(list(fc['cellName'].unique()),ccol,gcol)
        #print(whole_fil.unique())
        #exit()
        #print(markers,markers.columns)
        #print(fc)
        if fc.shape[0] == 0:
            if tag != "other":
                print("!WARNING3:Zero marker sets found, type:" + tag)
                print("!WARNING3:Change the threshold or tissue name and try again?")
                print("!WARNING3:EnsemblID or GeneID,try '-E' command?")
            return fc,None,None,whole_gsets,None
        #print("helll")
        fc.columns = [ccol,gcol,'c']
        fc.set_index([ccol,gcol])
        newfc = fc.groupby([ccol,gcol]).sum()
        #print(newfc)
        #if newfc.shape[0] <1:
        #    print(newfc.shape)
        #    print(fc)
        #    print(exps)
        names = newfc.index
        #print(names)
        #print(names)
        newfc['c1'] = names
        newfc[gcol] = newfc['c1'].apply(lambda x:x[1])
        newfc[ccol] = newfc['c1'].apply(lambda x:x[0])
        newfc.drop(['c1'],inplace=True,axis=1)
        newfc.reset_index(drop=True,inplace=True)
        #print(newfc)
        #exit()
        newfc['c'] = log2(newfc['c'] + 0.05) # * np.min(newfc['c'])
        fc = newfc
        #print("hello")
        #newfc.to_csv("wei.cls",sep="\t")
        #exit()
        #print(fc['c'][fc['c'] != 0])

        rownames = sorted(set(fc[ccol].unique()))
        rownum = len(rownames)
        colnames = sorted(set(fc[gcol].unique()))
        colnum = len(colnames)
        #print(fc.shape,fc)
        return fc,rownames,rownum,colnames,colnum

    def get_user_cell_gene_names(self,exps,fid,gcol,ccol,tag):
        """find expressed markers according to the user markers and expressed matrix."""
        #print(self.usermarkers)
        self.usermarkers.columns = [ccol,gcol,'weight']
        whole_gsets = set(exps[fid])
        whole_fil = self.usermarkers[gcol].isin(whole_gsets)

        fc = self.usermarkers[[ccol,gcol,'weight']][whole_fil]
        if fc.shape[0] == 0:
            if tag != "other":
                print("!WARNING3:Zero marker sets found, type:" + tag)
                print("!WARNING3:Change the threshold or tissue name and try again?")
                print("!WARNING3:EnsemblID or GeneID,try '-E' command?")
            return fc,None,None,whole_gsets,None
        fc.columns = [ccol,gcol,'c']
        fc.set_index([ccol,gcol])
        #print("FC",fc)
        #print("ENSG00000105369" in whole_gsets)

        newfc = fc.groupby([ccol,gcol]).sum()
        #if newfc.shape[0] <1:
        #    print(newfc.shape)
        #    print(fc)
        #    print(exps)
        names = newfc.index
        #print(names)
        #print(names)
        newfc['c1'] = names
        newfc[gcol] = newfc['c1'].apply(lambda x:x[1])
        newfc[ccol] = newfc['c1'].apply(lambda x:x[0])
        newfc.drop(['c1'],inplace=True,axis=1)
        newfc.reset_index(drop=True,inplace=True)
        #print(newfc)
        #exit()
        newfc['c'] = log2(newfc['c'] + 0.05) # * np.min(newfc['c'])
        fc = newfc
        #print("hello")
        #newfc.to_csv("wei.cls",sep="\t")
        #exit()


        rownames = sorted(set(self.usermarkers[ccol].unique()))
        rownum = len(rownames)
        colnames = sorted(set(fc[gcol].unique()))
        colnum = len(colnames)

        #print(fc,rownames,colnames)
        #print(fc.shape)
        return fc,rownames,rownum,colnames,colnum

    def get_cell_matrix(self,exps,ltitle,fid,gcol,ccol,abs_tag):
        """combine cell matrix with weight-matrix"""
        cell_value = None
        colnames = None
        if not self.args.norefdb:
            cell_value,colnames =self.get_cell_matrix_detail(exps,ltitle,fid,gcol,ccol,False,abs_tag)
        if self.args.MarkerDB != None:
            if self.args.norefdb:
                cell_value,colnames =self.get_cell_matrix_detail(exps,ltitle,fid,gcol,ccol,True,abs_tag)
            else:
                cell_value,colnames =self.get_cell_matrix_detail(exps,ltitle,fid,gcol,ccol,False,abs_tag)
                user_value,user_colnames =self.get_cell_matrix_detail(exps,ltitle,fid,gcol,ccol,True,abs_tag)
                #print("C",cell_value)
                #print("U",user_value)
                if cell_value is None:
                    if user_value is None:
                        return DataFrame(),set(colnames)
                    else:
                        cell_value = user_value
                        colnames = user_colnames
                        cell_value = cell_value.join(user_value,how="outer",lsuffix="cm",rsuffix="ur")
                        cell_value[cell_value.isna()] = 0
                        colnames = colnames | user_colnames
                else:
                    if user_value is None:
                        user_value = cell_value
                        user_colnames = colnames
                    cell_value = cell_value.join(user_value,how="outer",lsuffix="cm",rsuffix="ur")
                    cell_value[cell_value.isna()] = 0
                    colnames = colnames | user_colnames

                #else:
                #    cell_value = user_value
                #    colnames = user_colnames
        #database weight-matrix
        wm = [1]
        if self.args.MarkerDB != None:
            if self.args.norefdb:
                wm = [1]
            else:
                wm =[0.1,0.9]
        weight_matrix = mat(wm).T

        if colnames is None:
            return DataFrame(),None

        if cell_value is None:
            return DataFrame(),set(colnames)

        #print(cell_value)
        last_value = array(cell_value) * weight_matrix
        result = DataFrame({"Cell Type":cell_value.index,"Z-score":last_value.A1})
        result = result.sort_values(by="Z-score",ascending = False)
        #if self.args.target == "cancersea":
        #    result['note'] = result['Cell Type'].apply(lambda x: self.snames[x])
        return result,set(colnames)

    def get_cell_matrix_detail(self,exps,ltitle,fid,gcol,ccol,usertag,abs_tag):
        """calculate the cell type scores"""
        fc,rownames,rownum,colnames,colnum = None,None,None,None,None
        #print(self.cmarkers)
        if self.args.target == "cellmarker":
            markers = self.cmarkers
        elif self.args.target == "cancersea":
            markers = self.smarkers

        #print(markers.columns)

        if usertag:
            fc,rownames,rownum,colnames,colnum = self.get_user_cell_gene_names(exps,fid,gcol,ccol,"user_marker")
            #print("FC",fc)
            #fc['c'] = 1
        else:
            fc,rownames,rownum,colnames,colnum = self.get_cell_gene_names(exps,markers,fid,gcol,ccol,'marker')
            #print(colnames)
        #print(colnames)

        if not colnames:
            return None,None
        if fc.shape[0] == 0:
            return None,set(colnames)

        exps = exps[exps[fid].isin(colnames)]

        rowdic = dict(zip(rownames,range(rownum)))
        coldic = dict(zip(colnames,range(colnum)))
        fc_cell = fc[ccol].map(lambda x:rowdic[x])
        fc_gene = fc[gcol].map(lambda x:coldic[x])

        newdf = DataFrame({ccol:fc_cell,gcol:fc_gene,"c":fc['c']})
        cell_coo_matrix = coo_matrix((newdf['c'],(newdf[ccol],newdf[gcol])),shape=(rownum,colnum))
        cell_matrix = cell_coo_matrix.toarray()

        #print(newdf)
        #print(rownames)
        #print(colnames)
        #print(cell_matrix)

        if self.args.noprint == False:
            if usertag:
                print("User Cell Num:",rownum)
                print("User Gene Num:",colnum)
                print("User Not Zero:",cell_coo_matrix.count_nonzero())
            else:
                print("Cell Num:",rownum)
                print("Gene Num:",colnum)
                print("Not Zero:",cell_coo_matrix.count_nonzero())
        cell_values = self.get_exp_matrix_loop(exps,ltitle,fid,colnames,rownames,cell_matrix,usertag,abs_tag)
        #print(cell_values)
        return cell_values,set(colnames)


    def read_user_markers(self,colname):
        """usermarker db preparation"""
        if self.args.MarkerDB != None:
            if not os.path.exists(self.args.MarkerDB):
                print("User marker database does not exists!",self.args.MarkerDB)
                sys.exit(0)
            self.usermarkers = read_csv(self.args.MarkerDB,sep="\t",header=None)
            self.usermarkers.columns=['cellName',colname]
            #self.hgncs_ensem = dict(zip(self.ensem_hgncs.values(),self.ensem_hgncs.keys()))
            if colname == "ensemblID":
                self.usermarkers[colname] = self.usermarkers[colname].map(lambda x:self.hgncs_ensem[x] if x in self.hgncs_ensem else x)
            self.usermarkers['weight'] = 1
            if self.args.noprint == False:
                print("User cells:", len(self.usermarkers['cellName'].unique()))
                print("User genes:", len(self.usermarkers[colname].unique()))

    def load_pickle_module(self,db):
        """read whole database"""
        handler = gzip.open(db,"rb")
        self.gos = load(handler)
        self.human_gofs = load(handler)
        self.mouse_gofs = load(handler)
        self.cmarkers = load(handler)
        self.smarkers = load(handler)
        self.snames = load(handler)
        self.ensem_hgncs = load(handler)
        self.ensem_mouse = load(handler)
        self.hgncs_ensem = dict(zip(self.ensem_hgncs.values(),self.ensem_hgncs.keys()))
        fil = []
        #fil = ['Cancer stem cell', 'Cancer cell']
        #print(self.cmarkers)
        #exit()
        self.cmarkers = self.cmarkers[~self.cmarkers['cellName'].isin(fil)]

        #if self.args.noprint == False:
        print("DB load:",len(self.gos),len(self.human_gofs),len(self.mouse_gofs),len(self.cmarkers),len(self.ensem_hgncs))

    def read_tissues_species(self,tissue="All",species="Human",celltype="normal"):
        """read markers according to certain tissue and certain species"""
        species = species.lower().capitalize()
        ct = celltype.lower().capitalize()
        if tissue != "All":
            self.cmarkers = self.cmarkers[self.cmarkers['tissueType'].isin([tissue])]
        if ct == "Normal":
            self.cmarkers = self.cmarkers[self.cmarkers['cellType']=="Normal cell"]
        elif ct == "Cancer":
            self.cmarkers = self.cmarkers[self.cmarkers['cellType']=="Cancer cell"]
        else:
            print("Illegal celltype. Please use \"[Normal] or [Cancer] instead.")
            exit(0)


        #self.cmarkers = self.cmarkers[self.cmarkers['cellName']!="Mesenchymal stem cell"]
        print("load markers:",len(self.cmarkers))
        self.cmarkers = self.cmarkers[self.cmarkers['speciesType'].isin([species])]
        #print(self.cmarkers)

    def get_list_tissue(self,species):
        """print tissue names"""
        species = species.lower().capitalize()
        cmarkers = self.cmarkers[self.cmarkers['speciesType'].isin([species])]
        names = list(sorted(cmarkers['tissueType'].unique()))
        print("#" * 120)
        print("-" * 120)
        print("{0:s}{1:<10s}{2:>5s}{3:<10d}".format("Species:",species,"Num:",len(names)))
        print("-" * 120)
        for i in range(0,len(names)-2,3):
            if len(names) < i + 1:
                s = "{0:3d}: {1:<40s}".format(i+1,names[i])
            elif len(names) < i + 2:
                s = "{0:3d}: {1:<35s}{2:3d}: {3:<35s}".format(i+1,names[i],i+2,names[i+1])
            else:
                s = "{0:3d}: {1:<35s}{2:3d}: {3:<35s}{4:3d}: {5:<35s}".format(i+1,names[i],i+2,names[i+1],i+3,names[i+2])
            print(s)
        print("#" * 120)




    def run_detail_cmd(self):
        """main command"""
        #self.check_db()
        if not os.path.exists(self.args.input):
            tempname = "./" + self.args.input
            if not os.path.exists(tempname):
                print(tempname)
                print("Input file does not exists!",self.args.input)
                sys.exit(0)
        print(self.args)
        if self.args.source.lower() == "cellranger":
            self.load_pickle_module(self.args.db)
            if self.args.species == "Mouse":
                self.ensem_hgncs = self.ensem_mouse
                self.human_gofs = self.mouse_gofs
            self.read_tissues_species(self.args.tissue,self.args.species,self.args.celltype)
            if self.args.Gensymbol:
                self.read_user_markers('Gene ID')
            else:
                self.read_user_markers('ensemblID')
            outs = self.calcu_cellranger_group(self.args.input,self.args.Gensymbol)
            return outs
        elif args.source.lower() == "seurat":
            self.load_pickle_module(self.args.db)
            if self.args.species == "Mouse":
                self.ensem_hgncs = self.ensem_mouse
                self.human_gofs = self.mouse_gofs
            self.read_tissues_species(self.args.tissue,self.args.species,self.args.celltype)
            if self.args.Gensymbol:
                self.read_user_markers('gene')
            else:
                self.read_user_markers('ensemblID')
            outs = self.calcu_seurat_group(self.args.input,self.args.Gensymbol)
            return outs
        elif args.source.lower() == "scanpy":
            self.load_pickle_module(self.args.db)
            if self.args.species == "Mouse":
                self.ensem_hgncs = self.ensem_mouse
                self.human_gofs = self.mouse_gofs
            self.read_tissues_species(self.args.tissue,self.args.species,self.args.celltype)
            if self.args.Gensymbol:
                self.read_user_markers('gene')
            else:
                self.read_user_markers('ensemblID')
            outs = self.calcu_scanpy_group(self.args.input,self.args.Gensymbol)
            return outs
            pass