def mageckmle_main(pvargs=None, parsedargs=None, returndict=False):
    '''
    Main entry for MAGeCK MLE.

    ----
    Parameters:
      pvargs
        Arguments for parsing; handed to mageckmle_parseargs() when
        parsedargs is None.
      parsedargs
        An already-parsed argument object. When given, pvargs is ignored.
      returndict
        If set true, will not try to run the whole prediction process, but
        will return after mean variance modeling.

    Returns:
      (allgenedict, mrm, size_f) when returndict is true; otherwise
      (allgenedict, mrm), after the gene and sgRNA summary files have been
      written. mrm is None when args.genes_varmodeling is not positive.
    '''
    # parsing arguments
    if parsedargs is not None:
        args = parsedargs
    else:
        args = mageckmle_parseargs(pvargs)
    args = mageckmle_postargs(args)

    # Bayes module
    if hasattr(args, 'bayes') and args.bayes:
        from mlemageck_bayes import mageck_bayes_main
        sys.exit(0)  # comment this when you think bayes module is completed
        mageck_bayes_main(parsedargs=args)
        sys.exit(0)

    # NOTE(review): scipy, nbinom, linalg, sgrna_eff_initial_guess and
    # sgrna_wide_dispersion_estimation_MAP_v2 are not referenced by the live
    # code below; the imports are kept so import-time behavior is unchanged.
    import scipy
    from scipy.stats import nbinom
    import numpy as np
    import numpy.linalg as linalg
    from mageck.mleinstanceio import read_gene_from_file, write_gene_to_file, write_sgrna_to_file
    from mageck.mleem import iteratenbem
    from mageck.mlemeanvar import MeanVarModel
    from mageck.mageckCount import normalizeCounts
    from mageck.mlesgeff import read_sgrna_eff, sgrna_eff_initial_guess
    from dispersion_characterization import sgrna_wide_dispersion_estimation_MAP_v2
    from mageck.mlemultiprocessing import runem_multiproc, iteratenbem_permutation

    # main process
    maxfittinggene = args.genes_varmodeling
    maxgene = np.inf

    # reading sgRNA efficiency
    read_sgrna_eff(args)

    # reading read count table
    allgenedict = read_gene_from_file(args.count_table, includesamples=args.include_samples)
    # sgrna_eff_initial_guess(args, allgenedict)

    # calculate the size factor: collect one row of counts per sgRNA id
    cttab_sel = {}
    for gk in allgenedict.values():
        sgreadmat = gk.nb_count.getT().tolist()
        for i, sgid in enumerate(gk.sgrnaid):
            cttab_sel[sgid] = sgreadmat[i]
    if hasattr(args, 'norm_method'):
        if args.norm_method != 'none':
            size_f = normalizeCounts(cttab_sel, method=args.norm_method, returnfactor=True,
                                     reversefactor=True, controlsgfile=args.control_sgrna)
        else:
            size_f = None
    else:
        size_f = normalizeCounts(cttab_sel, returnfactor=True, reversefactor=True)
    # identity test: an equality test against None is unsafe for array-like values
    if size_f is not None:
        logging.info('size factor: ' + ','.join([str(x) for x in size_f]))

    # desmat=np.matrix([[1,1,1,1],[0,0,1,0],[0,0,0,1]]).getT()
    desmat = args.design_matrix
    for tginst in allgenedict.values():
        tginst.design_mat = desmat

    # first pass: run the EM on a subset of genes for mean-variance fitting
    ngene = 0
    meanvardict = {}
    for (tgid, tginst) in allgenedict.items():
        ngene += 1
        tginst.w_estimate = []
        meanvardict[tgid] = tginst
        if ngene > maxfittinggene:
            break
    argsdict = {
        'debug': False,
        'alpha_val': 0.01,
        'estimateeff': False,
        'size_factor': size_f
    }
    runem_multiproc(meanvardict, args, nproc=args.threads, argsdict=argsdict)
    # store the fitted instances back into the full gene dictionary
    for (tgid, tginst) in meanvardict.items():
        allgenedict[tgid] = tginst

    # model the mean and variance
    logging.info('Modeling the mean and variance ...')
    if maxfittinggene > 0:
        mrm = MeanVarModel()
        # old: linear model
        mrm.get_mean_var_residule(allgenedict)
        mrm.model_mean_var_by_lm()
        # new method: generalized linear model
        # mrm.model_mean_disp_by_glm(allgenedict,args.output_prefix,size_f)
    else:
        mrm = None

    if returndict:
        return (allgenedict, mrm, size_f)

    # run the test again...
    logging.info('Run the algorithm for the second time ...')
    if hasattr(args, 'threads') and args.threads > 1:
        # multiple threads
        argsdict = {
            'debug': False,
            'estimateeff': True,
            'meanvarmodel': mrm,
            'restart': False,
            'removeoutliers': args.remove_outliers,
            'size_factor': size_f,
            'updateeff': args.update_efficiency,
            'logem': False
        }
        runem_multiproc(allgenedict, args, nproc=args.threads, argsdict=argsdict)
    else:
        # only 1 thread
        ngene = 0
        for (tgid, tginst) in allgenedict.items():
            if ngene % 1000 == 1 or args.debug:
                logging.info('Calculating ' + tgid + ' (' + str(ngene) + ') ... ')
            # when a debug gene is requested, skip every other gene
            if hasattr(args, 'debug_gene') and args.debug_gene is not None and tginst.prefix != args.debug_gene:
                continue
            iteratenbem(tginst, debug=False, estimateeff=True, meanvarmodel=mrm, restart=False,
                        removeoutliers=args.remove_outliers, size_factor=size_f,
                        updateeff=args.update_efficiency)
            ngene += 1
            if ngene > maxgene:
                break

    # set up the w vector: default to all ones where nothing was estimated
    for tginst in allgenedict.values():
        if len(tginst.w_estimate) == 0:
            tginst.w_estimate = np.ones(len(tginst.sgrnaid))

    # permutation
    iteratenbem_permutation(allgenedict, args, nround=args.permutation_round,
                            removeoutliers=args.remove_outliers, size_factor=size_f)

    # correct for FDR
    from mageck.mleclassdef import gene_fdr_correction
    gene_fdr_correction(allgenedict, args.adjust_method)

    # write to file
    genefile = args.output_prefix + '.gene_summary.txt'
    sgrnafile = args.output_prefix + '.sgrna_summary.txt'
    logging.info('Writing gene results to ' + genefile)
    logging.info('Writing sgRNA results to ' + sgrnafile)
    write_gene_to_file(allgenedict, genefile, betalabels=args.beta_labels)
    write_sgrna_to_file(allgenedict, sgrnafile)
    return (allgenedict, mrm)
def mageckmle_main(pvargs=None, parsedargs=None, returndict=False):
    '''
    Main entry for MAGeCK MLE.

    ----
    Parameters:
      pvargs
        Arguments for parsing; handed to mageckmle_parseargs() when
        parsedargs is None.
      parsedargs
        An already-parsed argument object. When given, pvargs is ignored.
      returndict
        If set true, will not try to run the whole prediction process, but
        will return after mean variance modeling.

    Returns:
      (allgenedict, mrm, size_f) when returndict is true; otherwise
      (allgenedict, mrm), after the gene and sgRNA summary files have been
      written. mrm is None when args.genes_varmodeling is not positive.
    '''
    # parsing arguments
    if parsedargs is not None:
        args = parsedargs
    else:
        args = mageckmle_parseargs(pvargs)
    args = mageckmle_postargs(args)

    # Bayes module
    if hasattr(args, 'bayes') and args.bayes:
        from mlemageck_bayes import mageck_bayes_main
        sys.exit(0)  # comment this when you think bayes module is completed
        mageck_bayes_main(parsedargs=args)
        sys.exit(0)

    # NOTE(review): scipy, nbinom, linalg, sgrna_eff_initial_guess and
    # sgrna_wide_dispersion_estimation_MAP_v2 are not referenced by the live
    # code below; the imports are kept so import-time behavior is unchanged.
    import scipy
    from scipy.stats import nbinom
    import numpy as np
    import numpy.linalg as linalg
    from mageck.mleinstanceio import read_gene_from_file, write_gene_to_file, write_sgrna_to_file
    from mageck.mleem import iteratenbem
    from mageck.mlemeanvar import MeanVarModel
    from mageck.mageckCount import normalizeCounts
    from mageck.mlesgeff import read_sgrna_eff, sgrna_eff_initial_guess
    from dispersion_characterization import sgrna_wide_dispersion_estimation_MAP_v2
    from mageck.mlemultiprocessing import runem_multiproc, iteratenbem_permutation

    # main process
    maxfittinggene = args.genes_varmodeling
    maxgene = np.inf

    # reading sgRNA efficiency
    read_sgrna_eff(args)

    # reading read count table
    allgenedict = read_gene_from_file(args.count_table, includesamples=args.include_samples)
    # sgrna_eff_initial_guess(args,allgenedict)

    # calculate the size factor: collect one row of counts per sgRNA id
    cttab_sel = {}
    for gk in allgenedict.values():
        sgreadmat = gk.nb_count.getT().tolist()
        for i, sgid in enumerate(gk.sgrnaid):
            cttab_sel[sgid] = sgreadmat[i]
    if hasattr(args, 'norm_method'):
        if args.norm_method != 'none':
            size_f = normalizeCounts(cttab_sel, method=args.norm_method, returnfactor=True,
                                     reversefactor=True, controlsgfile=args.control_sgrna)
        else:
            size_f = None
    else:
        size_f = normalizeCounts(cttab_sel, returnfactor=True, reversefactor=True)
    # identity test: an equality test against None is unsafe for array-like values
    if size_f is not None:
        logging.info('size factor: ' + ','.join([str(x) for x in size_f]))

    # desmat=np.matrix([[1,1,1,1],[0,0,1,0],[0,0,0,1]]).getT()
    desmat = args.design_matrix
    for tginst in allgenedict.values():
        tginst.design_mat = desmat

    # first pass: run the EM on a subset of genes for mean-variance fitting
    ngene = 0
    meanvardict = {}
    for (tgid, tginst) in allgenedict.items():
        ngene += 1
        tginst.w_estimate = []
        meanvardict[tgid] = tginst
        if ngene > maxfittinggene:
            break
    argsdict = {'debug': False, 'alpha_val': 0.01, 'estimateeff': False, 'size_factor': size_f}
    runem_multiproc(meanvardict, args, nproc=args.threads, argsdict=argsdict)
    # store the fitted instances back into the full gene dictionary
    for (tgid, tginst) in meanvardict.items():
        allgenedict[tgid] = tginst

    # model the mean and variance
    logging.info('Modeling the mean and variance ...')
    if maxfittinggene > 0:
        mrm = MeanVarModel()
        # old: linear model
        mrm.get_mean_var_residule(allgenedict)
        mrm.model_mean_var_by_lm()
        # new method: generalized linear model
        # mrm.model_mean_disp_by_glm(allgenedict,args.output_prefix,size_f)
    else:
        mrm = None

    if returndict:
        return (allgenedict, mrm, size_f)

    # run the test again...
    logging.info('Run the algorithm for the second time ...')
    if hasattr(args, 'threads') and args.threads > 1:
        # multiple threads
        argsdict = {'debug': False, 'estimateeff': True, 'meanvarmodel': mrm, 'restart': False,
                    'removeoutliers': args.remove_outliers, 'size_factor': size_f,
                    'updateeff': args.update_efficiency, 'logem': False}
        runem_multiproc(allgenedict, args, nproc=args.threads, argsdict=argsdict)
    else:
        # only 1 thread
        ngene = 0
        for (tgid, tginst) in allgenedict.items():
            if ngene % 1000 == 1 or args.debug:
                logging.info('Calculating ' + tgid + ' (' + str(ngene) + ') ... ')
            # when a debug gene is requested, skip every other gene
            if hasattr(args, 'debug_gene') and args.debug_gene is not None and tginst.prefix != args.debug_gene:
                continue
            iteratenbem(tginst, debug=False, estimateeff=True, meanvarmodel=mrm, restart=False,
                        removeoutliers=args.remove_outliers, size_factor=size_f,
                        updateeff=args.update_efficiency)
            ngene += 1
            if ngene > maxgene:
                break

    # set up the w vector: default to all ones where nothing was estimated
    for tginst in allgenedict.values():
        if len(tginst.w_estimate) == 0:
            tginst.w_estimate = np.ones(len(tginst.sgrnaid))

    # permutation
    iteratenbem_permutation(allgenedict, args, nround=args.permutation_round,
                            removeoutliers=args.remove_outliers, size_factor=size_f)

    # correct for FDR
    from mageck.mleclassdef import gene_fdr_correction
    gene_fdr_correction(allgenedict, args.adjust_method)

    # write to file
    genefile = args.output_prefix + '.gene_summary.txt'
    sgrnafile = args.output_prefix + '.sgrna_summary.txt'
    logging.info('Writing gene results to ' + genefile)
    logging.info('Writing sgRNA results to ' + sgrnafile)
    write_gene_to_file(allgenedict, genefile, betalabels=args.beta_labels)
    write_sgrna_to_file(allgenedict, sgrnafile)
    return (allgenedict, mrm)
def mageckmle_main(pvargs=None, parsedargs=None, returndict=False):
    '''
    Main entry for MAGeCK MLE.

    ----
    Parameters:
      pvargs
        Arguments for parsing; handed to mageckmle_parseargs() when
        parsedargs is None.
      parsedargs
        An already-parsed argument object. When given, pvargs is ignored.
      returndict
        If set true, will not try to run the whole prediction process, but
        will return after mean variance modeling.

    Returns:
      (allgenedict, mrm, size_f) when returndict is true; otherwise
      (allgenedict, mrm), after the gene and sgRNA summary files have been
      written. mrm is None when args.genes_varmodeling is not positive.
    '''
    # parsing arguments
    if parsedargs is not None:
        args = parsedargs
    else:
        args = mageckmle_parseargs(pvargs)
    args = mageckmle_postargs(args)

    # Bayes module
    if hasattr(args, 'bayes') and args.bayes:
        from mlemageck_bayes import mageck_bayes_main
        sys.exit(0)  # comment this when you think bayes module is completed
        mageck_bayes_main(parsedargs=args)
        sys.exit(0)

    # NOTE(review): scipy, nbinom, linalg, sgrna_eff_initial_guess,
    # sgrna_wide_dispersion_estimation_MAP_v2 and match_sgrnaCN are not
    # referenced by the live code below; the imports are kept so import-time
    # behavior is unchanged.
    import scipy
    from scipy.stats import nbinom
    import numpy as np
    import numpy.linalg as linalg
    from mageck.mleinstanceio import read_gene_from_file, write_gene_to_file, write_sgrna_to_file
    from mageck.mleem import iteratenbem
    from mageck.mlemeanvar import MeanVarModel
    from mageck.mageckCount import normalizeCounts
    from mageck.mlesgeff import read_sgrna_eff, sgrna_eff_initial_guess
    from mageck.dispersion_characterization import sgrna_wide_dispersion_estimation_MAP_v2
    from mageck.mlemultiprocessing import runem_multiproc, iteratenbem_permutation, iteratenbem_permutation_by_nsg
    from mageck.cnv_normalization import read_CNVdata, match_sgrnaCN, betascore_piecewisenorm
    from mageck.cnv_estimation import mageckmleCNVestimation

    # main process
    maxfittinggene = args.genes_varmodeling
    maxgene = np.inf

    # reading sgRNA efficiency
    read_sgrna_eff(args)

    # reading read count table
    allgenedict = read_gene_from_file(args.count_table, includesamples=args.include_samples)
    # sgrna_eff_initial_guess(args, allgenedict)

    # calculate the size factor: collect one row of counts per sgRNA id
    cttab_sel = {}
    for gk in allgenedict.values():
        sgreadmat = gk.nb_count.getT().tolist()
        for i, sgid in enumerate(gk.sgrnaid):
            cttab_sel[sgid] = sgreadmat[i]
    if hasattr(args, 'norm_method'):
        if args.norm_method != 'none':
            size_f = normalizeCounts(cttab_sel, method=args.norm_method, returnfactor=True,
                                     reversefactor=True, controlsgfile=args.control_sgrna)
        else:
            size_f = None
    else:
        size_f = normalizeCounts(cttab_sel, returnfactor=True, reversefactor=True)
    # identity test: an equality test against None is unsafe for array-like values
    if size_f is not None:
        logging.info('size factor: ' + ','.join([str(x) for x in size_f]))

    # desmat=np.matrix([[1,1,1,1],[0,0,1,0],[0,0,0,1]]).getT()
    desmat = args.design_matrix
    for tginst in allgenedict.values():
        tginst.design_mat = desmat

    # perform copy number estimation if option selected
    if args.cnv_est is not None:
        logging.info('Performing copy number estimation...')
        # organize sgRNA-gene pairing into dictionary
        sgrna2genelist = {
            sgrna: gene
            for gene in allgenedict
            for sgrna in allgenedict[gene].sgrnaid
        }
        # estimate CNV and write results to file
        mageckmleCNVestimation(args.cnv_est, cttab_sel, desmat, sgrna2genelist,
                               args.beta_labels[1:], args.output_prefix)

    # run the EM for a few genes to perform gene fitting process
    ngene = 0
    meanvardict = {}
    for (tgid, tginst) in allgenedict.items():
        ngene += 1
        tginst.w_estimate = []
        meanvardict[tgid] = tginst
        if ngene > maxfittinggene:
            break
    argsdict = {
        'debug': False,
        'alpha_val': 0.01,
        'estimateeff': False,
        'size_factor': size_f
    }
    runem_multiproc(meanvardict, args, nproc=args.threads, argsdict=argsdict)
    # store the fitted instances back into the full gene dictionary
    for (tgid, tginst) in meanvardict.items():
        allgenedict[tgid] = tginst

    # model the mean and variance
    logging.info('Modeling the mean and variance ...')
    if maxfittinggene > 0:
        mrm = MeanVarModel()
        # old: linear model
        mrm.get_mean_var_residule(allgenedict)
        mrm.model_mean_var_by_lm()
        # new method: generalized linear model
        # mrm.model_mean_disp_by_glm(allgenedict,args.output_prefix,size_f)
    else:
        mrm = None

    if returndict:
        return (allgenedict, mrm, size_f)

    # run the test again...
    logging.info('Run the algorithm for the second time ...')
    if hasattr(args, 'threads') and args.threads > 0:
        # multiple threads
        argsdict = {
            'debug': False,
            'estimateeff': True,
            'meanvarmodel': mrm,
            'restart': False,
            'removeoutliers': args.remove_outliers,
            'size_factor': size_f,
            'updateeff': args.update_efficiency,
            'logem': False
        }
        runem_multiproc(allgenedict, args, nproc=args.threads, argsdict=argsdict)
    else:
        # only 1 thread
        # the following codes should be merged to the above code section
        ngene = 0
        for (tgid, tginst) in allgenedict.items():
            if ngene % 1000 == 1 or args.debug:
                logging.info('Calculating ' + tgid + ' (' + str(ngene) + ') ... ')
            # when a debug gene is requested, skip every other gene
            if hasattr(args, 'debug_gene') and args.debug_gene is not None and tginst.prefix != args.debug_gene:
                continue
            iteratenbem(tginst, debug=False, estimateeff=True, meanvarmodel=mrm, restart=False,
                        removeoutliers=args.remove_outliers, size_factor=size_f,
                        updateeff=args.update_efficiency)
            ngene += 1
            if ngene > maxgene:
                break

    # set up the w vector: default to all ones where nothing was estimated
    for tginst in allgenedict.values():
        if len(tginst.w_estimate) == 0:
            tginst.w_estimate = np.ones(len(tginst.sgrnaid))

    # permutation, either by group or together
    if args.no_permutation_by_group:
        iteratenbem_permutation(allgenedict, args, nround=args.permutation_round,
                                removeoutliers=args.remove_outliers, size_factor=size_f)
    else:
        iteratenbem_permutation_by_nsg(allgenedict, args, size_f=size_f)

    # correct for FDR
    from mageck.mleclassdef import gene_fdr_correction
    gene_fdr_correction(allgenedict, args.adjust_method)

    # correct for CNV
    if args.cnv_norm is not None or args.cnv_est is not None:
        # labels after the first entry of beta_labels; sliced once and reused
        sample_labels = args.beta_labels[1:]
        if args.cnv_norm is not None:
            # get copy number data from external copy number dataset
            logging.info('Performing copy number normalization.')
            (CN_arr, CN_celldict, CN_genedict) = read_CNVdata(args.cnv_norm, sample_labels)
            # do not select only subset of genes to correct (i.e. correct all genes)
            genes2correct = False
        elif args.cnv_est is not None:
            # get copy number data from copy number estimates calculated earlier
            logging.info('Performing copy number normalization using copy number estimates.')
            (CN_arr, CN_celldict, CN_genedict) = read_CNVdata(
                str(args.output_prefix) + 'CNVestimates.txt', sample_labels)
            # NOTE(review): highestCNVgenes is not imported inside this
            # function; it has to be available at module level -- confirm.
            genes2correct = highestCNVgenes(CN_arr, CN_genedict, percentile=98)
        # report which labels have copy number data available
        for label in sample_labels:
            if label not in CN_celldict:
                logging.warning(label + ' is not represented in the inputted copy number variation data.')
            else:
                logging.info('Normalizing by copy number with ' + label + ' as the reference cell line.')
        betascore_piecewisenorm(allgenedict, args.beta_labels, CN_arr, CN_celldict,
                                CN_genedict, selectGenes=genes2correct)

    # write to file
    genefile = args.output_prefix + '.gene_summary.txt'
    sgrnafile = args.output_prefix + '.sgrna_summary.txt'
    logging.info('Writing gene results to ' + genefile)
    logging.info('Writing sgRNA results to ' + sgrnafile)
    write_gene_to_file(allgenedict, genefile, args, betalabels=args.beta_labels)
    write_sgrna_to_file(allgenedict, sgrnafile)
    return (allgenedict, mrm)