def runBase(self, k_neighbors):
    """Store an RMS value for every sample and return the classification.

    Each strain from ``self.getStrains()`` is initialised unshuffled
    (``shuffle=False``).  Then, for every pathway and every ordered
    (base, comparison) allele pair, each ``(age, sample)`` entry of the base
    group is matched against the comparison group with ``self.kNearest``, a
    template is built by ``dirac.getRT`` from the neighbours' columns of the
    pathway's SRTs, and ``self.getRMS(template, srts[sample])`` is stored
    through ``self.setRMS`` under the row label
    ``"<pathway>_<comparison allele>"``.

    Args:
        k_neighbors: Passed to ``self.kNearest`` as its last argument
            (presumably the number of neighbours to select).

    Returns:
        The result of ``self.classify()``, called once after all strains
        have been processed.
    """
    logging.info("running base")
    for cstrain in self.getStrains():
        logging.info("Strain[%s] starting", cstrain)
        mypws = self.getMyPathways()
        alleles = self.getAlleles(cstrain)
        self.initStrain(cstrain, mypws, shuffle=False)
        for pw in mypws:
            # Templates are built from this pathway's SRTs, so the cache
            # must not outlive the pathway.
            rt_cache = {}
            srts = self.genSRTs(cstrain, pw)
            for a_base, a_compare in itertools.product(alleles, alleles):
                r_index = "%s_%s" % (pw, a_compare)
                base_samp = self.getSamplesByAllele(cstrain, a_base)
                comp_samp = self.getSamplesByAllele(cstrain, a_compare)
                for age, samp in base_samp:
                    neighbors = self.kNearest(comp_samp, samp, age,
                                              k_neighbors)
                    # Key on the tuple of names: concatenating them lets
                    # different lists collide (['ab', 'c'] vs ['a', 'bc']).
                    cache_key = tuple(neighbors)
                    try:
                        rt = rt_cache[cache_key]
                    except KeyError:
                        rt = dirac.getRT(srts.loc[:, neighbors])
                        rt_cache[cache_key] = rt
                    rms = self.getRMS(rt, srts[samp])
                    self.setRMS(rms, r_index, samp)
    return self.classify()
def runBase(self, k_neighbors):
    """Store an RMS value for every sample and return the classification.

    Each strain from ``self.getStrains()`` is initialised unshuffled
    (``shuffle=False``).  Then, for every pathway and every ordered
    (base, comparison) allele pair, each ``(age, sample)`` entry of the base
    group is matched against the comparison group with ``self.kNearest``, a
    template is built by ``dirac.getRT`` from the neighbours' columns of the
    pathway's SRTs, and ``self.getRMS(template, srts[sample])`` is stored
    through ``self.setRMS`` under the row label
    ``"<pathway>_<comparison allele>"``.

    Args:
        k_neighbors: Passed to ``self.kNearest`` as its last argument
            (presumably the number of neighbours to select).

    Returns:
        The result of ``self.classify()``, called once after all strains
        have been processed.
    """
    logging.info("running base")
    for cstrain in self.getStrains():
        logging.info("Strain[%s] starting", cstrain)
        mypws = self.getMyPathways()
        alleles = self.getAlleles(cstrain)
        self.initStrain(cstrain, mypws, shuffle=False)
        for pw in mypws:
            # Templates are built from this pathway's SRTs, so the cache
            # must not outlive the pathway.
            rt_cache = {}
            srts = self.genSRTs(cstrain, pw)
            for a_base, a_compare in itertools.product(alleles, alleles):
                r_index = "%s_%s" % (pw, a_compare)
                base_samp = self.getSamplesByAllele(cstrain, a_base)
                comp_samp = self.getSamplesByAllele(cstrain, a_compare)
                for age, samp in base_samp:
                    neighbors = self.kNearest(comp_samp, samp, age,
                                              k_neighbors)
                    # Key on the tuple of names: concatenating them lets
                    # different lists collide (['ab', 'c'] vs ['a', 'bc']).
                    cache_key = tuple(neighbors)
                    try:
                        rt = rt_cache[cache_key]
                    except KeyError:
                        rt = dirac.getRT(srts.loc[:, neighbors])
                        rt_cache[cache_key] = rt
                    rms = self.getRMS(rt, srts[samp])
                    self.setRMS(rms, r_index, samp)
    return self.classify()
def genRMS(comm, sd, mi, k_neighbors):
    """Compute per-sample RMS values for this rank's share of the pathways.

    Rank 0 reads the pathway list from ``sd`` and the strain list from
    ``mi``; both are broadcast to every rank.  Pathways are dealt out
    round-robin (``index % comm.size == comm.rank``).  For every strain,
    owned pathway and ordered (base, comparison) allele pair, each
    ``(age, sample)`` entry of the base group is matched against the
    comparison group with ``kNearest``; ``dirac.getRT`` builds a template
    from the neighbours' columns and ``dirac.getRMS`` scores the sample
    against it.

    Args:
        comm: Communicator exposing ``rank``, ``size``, ``bcast`` and
            ``barrier`` (presumably an mpi4py communicator).
        sd: Object providing ``getPathways()``.
        mi: Object providing ``getStrains()``, ``getNominalAlleles()`` and
            ``getSampleIDs()``; also forwarded to
            ``partitionSamplesByAllele``.
        k_neighbors: Passed to ``kNearest`` as its last argument.

    Returns:
        A DataFrame with rows ``"<pathway>_<allele>"`` and one column per
        sample id.  The frame is rebuilt for every strain, so only the LAST
        strain's table is returned; ``None`` when the strain list is empty.
    """
    # Only the root rank queries the data sources, but every rank must bind
    # both names before handing them to the collective bcast.
    pws = None
    strain_list = None
    if comm.rank == 0:
        pws = sd.getPathways()
        strain_list = mi.getStrains()
    pws = comm.bcast(pws)
    strain_list = comm.bcast(strain_list)

    # The pathway share depends only on the rank, not on the strain.
    mypws = [pw for i, pw in enumerate(pws) if i % comm.size == comm.rank]

    results = None
    for cstrain in strain_list:
        logging.info('Starting strain [%s]', cstrain)
        alleles = mi.getNominalAlleles(cstrain)
        indexes = ["%s_%s" % (pw, allele)
                   for pw, allele in itertools.product(mypws, alleles)]
        sample_ids = mi.getSampleIDs(cstrain)
        # Preallocate the results table.  np.empty leaves the cells
        # uninitialised; they are overwritten below for every sample that
        # partitionSamplesByAllele returns.
        results = pandas.DataFrame(
            np.empty((len(indexes), len(sample_ids)), dtype=float),
            index=indexes, columns=sample_ids)
        for pw in mypws:
            # Partition samples by strain/allele.
            by_allele = partitionSamplesByAllele(alleles, mi, cstrain)
            # Generate pathway SRTs for all samples, partitioned the same way.
            srts = getSRTSByAllele(alleles, pw, by_allele)
            for allele_base in alleles:
                for allele_compare in alleles:
                    r_index = "%s_%s" % (pw, allele_compare)
                    # Samples carrying the comparison allele.
                    compare_list = by_allele[allele_compare]
                    for age, samp in by_allele[allele_base]:
                        samp_compare = kNearest(compare_list, samp, age,
                                                k_neighbors)
                        comp_exp = srts[allele_compare].loc[:, samp_compare]
                        rt = dirac.getRT(comp_exp)
                        # Single .loc write: results[samp][r_index] = ...
                        # is chained assignment and may hit a copy.
                        results.loc[r_index, samp] = dirac.getRMS(
                            srts[allele_base][samp], rt)
    comm.barrier()
    return results
def runPerm(self, num_runs, k_neighbors, truth):
    """Repeat the scoring with shuffling enabled and count runs that reach ``truth``.

    Every run re-initialises each strain with ``shuffle=True``, recomputes
    and stores the RMS values exactly as ``runBase`` does, and calls
    ``self.classify()``.  For every key and index label of the
    classification, the counter is incremented when
    ``truth[key][label] <= classified[key][label]``.

    Args:
        num_runs: Number of shuffled runs.
        k_neighbors: Passed to ``self.kNearest`` as its last argument.
        truth: Mapping of key -> labelled vector (needs ``copy``, ``index``,
            item access and ``to_pickle``; presumably pandas objects).

    Returns:
        dict with the keys of ``truth``; each value is a copy of the
        corresponding ``truth`` entry holding the per-label counts.

    Raises:
        AssertionError: if a run yields a sample group whose members were
            all present in the previous run's group (shuffle sanity check),
            or if a counter exceeds the number of completed runs.
    """
    # One zeroed counter vector per truth entry, carrying the same labels.
    # ``items()`` works on both Python 2 and 3 (``iteritems`` is 2-only).
    c_results = {}
    for key, observed in truth.items():
        counts = observed.copy()
        for label in observed.index:
            counts[label] = 0
        c_results[key] = counts

    # Only the previous run's groups are ever compared against, so there is
    # no need to keep the groups of every run alive.
    previous = None
    for ctr in range(num_runs):
        current = {}
        check = True
        for cstrain in self.getStrains():
            logging.info("Strain[%s] starting", cstrain)
            mypws = self.getMyPathways()
            alleles = self.getAlleles(cstrain)
            self.initStrain(cstrain, mypws, shuffle=True)
            for pw in mypws:
                # Templates are built from this pathway's SRTs, so the
                # cache must not outlive the pathway.
                rt_cache = {}
                srts = self.genSRTs(cstrain, pw)
                for a_base, a_compare in itertools.product(alleles, alleles):
                    r_index = "%s_%s" % (pw, a_compare)
                    base_samp = self.getSamplesByAllele(cstrain, a_base)

                    # Shuffle sanity check: at least one member of this
                    # group must be new compared with the previous run.
                    group = (pw, cstrain, a_base)
                    current[group] = base_samp
                    if previous is not None:
                        earlier = previous[group]
                        same = all(x in earlier for x in base_samp)
                        assert not same, (
                            'In runperm\n'
                            + ''.join(map(str, base_samp))
                            + '\n and \n '
                            + ''.join(map(str, earlier)))

                    if check and self._comm.rank == 0:
                        # Same output as the Python 2 print statement, and
                        # valid syntax on Python 3 as well.
                        print("%s %s %s" % (ctr, a_base, base_samp[:5]))

                    comp_samp = self.getSamplesByAllele(cstrain, a_compare)
                    for age, samp in base_samp:
                        neighbors = self.kNearest(comp_samp, samp, age,
                                                  k_neighbors)
                        # Key on the tuple of names: concatenating them lets
                        # different lists collide (['ab','c'] vs ['a','bc']).
                        cache_key = tuple(neighbors)
                        try:
                            rt = rt_cache[cache_key]
                        except KeyError:
                            rt = dirac.getRT(srts.loc[:, neighbors])
                            rt_cache[cache_key] = rt
                        rms = self.getRMS(rt, srts[samp])
                        self.setRMS(rms, r_index, samp)
                    # Debug print is emitted once per run only.
                    check = False
        previous = current

        c = self.classify()
        for key in c.keys():
            for i in c[key].index:
                if truth[key][i] <= c[key][i]:
                    # A label can be counted at most once per run.
                    assert c_results[key][i] <= ctr, (
                        "key: [%s] index[%s] ctr[%i] value[%i]"
                        % (key, i, ctr, c_results[key][i]))
                    c_results[key][i] += 1
            # NOTE(review): per-run debug dumps to a hard-coded scratch
            # directory; this fails if the directory does not exist.
            truth[key].to_pickle(
                '/scratch/sgeadmin/unjoined.truth.perm.%s.df.%i.%i.pkl'
                % (key, self._comm.rank, ctr))
            c[key].to_pickle(
                '/scratch/sgeadmin/unjoined.perm.%s.df.%i.%i.pkl'
                % (key, self._comm.rank, ctr))
    return c_results
def runPerm(self, num_runs, k_neighbors, truth):
    """Repeat the scoring with shuffling enabled and count runs that reach ``truth``.

    Every run re-initialises each strain with ``shuffle=True``, recomputes
    and stores the RMS values exactly as ``runBase`` does, and calls
    ``self.classify()``.  For every key and index label of the
    classification, the counter is incremented when
    ``truth[key][label] <= classified[key][label]``.

    Args:
        num_runs: Number of shuffled runs.
        k_neighbors: Passed to ``self.kNearest`` as its last argument.
        truth: Mapping of key -> labelled vector (needs ``copy``, ``index``,
            item access and ``to_pickle``; presumably pandas objects).

    Returns:
        dict with the keys of ``truth``; each value is a copy of the
        corresponding ``truth`` entry holding the per-label counts.

    Raises:
        AssertionError: if a run yields a sample group whose members were
            all present in the previous run's group (shuffle sanity check),
            or if a counter exceeds the number of completed runs.
    """
    # One zeroed counter vector per truth entry, carrying the same labels.
    # ``items()`` works on both Python 2 and 3 (``iteritems`` is 2-only).
    c_results = {}
    for key, observed in truth.items():
        counts = observed.copy()
        for label in observed.index:
            counts[label] = 0
        c_results[key] = counts

    # Only the previous run's groups are ever compared against, so there is
    # no need to keep the groups of every run alive.
    previous = None
    for ctr in range(num_runs):
        current = {}
        check = True
        for cstrain in self.getStrains():
            logging.info("Strain[%s] starting", cstrain)
            mypws = self.getMyPathways()
            alleles = self.getAlleles(cstrain)
            self.initStrain(cstrain, mypws, shuffle=True)
            for pw in mypws:
                # Templates are built from this pathway's SRTs, so the
                # cache must not outlive the pathway.
                rt_cache = {}
                srts = self.genSRTs(cstrain, pw)
                for a_base, a_compare in itertools.product(alleles, alleles):
                    r_index = "%s_%s" % (pw, a_compare)
                    base_samp = self.getSamplesByAllele(cstrain, a_base)

                    # Shuffle sanity check: at least one member of this
                    # group must be new compared with the previous run.
                    group = (pw, cstrain, a_base)
                    current[group] = base_samp
                    if previous is not None:
                        earlier = previous[group]
                        same = all(x in earlier for x in base_samp)
                        assert not same, (
                            'In runperm\n'
                            + ''.join(map(str, base_samp))
                            + '\n and \n '
                            + ''.join(map(str, earlier)))

                    if check and self._comm.rank == 0:
                        # Same output as the Python 2 print statement, and
                        # valid syntax on Python 3 as well.
                        print("%s %s %s" % (ctr, a_base, base_samp[:5]))

                    comp_samp = self.getSamplesByAllele(cstrain, a_compare)
                    for age, samp in base_samp:
                        neighbors = self.kNearest(comp_samp, samp, age,
                                                  k_neighbors)
                        # Key on the tuple of names: concatenating them lets
                        # different lists collide (['ab','c'] vs ['a','bc']).
                        cache_key = tuple(neighbors)
                        try:
                            rt = rt_cache[cache_key]
                        except KeyError:
                            rt = dirac.getRT(srts.loc[:, neighbors])
                            rt_cache[cache_key] = rt
                        rms = self.getRMS(rt, srts[samp])
                        self.setRMS(rms, r_index, samp)
                    # Debug print is emitted once per run only.
                    check = False
        previous = current

        c = self.classify()
        for key in c.keys():
            for i in c[key].index:
                if truth[key][i] <= c[key][i]:
                    # A label can be counted at most once per run.
                    assert c_results[key][i] <= ctr, (
                        "key: [%s] index[%s] ctr[%i] value[%i]"
                        % (key, i, ctr, c_results[key][i]))
                    c_results[key][i] += 1
            # NOTE(review): per-run debug dumps to a hard-coded scratch
            # directory; this fails if the directory does not exist.
            truth[key].to_pickle(
                '/scratch/sgeadmin/unjoined.truth.perm.%s.df.%i.%i.pkl'
                % (key, self._comm.rank, ctr))
            c[key].to_pickle(
                '/scratch/sgeadmin/unjoined.perm.%s.df.%i.%i.pkl'
                % (key, self._comm.rank, ctr))
    return c_results