def findBurstsInTrainIFR(train, alpha=0.95):
    """
    Find bursts of high activity (high instantaneous firing rate) in a train
    of pulses with respect to a statistical threshold taken from the
    alpha-quantile of the IFR distribution.
    Example:
    bursts = findBurstsInTrainIFR(train, alpha=0.95)
    """
    nSpikes = len(train)
    ifr = sc.zeros(nSpikes)
    isi = sc.zeros(nSpikes)
    dIFR = sc.zeros(nSpikes)
    isi[1:] = train[1:] - train[:-1]
    # ifr[1:] = 1 / (train[1:] - train[:-1]); without this assignment the IFR
    # stayed at zero and dIFR was identically zero. ifr[0] has no preceding
    # interval and is left at zero.
    ifr[1:] = 1.0 / isi[1:]
    dIFR[1:] = (ifr[1:] - ifr[:-1]) / isi[1:]
    ifrCDF, ifrCDFInverse = calcCDF(ifr, graph=0)
    isiCDF, isiCDFInverse = calcCDF(isi, graph=0)
    # Find spikes during high activity
    ifrUpThresh = calcThresholdsFromCDF(ifrCDFInverse, (alpha, ))
    ifrDnThresh = calcThresholdsFromCDF(ifrCDFInverse, (1 - alpha, ))
    isiThresh = calcThresholdsFromCDF(isiCDFInverse, (alpha, ))
    # Spikes whose IFR exceeds the upper threshold, plus their left neighbours
    rHighInds = sc.where(ifr > ifrUpThresh)[0]
    lHighInds = rHighInds - 1
    highInds = sc.union1d(rHighInds, lHighInds)
    highSpikeTimes = train[highInds]
    # Runs of consecutive high-IFR indices form bursts; a gap > 1 starts a new one
    aa = sc.zeros(len(highInds))
    aa[1:] = sc.diff(highInds)
    startInds = sc.where(aa != 1)[0]
    burstStarts = highSpikeTimes[startInds]
    # highSpikeTimes[startInds - 1] is the last spike of the *previous* burst,
    # so starts and ends are shifted against each other below to pair up.
    burstEnds = highSpikeTimes[startInds - 1]
    nBursts = len(burstStarts)
    burstStarts = burstStarts[:-1]
    burstEnds = burstEnds[1:]
    pBurst = sc.float32(nBursts) / nSpikes
    burstDurs = burstEnds - burstStarts
    bursts = {"train": train, "highInds": highInds,
              "highSpikeTimes": highSpikeTimes,
              "burstStarts": burstStarts, "burstEnds": burstEnds,
              "nBursts": nBursts, "nSpikes": nSpikes,
              "pBurst": pBurst, "burstDurs": burstDurs,
              "ifr": ifr, "ifrCDF": ifrCDF, "ifrCDFInverse": ifrCDFInverse,
              "alpha": alpha, "ifrThresh": ifrUpThresh,
              "isi": isi, "isiCDF": isiCDF, "isiCDFInverse": isiCDFInverse,
              "isiThresh": isiThresh, "dIFR": dIFR}
    return bursts
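# A minimal, self-contained sketch of the burst-detection idiom above, using
# plain numpy and a fixed IFR threshold in place of calcCDF /
# calcThresholdsFromCDF (those helpers are not shown in this file). The toy
# train and the threshold value are invented for illustration.
import numpy as np

train = np.array([0.0, 0.5, 0.52, 0.54, 0.56, 1.2, 1.9, 1.92, 1.94, 2.8])
isi = np.diff(train)
ifr = 1.0 / isi                            # instantaneous rate per interval
ifrThresh = 10.0                           # stand-in for the alpha-quantile
rHigh = np.where(ifr > ifrThresh)[0] + 1   # spikes closing a fast interval
high = np.union1d(rHigh, rHigh - 1)        # include the spike opening it
# Split the high-IFR indices into runs: a jump > 1 starts a new burst.
breaks = np.where(np.diff(high) != 1)[0] + 1
for run in np.split(high, breaks):
    print('burst from t=%.2f to t=%.2f (%d spikes)'
          % (train[run[0]], train[run[-1]], len(run)))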
def make_unique_by_event(event_list):
    # function event_list = make_unique_by_event(event_list)
    #
    # This script removes all events that share the same alternative event
    # coordinates but differ in the flanking size. The longest of several
    # equal events is kept.

    rm_idx = []
    last_kept = 0
    for i in range(1, event_list.shape[0]):
        if i % 1000 == 0:
            print '.',
            if i % 10000 == 0:
                print '%i' % i

        old_coords = event_list[last_kept].get_inner_coords(trafo=True)
        curr_coords = event_list[i].get_inner_coords(trafo=True)

        if old_coords.shape[0] == curr_coords.shape[0] and sp.all(old_coords == curr_coords):
            ### assert that we did everything right
            assert(event_list[last_kept].chr == event_list[i].chr)
            assert(event_list[last_kept].strand == event_list[i].strand)

            ### check which event is longer -> keep the longer event
            len1 = event_list[last_kept].get_len()
            len2 = event_list[i].get_len()
            if len1 > len2:
                keep_idx = last_kept
                not_keep_idx = i
            else:
                keep_idx = i
                not_keep_idx = last_kept

            ### check if we would lose strains
            idx = sp.where(~sp.in1d(event_list[not_keep_idx].strain, event_list[keep_idx].strain))[0]
            if idx.shape[0] > 0:
                event_list[keep_idx].strain = sp.r_[event_list[keep_idx].strain, event_list[not_keep_idx].strain[idx]]
                ### TODO !!!!!!!!!!!!! make sure that we keep different coordinates if the strains differ ...
            event_list[keep_idx].gene_name = sp.union1d(event_list[keep_idx].gene_name, event_list[not_keep_idx].gene_name)
            rm_idx.append(not_keep_idx)
            last_kept = keep_idx
        else:
            last_kept = i

    print 'events dropped: %i' % len(rm_idx)
    keep_idx = sp.where(~sp.in1d(sp.arange(event_list.shape[0]), rm_idx))[0]
    event_list = event_list[keep_idx]
    return event_list
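# A small, self-contained illustration of the merge step above: strains that
# only the discarded event carries are appended to the kept event, and gene
# names are unioned. Plain numpy stands in for the scipy alias; the arrays
# are invented for the example.
import numpy as np

kept_strains = np.array(['s1', 's2'])
dropped_strains = np.array(['s2', 's3', 's4'])
idx = np.where(~np.in1d(dropped_strains, kept_strains))[0]
kept_strains = np.r_[kept_strains, dropped_strains[idx]]
print(kept_strains)                                   # ['s1' 's2' 's3' 's4']
print(np.union1d(['geneA'], ['geneA', 'geneB']))      # ['geneA' 'geneB']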
def reduce(PSI_l, Xl, coverage_threshold):
    """
    Computes a set-cover reduction to get the most relevant samples that
    define the class Xl.

    :param PSI_l: (Nl x 2) matrix containing both the scale and the shape of
        the Weibull distribution
    :param Xl: (Nl x dimension_feature_vector) matrix containing the feature
        vectors of each instance of a class
    :param coverage_threshold: probability above which an instance is
        considered covered by (and thus redundant with) another instance of
        its class
    :return: the indices of the most representative samples of the class
    """
    # This matrix D is symmetric
    D = ppp_cosine_similarity(Xl, Xl)
    # Number of instances of the class
    Nl = np.shape(D)[0]
    S = []
    for i in range(Nl):
        Si = []
        for j in range(Nl):
            if psi_i_dist(D[i, j], PSI_l[i, 0], PSI_l[i, 1]) >= coverage_threshold:
                # Sample j is covered by sample i
                Si.append(j)
        S.append(Si)
    # Universe
    U = list(range(Nl))
    # Covered indices
    C = []
    # Final indices
    I = []
    # Original index of each remaining set; deleting entries from S below
    # would otherwise invalidate the indices appended to I.
    S_idx = list(range(Nl))
    # Greedy set cover: repeatedly take the set that covers the most
    # still-uncovered elements. (The original loop never updated punct_ref
    # and tested `eleme is False`, which is always False for numpy bools.)
    while len(np.intersect1d(C, U)) != len(U):
        punct_ref = -1
        ind = 0
        for k, s in enumerate(S):
            # Number of elements of s that are not covered yet
            punct = int(np.sum(~np.isin(s, C)))
            if punct > punct_ref:
                punct_ref = punct
                ind = k
        C = np.union1d(C, S[ind])
        I.append(S_idx[ind])
        del S[ind]
        del S_idx[ind]
        if len(S) == 0:
            break
    return I
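# A self-contained toy run of the greedy set-cover loop above, with hand-made
# coverage sets in place of the Weibull-based psi_i_dist scores.
import numpy as np

S = [[0, 1, 2], [2, 3], [3, 4, 5], [0, 5]]
S_idx = list(range(len(S)))
U = list(range(6))
C, I = [], []
while len(np.intersect1d(C, U)) != len(U):
    gains = [int(np.sum(~np.isin(s, C))) for s in S]
    k = int(np.argmax(gains))
    C = np.union1d(C, S[k])
    I.append(S_idx[k])
    del S[k], S_idx[k]
print(I)   # [0, 2] -- sets {0,1,2} and {3,4,5} cover the universe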
def trainBursts(train, aBinSize=0.05, maxTime=1.0, alpha=0.95):
    """
    Find bursts of high activity (high instantaneous firing rate) in a train
    of pulses with respect to a statistical threshold taken from the
    alpha-quantile of the IFR distribution.
    Example:
    bursts = trainBursts(train, aBinSize=0.05, maxTime=1.0, alpha=0.95)
    """
    nSpikes = len(train)
    isi = sc.zeros(nSpikes)
    dIFR = sc.zeros(nSpikes)
    isi[1:] = train[1:] - train[:-1]
    # Guard the first entry: isi[0] is 0 (no preceding spike), so a bare
    # ifr = 1/isi would divide by zero.
    ifr = sc.zeros(nSpikes)
    ifr[1:] = 1.0 / isi[1:]
    # Centered difference of the IFR; the first two entries are left at zero
    # because isi[0] is undefined.
    dIFR[2:] = ((ifr[2:] - ifr[1:-1]) / isi[2:] + (ifr[2:] - ifr[1:-1]) / isi[1:-1]) / 2.0
    ifrCDF, ifrCDFInverse = calcCDF(ifr, graph=0)
    #isiCDF, isiCDFInverse = calcCDF(isi, graph=0)
    # Find spikes during high activity
    ifrUpThresh = calcThresholdsFromCDF(ifrCDFInverse, (alpha,))
    ifrDnThresh = calcThresholdsFromCDF(ifrCDFInverse, (1 - alpha,))
    #isiThresh = calcThresholdsFromCDF(isiCDFInverse, (alpha,))
    rHighInds = sc.where(ifr > ifrUpThresh)[0]
    lHighInds = rHighInds - 1
    highInds = sc.union1d(rHighInds, lHighInds)
    highSpikeTimes = train[highInds]
    aa = sc.zeros(len(highInds))
    aa[1:] = sc.diff(highInds)
    startInds = sc.where(aa != 1)[0]
    burstStarts = highSpikeTimes[startInds]
    burstEnds = highSpikeTimes[startInds - 1]
    nBursts = len(burstStarts)
    burstStarts = burstStarts[:-1]
    burstEnds = burstEnds[1:]
    pBurst = sc.float32(nBursts) / nSpikes
    burstDurs = burstEnds - burstStarts
    # Autocorrelogram of the train, converted to a rate. The original zeroed
    # c[0] *after* cHz had been computed, which had no effect on the returned
    # array; zero the zero-lag bin of cHz instead.
    c, b = xcorr(train, train, aBinSize, maxTime, minTime=0)
    cHz = c / aBinSize
    cHz[0] = 0.0
    bursts = {"train": train, "highInds": highInds,
              "highSpikeTimes": highSpikeTimes,
              "burstStarts": burstStarts, "burstEnds": burstEnds,
              "nBursts": nBursts, "nSpikes": nSpikes,
              "pBurst": pBurst, "burstDurs": burstDurs,
              "alpha": alpha,
              "ifr": ifr, "ifrCDF": ifrCDF, "ifrCDFInverse": ifrCDFInverse,
              "ifrThresh": ifrUpThresh,
              "isi": isi,
              #"isiCDF": isiCDF, "isiCDFInverse": isiCDFInverse, "isiThresh": isiThresh,
              "dIFR": dIFR,
              "aCorrHz": cHz, "aCorrBins": b}
    return bursts
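# xcorr is a local helper not shown in this file; below is a minimal numpy
# sketch of a spike-train autocorrelogram with the same (binSize, maxTime,
# minTime) interface assumed above. It is an illustration, not the original
# implementation.
import numpy as np

def autocorrelogram(train, binSize, maxTime, minTime=0.0):
    # Histogram all pairwise spike-time lags falling in [minTime, maxTime].
    lags = train[None, :] - train[:, None]
    lags = lags[(lags >= minTime) & (lags <= maxTime)]
    bins = np.arange(minTime, maxTime + binSize, binSize)
    counts, edges = np.histogram(lags, bins=bins)
    return counts, edges[:-1]

train = np.sort(np.random.rand(200))
c, b = autocorrelogram(train, 0.05, 1.0)
cHz = c / 0.05          # counts per bin -> rate
cHz[0] = 0.0            # drop the zero-lag (self-pair) peak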
# Module-level dependencies assumed by this function, listed for completeness:
from numpy import asarray, hstack, int_, ones, setxor1d, sort, tile, union1d

vTol = 1.0e-14  # default vector tolerance (see docstring below)


def findDuplicateVectors(vec, tol=vTol, equivPM=False):
    """
    Find vectors in an array that are equivalent to within a specified
    tolerance.

    USAGE:

        eqv, uid = findDuplicateVectors(vec, tol, equivPM)

    INPUT:

        1) vec is n x m, a double array of m horizontally concatenated
           n-dimensional vectors.
       *2) tol is 1 x 1, a scalar tolerance.  If not specified, the default
           tolerance is 1e-14.
       *3) set equivPM to True if vec and -vec are to be treated as
           equivalent

    OUTPUT:

        1) eqv is 1 x p, a list of p equivalence relationships.

    NOTES:

        Each equivalence relationship is a 1 x q vector of indices that
        represent the locations of duplicate columns/entries in the array
        vec.  For example:

                  | 1 2 2 2 1 2 7 |
            vec = |               |
                  | 2 3 5 3 2 3 3 |

            eqv = [[1x2 double] [1x3 double]], where

            eqv[0] = [0  4]
            eqv[1] = [1  3  5]
    """
    vlen = vec.shape[1]
    vlen0 = vlen
    orid = asarray(range(vlen), dtype="int")

    torid = orid.copy()
    tvec = vec.copy()
    eqv = []
    eqvTot = 0
    uid = 0

    ii = 1
    while vlen > 1 and ii < vlen0:
        dupl = tile(tvec[:, 0], (vlen, 1))

        if not equivPM:
            diff = abs(tvec - dupl.T).sum(0)
            match = abs(diff[1:]) <= tol    # logical to find duplicates
        else:
            diffn = abs(tvec - dupl.T).sum(0)
            matchn = abs(diffn[1:]) <= tol
            diffp = abs(tvec + dupl.T).sum(0)
            matchp = abs(diffp[1:]) <= tol
            match = matchn + matchp

        kick = hstack([True, match])        # pick self too

        if kick.sum() > 1:
            eqv += [torid[kick].tolist()]
            eqvTot = hstack([eqvTot, torid[kick]])
            uid = hstack([uid, torid[kick][0]])

        cmask = ones((vlen, ))
        cmask[kick] = 0
        cmask = cmask != 0

        tvec = tvec[:, cmask]
        torid = torid[cmask]
        vlen = tvec.shape[1]
        ii += 1

    if len(eqv) == 0:
        eqvTot = []
        uid = []
    else:
        eqvTot = eqvTot[1:].tolist()
        uid = uid[1:].tolist()

    # find all single-instance vectors
    singles = sort(setxor1d(eqvTot, range(vlen0)))

    # now construct list of unique vector column indices
    uid = int_(sort(union1d(uid, singles))).tolist()

    # make sure it is a 1D list
    if not hasattr(uid, '__len__'):
        uid = [uid]

    return eqv, uid
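# Usage check against the docstring example (assumes findDuplicateVectors and
# its numpy imports above are in scope):
from numpy import array

vec = array([[1, 2, 2, 2, 1, 2, 7],
             [2, 3, 5, 3, 2, 3, 3]], dtype=float)
eqv, uid = findDuplicateVectors(vec)
print(eqv)   # [[0, 4], [1, 3, 5]]
print(uid)   # [0, 1, 2, 6]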
assert len(deltas) == conf_idx.shape[0]
for i, c in enumerate(conf_idx):
    if i > 0 and i % 1000 == 0:
        sys.stdout.write('.')
        if i % 10000 == 0:
            sys.stdout.write('%i/%i\n' % (i, conf_idx.shape[0]))
        sys.stdout.flush()
    d = sp.array(deltas[i])
    #if (IN['event_pos'][c, 3] - IN['event_pos'][c, 2]) % 3 != 0:
    if sp.sum(d[1::2] - d[::2]) % 3 != 0:
        k2_idx.append(i)
k2_idx = sp.array(k2_idx, dtype='int')
print 'flagged %i of %i events that are out of frame' % (k2_idx.shape[0], conf_idx.shape[0])

### integrate k1 and k2 events
k_idx = sp.union1d(k1_idx, k2_idx)
conf_idx = conf_idx[k_idx]
#pos = pos[k_idx, :]
print 'retaining %i events' % (conf_idx.shape[0])

cPickle.dump(conf_idx, open(os.path.join(basedir, 'merge_graphs_%s_C%i.function_idx.cpickle' % (event_type, CONF)), 'w'), -1)

#print 'loading psi'
#psi = IN['psi'][:]
#print 'done'
#
#### remove all events that have nan as PSI in more than 10% of samples
#k_idx = []
#for i, c in enumerate(conf_idx):
#    if i > 0 and i % 1000 == 0:
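# A tiny illustration of the frame test above: deltas holds coordinate pairs
# flattened as [start1, end1, start2, end2, ...]; a summed segment length not
# divisible by 3 flags an out-of-frame event. Values are invented.
import numpy as np

d = np.array([10, 25, 40, 47])         # two segments: lengths 15 and 7
print((d[1::2] - d[::2]).sum() % 3)    # 22 % 3 = 1 -> out of frame
print(np.union1d([0, 2], [1, 2, 5]))   # merging flag index sets: [0 1 2 5]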
if not os.path.exists(run_dir):
    os.makedirs(run_dir)

# load data
f = h5py.File(CFG['data_file'], 'r')
Y = f['LogNcountsQuartz'][:]
tech_noise = f['LogVar_techQuartz_logfit'][:]
genes_het_bool = f['genes_heterogen'][:]     # index of heterogeneous genes
geneID = f['gene_names_all'][:]              # gene names
cellcyclegenes_filter = SP.unique(f['ccGO_gene_indices'][:].ravel() - 1)   # idx of cell cycle genes (GO)
cellcyclegenes_filterCB600 = f['ccCBall_gene_indices'][:].ravel() - 1      # idx of cell cycle genes (CB list)

# filter cell cycle genes
idx_cell_cycle = SP.union1d(cellcyclegenes_filter, cellcyclegenes_filterCB600)
Ymean2 = Y.mean(0)**2 > 0
idx_cell_cycle_noise_filtered = SP.intersect1d(idx_cell_cycle, SP.array(SP.where(Ymean2.ravel() > 0)))
Ycc = Y[:, idx_cell_cycle_noise_filtered]

# Fit GPLVM to data
k = 1                               # number of latent factors
file_name = CFG['panama_file']      # name of the cache file
recalc = True                       # recalculate X and Kconf
sclvm = scLVM(Y)
X, Kcc, varGPLVM = sclvm.fitGPLVM(idx=idx_cell_cycle_noise_filtered, k=1, out_dir='./cache', file_name=file_name, recalc=recalc)

# 3. load relevant dataset for analysis
genes_het = SP.array(SP.where(f['genes_heterogen'][:].ravel() == 1))    # considers only heterogeneous genes
print(i)

spike_samples_clean = pl.delete(spike_samples_clean, 0)
pl.save(os.path.join(memap_folder, 'spike_samples_clean.npy'), spike_samples_clean)

channels = np.empty(0)
for i in pl.arange(0, pl.size(spike_samples_clean)):
    data = np.array(data_probe_hp[:, spike_samples_clean[i]].tolist())
    channels = np.append(channels, np.argmax(data))
    if i % 100 == 0:
        print(i)
channels_spikes_df = pd.DataFrame([(channels, spike_samples_clean)], columns=['Channels', 'Samples'])

# Assign spikes to shafts by the channel carrying the largest amplitude.
# The two range tests are combined with a single boolean mask: chaining two
# full-length masks (x[m1][m2]) would mis-index the already shortened array.
samples = channels_spikes_df.Samples[0]
chans = channels_spikes_df.Channels[0]
spike_times_shaftA = samples[(chans > 7) & (chans < 16)]
spike_times_shaftB = samples[chans > 23]
spike_times_shaftD = samples[chans < 8]
spike_times_shaftC = sp.setxor1d(spike_samples_clean, sp.union1d(spike_times_shaftA, sp.union1d(spike_times_shaftB, spike_times_shaftD)))
pl.save(os.path.join(memap_folder, 'spike_times_shaftA.npy'), spike_times_shaftA)
pl.save(os.path.join(memap_folder, 'spike_times_shaftC.npy'), spike_times_shaftC)

# ----------Analysis---------------------
f_ecog = f_sampling / (int(f_sampling / f_subsample))
spike_times_shaftA_ecog = np.array(spike_times_shaftA * f_ecog / f_sampling, dtype='int')
spike_times_shaftC_ecog = np.array(spike_times_shaftC * f_ecog / f_sampling, dtype='int')
data_ecog_lp_ss_clean = np.delete(data_ecog_lp_ss, ecog_bad_channels, axis=0)

# Generate eMUA for each Shaft
time_around_spike = 2
time_points_around_spike = int(time_around_spike * f_sampling)
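# Self-contained sketch of the shaft partition above with made-up channel
# numbers: combined masks pick shafts A/B/D, and setxor1d against the union
# leaves the remainder for shaft C.
import numpy as np

samples = np.arange(10) * 100          # fake spike sample times
chans = np.array([0, 5, 9, 12, 15, 18, 20, 24, 30, 7])
shaftA = samples[(chans > 7) & (chans < 16)]
shaftB = samples[chans > 23]
shaftD = samples[chans < 8]
shaftC = np.setxor1d(samples, np.union1d(shaftA, np.union1d(shaftB, shaftD)))
print(shaftA, shaftB, shaftD, shaftC)   # shaft C gets channels 16-23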
count_file_SV_maternal = os.path.join(out_SV_maternal, 'chr%s_maternal' % chrom, result_file)
count_file_SV_paternal = os.path.join(out_SV_paternal, 'chr%s_paternal' % chrom, result_file)

if ((not os.path.exists(count_file_GRCH37)) or
        (not os.path.exists(count_file_SNP_maternal)) or
        (not os.path.exists(count_file_SNP_paternal)) or
        (not os.path.exists(count_file_SV_maternal)) or
        (not os.path.exists(count_file_SV_paternal))):
    print "skip: %s" % element_id
    RV_file_exist.append([element_id, os.path.exists(count_file_GRCH37), os.path.exists(count_file_SNP_maternal), os.path.exists(count_file_SNP_paternal), os.path.exists(count_file_SV_maternal), os.path.exists(count_file_SV_paternal)])
    RV_file.append([element_id, count_file_GRCH37, count_file_SNP_maternal, count_file_SNP_paternal, count_file_SV_maternal, count_file_SV_paternal])
    continue

# 1. load lists
count_GRCH37 = cPickle.load(open(count_file_GRCH37, 'rb'))
count_SNP_maternal = cPickle.load(open(count_file_SNP_maternal, 'rb'))
count_SNP_paternal = cPickle.load(open(count_file_SNP_paternal, 'rb'))
count_SV_maternal = cPickle.load(open(count_file_SV_maternal, 'rb'))
count_SV_paternal = cPickle.load(open(count_file_SV_paternal, 'rb'))

count_SNP = SP.union1d(count_SNP_maternal, count_SNP_paternal)
count_SV = SP.union1d(count_SV_maternal, count_SV_paternal)

count_intersect_GRCH37_SNP = SP.intersect1d(count_SNP, count_GRCH37)
count_intersect_GRCH37_SV = SP.intersect1d(count_SV, count_GRCH37)
count_intersect_SNP_SV = SP.intersect1d(count_SNP, count_SV)

count_ex_GRCH37_SNP = SP.setdiff1d(count_GRCH37, count_SNP)
count_ex_GRCH37_SV = SP.setdiff1d(count_GRCH37, count_SV)
count_ex_SNP_GRCH37 = SP.setdiff1d(count_SNP, count_GRCH37)
count_ex_SV_GRCH37 = SP.setdiff1d(count_SV, count_GRCH37)
count_ex_SNP_SV = SP.setdiff1d(count_SNP, count_SV)
count_ex_SV_SNP = SP.setdiff1d(count_SV, count_SNP)

# store a couple of things
rv = {'element_id': element_id,
      'count_ref': len(count_GRCH37),
      'count_SNP_maternal': len(count_SNP_maternal),
      'count_SNP_paternal': len(count_SNP_paternal),
      'count_SV_maternal': len(count_SV_maternal),
      'count_SV_paternal': len(count_SV_paternal),
      'count_SNP': len(count_SNP),
      'count_SV': len(count_SV),
      'count_intersect_GRCH37_SNP': len(count_intersect_GRCH37_SNP),
      'count_intersect_GRCH37_SV': len(count_intersect_GRCH37_SV),
      'count_intersect_SNP_SV': len(count_intersect_SNP_SV),
      'count_ex_GRCH37_SNP': len(count_ex_GRCH37_SNP),
      'count_ex_GRCH37_SV': len(count_ex_GRCH37_SV),
      'count_ex_SNP_GRCH37': len(count_ex_SNP_GRCH37),
      'count_ex_SV_GRCH37': len(count_ex_SV_GRCH37),
      'count_ex_SNP_SV': len(count_ex_SNP_SV),
      'count_ex_SV_SNP': len(count_ex_SV_SNP)}
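# A quick, self-contained check of the set bookkeeping above with toy read
# identifiers: for any pair of sets, the intersection and the exclusive part
# partition the reference set.
import numpy as np

ref = np.array([1, 2, 3, 4, 5])
snp = np.union1d([2, 3], [3, 6])               # maternal union paternal
print(np.intersect1d(snp, ref))                # [2 3]
print(np.setdiff1d(ref, snp))                  # [1 4 5]
print(len(np.intersect1d(snp, ref)) + len(np.setdiff1d(ref, snp)) == len(ref))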
def load_data(CFG, is_Ens=True, gene_set='GOCB', het_only=True, het_onlyCB=True,
              pairs=False, filter_median=True, combine=False, filter_expressed=0):
    f = h5py.File(CFG['train_file'], 'r')
    Y = f['LogNcountsMmus'][:]
    labels = f['labels'][:].ravel()

    futil = h5py.File(CFG['util_file'], 'r')
    Y_util = futil['LogNcountsQuartz'][:]

    ftst = h5py.File(CFG['test_file'], 'r')

    if is_Ens == True:
        genes = f['EnsIds'][:]
        genes_util = futil['gene_names_all'][:]
    else:
        genes = SP.char.lower(f['sym_names'][:])
        genes_util = SP.char.lower(futil['sym_namesQ'][:])

    # test file
    labels_util = futil['phase_vecS'][:]*2 + futil['phase_vecG2M'][:]*3 + futil['phase_vecG1'][:]
    if CFG['util_file'] == CFG['test_file']:
        genes_tst = genes_util
        YT = ftst['LogNcountsQuartz'][:]
        labels_tst = ftst['phase_vecS'][:]*2 + ftst['phase_vecG2M'][:]*3 + ftst['phase_vecG1'][:]
    elif is_Ens == False:
        ftst = h5py.File(CFG['test_file'], 'r')
        YT = ftst['counts'][:]
        genes_tst = SP.char.lower(ftst['sym_names'][:])
        #genes_tst = ftst['ensIds'][:]
        labels_tst = ftst['labels'][:].ravel()
    elif is_Ens == True:
        ftst = h5py.File(CFG['test_file'], 'r')
        YT = ftst['counts'][:]
        #genes_tst = ftst['sym_names'][:]
        genes_tst = ftst['ensIds'][:]
        labels_tst = ftst['labels'][:].ravel()

    if 'class_labels' in ftst.keys():
        class_labels = ftst['class_labels'][:]
    else:
        class_labels = [i.astype('str') for i in labels_tst]
        class_labels = SP.sort(SP.unique(class_labels))

    heterogen_util = genes_util[SP.intersect1d(SP.where(Y_util.mean(0) > 0)[0], SP.where(futil['genes_heterogen'][:] == 1)[0])]
    heterogen_train = genes[SP.intersect1d(SP.where(Y.mean(0) > 0)[0], SP.where(f['genes_heterogen'][:] == 1)[0])]

    cellcyclegenes_GO = genes[SP.unique(f['cellcyclegenes_filter'][:].ravel() - 1)]   # idx of cell cycle genes (GO)
    cellcyclegenes_CB = genes[f['ccCBall_gene_indices'][:].ravel() - 1]               # idx of cell cycle genes (CB list)

    if SP.any(gene_set == 'GOCB'):
        cc_ens = SP.union1d(cellcyclegenes_GO, cellcyclegenes_CB)
    elif SP.any(gene_set == 'GO'):
        cc_ens = cellcyclegenes_GO
    elif SP.any(gene_set == 'CB'):
        cc_ens = cellcyclegenes_CB
    elif SP.any(gene_set == 'all'):
        cc_ens = genes
    else:
        #assert gene_set in CFG.keys(), str(gene_set + ' does not exist. Choose a different gene set.')
        cc_ens = gene_set

    if het_only == True:
        cc_ens = SP.intersect1d(cc_ens, heterogen_train)
    if pairs == True:
        Y = Y[:, SP.where(f['genes_heterogen'][:] == 1)[0]]
        genes = genes[SP.where(f['genes_heterogen'][:] == 1)[0]]
    if het_onlyCB == True:
        cc_ens = SP.intersect1d(cc_ens, heterogen_util)

    #filter_expressed = .2
    lod = 0
    if filter_expressed > 0:
        medY = SP.sum(Y > lod, 0) * 1.0
        idx_filter = (medY / SP.float_(Y.shape[0])) > filter_expressed
        Y = Y[:, idx_filter]
        genes = genes[idx_filter]
        #medY_tst = SP.sum(Y_tst > lod, 0)
        #Y_tst = Y_tst[:, medY_tst > filter_expressed]
        #genes_tst = genes_tst[medY_tst > filter_expressed]
        medY_util = SP.sum(Y_util > lod, 0)
        idx_filter = (medY_util / SP.float_(Y_util.shape[0])) > filter_expressed
        Y_util = Y_util[:, idx_filter]
        genes_util = genes_util[idx_filter]

    cc_ens = SP.intersect1d(cc_ens, genes)
    cc_ens = SP.intersect1d(cc_ens, genes_tst)
    cc_ens = SP.intersect1d(cc_ens, genes_util)

    if combine == True:
        genes = list(genes)
        genes_util = list(genes_util)
        genes_intersect = SP.intersect1d(genes, genes_util)
        cidx_tr = [genes.index(x) for x in genes_intersect]
        cidx_util = [genes_util.index(x) for x in genes_intersect]
        genes = SP.array(genes)[cidx_tr]
        genes_util = SP.array(genes_util)[cidx_util]
        Y = SP.vstack([Y[:, cidx_tr], Y_util[:, cidx_util]])
        genes = genes_intersect
        labels = SP.hstack([labels, labels_util])

    Y_tst = YT
    cc_data = {}
    cc_data['cc_ens'] = cc_ens
    cc_data['labels_tst'] = labels_tst
    cc_data['labels'] = labels
    cc_data['genes_tst'] = genes_tst
    cc_data['genes'] = genes
    cc_data['Y'] = Y
    cc_data['Y_test'] = Y_tst
    cc_data['class_labels'] = class_labels
    return cc_data
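# Self-contained sketch of the combine step above: align two gene lists on
# their intersection and stack the expression matrices column-matched. Toy
# arrays; note list.index returns the first match, so gene names are assumed
# unique.
import numpy as np

genes = ['g1', 'g2', 'g3']
genes_util = ['g3', 'g1', 'g4']
common = np.intersect1d(genes, genes_util)            # ['g1' 'g3']
cidx_tr = [genes.index(x) for x in common]            # [0, 2]
cidx_util = [genes_util.index(x) for x in common]     # [1, 0]
Y = np.arange(6).reshape(2, 3)                        # 2 cells x 3 genes
Y_util = 10 + np.arange(6).reshape(2, 3)
Y_comb = np.vstack([Y[:, cidx_tr], Y_util[:, cidx_util]])
print(Y_comb)   # columns of both halves now ordered ['g1', 'g3']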
KG2M = SP.zeros((Y.shape[0], Y.shape[0]))
for iph in range(Y.shape[0]):
    for jph in range(Y.shape[0]):
        if SP.bitwise_and(phase_vec[iph] == phase_vec[jph], phase_vec[iph] == 3):
            KG2M[iph, jph] = 1

# intra-phase variations in cell size
sfCellSize = SP.log10(f['ratioEndo'][:])
sfCellSize -= sfCellSize.mean()
sfCellSize = sfCellSize.reshape(1, sfCellSize.shape[0])
Ksize = SP.dot(sfCellSize.transpose(), sfCellSize)
Ksize /= Ksize.diagonal().mean()

# filter cell cycle genes
idx_cell_cycle = SP.union1d(cellcyclegenes_filter, cellcyclegenes_filterCB600)
Ymean2 = Y.mean(0)**2 > 0
idx_cell_cycle_noise_filtered = SP.intersect1d(idx_cell_cycle, SP.array(SP.where(Ymean2.ravel() > 0)))
Ycc = Y[:, idx_cell_cycle_noise_filtered]

# Fit GPLVM to data (a leftover pdb.set_trace() debugger call was removed here)
k = 1                               # number of latent factors
file_name = CFG['panama_file']      # name of the cache file
recalc = True                       # recalculate X and Kconf
sclvm = scLVM(Y)
X, Kcc, varGPLVM = sclvm.fitGPLVM(idx=idx_cell_cycle_noise_filtered, k=1, out_dir='./cache', file_name=file_name, recalc=recalc)

# 3. load relevant dataset for analysis
genes_het = SP.array(SP.where(f['genes_heterogen'][:].ravel() == 1))
tech_noise = f['LogVar_techMmus'][:]
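# The double loop building KG2M is equivalent to an outer product of the
# phase-indicator vector; a self-contained numpy sketch with a toy phase_vec
# (3 encodes G2M here, matching the label encoding used above):
import numpy as np

phase_vec = np.array([1, 3, 2, 3])
g2m = (phase_vec == 3).astype(float)
KG2M = np.outer(g2m, g2m)     # 1 where both cells are in G2M, else 0
print(KG2M)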
def _setup_for_IP(self):
    r"""
    Determines cluster labelling and condition for completion
    """
    self._clock_start = misc.tic()
    self._logger.debug('+='*25)
    self._logger.debug('INITIAL SETUP (STEP 1)')
    # if empty, add Pc_entry to throat_properties
    tdia = self._net['throat.'+self._throat_diameter_name]
    # calculate Pc_entry from diameters
    try:
        self['throat.inv_Pc'] = self._phase['throat.'+self._capillary_pressure_name]
    except:
        self._logger.error('Capillary pressure not assigned to '+self._phase.name)
    if self._timing:
        # calculate Volume_coef for each throat
        self._Tvol_coef = tdia*tdia*tdia*np.pi/12/self['throat.inv_Pc']
    # Creating an array for invaded pores (Np long, 0 for uninvaded, cluster number for invaded)
    self['pore.cluster_final'] = 0
    self['pore.cluster_original'] = 0
    # Creating an array for invaded throats (Nt long, 0 for uninvaded, cluster number for invaded)
    self['throat.cluster_final'] = 0
    # Creating arrays for tracking invaded pores (Np long, 0 for uninvaded, sequence for invaded)
    self['pore.inv_seq'] = 0
    if self._timing:
        # Creating arrays for tracking invaded pores (Np long, -1 for uninvaded, simulation time for invaded)
        self['pore.inv_time'] = -1.
    # Creating arrays for tracking invaded throats (Nt long, 0 for uninvaded, sequence for invaded)
    self['throat.inv_seq'] = 0
    if self._timing:
        # Creating arrays for tracking invaded throats (Nt long, -1 for uninvaded, simulation time for invaded)
        self['throat.inv_time'] = -1.
    # Iterator variables for sequences and cluster numbers
    clusterNumber = 1
    # Determine how many clusters there are
    self._clusterCount = 0
    for i in self._inlets:
        self._clusterCount += 1
    # Storage for cluster information
    self._cluster_data = {}
    if self._timing:
        self._cluster_data['flow_rate'] = np.ones((self._clusterCount), dtype=float)*self._inlet_flow
        self._cluster_data['haines_pressure'] = np.zeros((self._clusterCount), dtype=float)
        self._cluster_data['haines_time'] = np.zeros((self._clusterCount), dtype=float)
        self._cluster_data['vol_coef'] = np.zeros((self._clusterCount), dtype=float)
        self._cluster_data['cap_volume'] = np.zeros((self._clusterCount), dtype=float)
        self._cluster_data['pore_volume'] = np.zeros((self._clusterCount), dtype=float)
        self._cluster_data['throat_volume'] = np.zeros((self._clusterCount), dtype=float)
    self._cluster_data['haines_throat'] = np.zeros((self._clusterCount), dtype=int)
    self._cluster_data['active'] = np.ones((self._clusterCount), dtype=int)
    self._cluster_data['transform'] = np.zeros((self._clusterCount), dtype=int)
    for i in range(self._clusterCount):
        self._cluster_data['transform'][i] = i+1
    # Creating an empty list to store the list of potential throats for
    # invasion in each cluster. Its length is equal to the maximum number of
    # possible clusters.
    self._tlists = [[] for i in self._inlets]
    # Creating a list for each cluster to store both potential throat and corresponding throat value
    self._tpoints = [[] for i in self._inlets]
    # Initializing invasion percolation for each possible cluster
    self._pore_volumes = self._net['pore.'+self._pore_volume_name]
    self._throat_volumes = self._net['throat.'+self._throat_volume_name]
    for pores in self._inlets:
        if sp.shape(pores) == ():
            pores = [pores]
        # Label all invaded pores with their cluster
        self['pore.cluster_original'][pores] = clusterNumber
        # Label all inlet pores as invaded
        self['pore.inv_seq'][pores] = self._tseq
        if self._timing:
            self['pore.inv_time'][pores] = self._sim_time
        # Find all throats that border invaded pores
        interface_throat_numbers = self._net.find_neighbor_throats(pores)
        self.cluster_update(clusterNumber, pores, [], interface_throat_numbers)
        clusterNumber += 1
    if self._timing:
        self._logger.debug('pore volumes')
        self._logger.debug(self._cluster_data['pore_volume'])
        self._logger.debug('cap volumes')
        self._logger.debug(self._cluster_data['cap_volume'])
    self._logger.debug('haines_throats')
    self._logger.debug(self._cluster_data['haines_throat'])
    self._tseq += 1
    self._pseq += 1
    self._current_cluster = 0
    # Calculate the distance between the inlet and outlet pores
    self._outlet_position = np.average(self._net.get_data(prop='coords', pores='all')[self._outlets], 0)
    if any([sp.shape(i) > () for i in self._inlets]):
        # Flatten possibly nested inlet pore lists into one unique index array
        inlets = []
        for i in self._inlets:
            inlets = sp.union1d(inlets, i)
        inlets = sp.array(inlets, int)
    else:
        inlets = self._inlets
    inlet_position = np.average(self._net.get_data(prop='coords', pores='all')[inlets], 0)
    dist_sqrd = (self._outlet_position-inlet_position)*(self._outlet_position-inlet_position)
    self._initial_distance = np.sqrt(dist_sqrd[0]+dist_sqrd[1]+dist_sqrd[2])
    self._logger.debug('initial distance')
    self._logger.debug(self._initial_distance)
    self._current_distance = self._initial_distance
    self._percent_complete = np.round((self._initial_distance-self._current_distance)/self._initial_distance*100, decimals=1)
    self._logger.info('percent complete')
    self._logger.info(self._percent_complete)
    self._rough_complete = 0
    print(' IP algorithm at', np.int(self._rough_complete), '% completion at', np.round(misc.toc(quiet=True)), 'seconds')
    self._logger.debug('+='*25)
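# The inlet flattening above relies on union1d accepting ragged list inputs;
# a self-contained sketch with made-up pore index groups:
import numpy as np

inlet_groups = [[0, 1, 2], [2, 3], [7]]
inlets = []
for g in inlet_groups:
    inlets = np.union1d(inlets, g)
inlets = np.array(inlets, int)
print(inlets)   # [0 1 2 3 7] -- unique, sorted pore indices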
NTotalGenesOfAlpha = 0   # running total over traits (initialized here; the
                         # flattened source started mid-scope)
UniqGenes = []
TraitSetAtAlpha = []
for i in xrange(len(Traits)):
    GeneSetAtAlpha = DataDict[Traits[i]]['GeneSetAtAlpha_'+str(Alpha)]
    NTotalGenesOfAlpha += len(GeneSetAtAlpha)
    UniqGenes.extend(GeneSetAtAlpha)
    if len(GeneSetAtAlpha) > 0:
        TraitSetAtAlpha.append(Traits[i])
TraitSetAtAlpha = scipy.array(TraitSetAtAlpha)

GWIntersection = scipy.intersect1d(ar1=TraitSetAtAlpha, ar2=GWSignTraits, assume_unique=False)
GWMWIntersection = scipy.intersect1d(ar1=TraitSetAtAlpha, ar2=GWMWSignTraits, assume_unique=False)
GWUnion = scipy.union1d(ar1=TraitSetAtAlpha, ar2=GWSignTraits)
GWMWUnion = scipy.union1d(ar1=TraitSetAtAlpha, ar2=GWMWSignTraits)

fw.write(str(Alpha)+'\t'+\
         str(NTotalGenesOfAlpha)+'\t'+\
         str(len(scipy.unique(scipy.array(UniqGenes))))+'\t'+\
         str(len(TraitSetAtAlpha))+'\t'+\
         str(len(GWSignTraits))+'\t'+\
         str(len(GWIntersection))+'\t'+\
         str(float(len(GWIntersection))/float(len(GWUnion)))+'\t'+\
         str(len(TraitSetAtAlpha))+'\t'+\
         str(len(GWMWSignTraits))+'\t'+\
         str(len(GWMWIntersection))+'\t'+\
         str(float(len(GWMWIntersection))/float(len(GWMWUnion)))+'\n')
fw.close()
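# The two ratios written above are Jaccard indices (intersection size over
# union size); a self-contained check with toy trait names:
import numpy as np

A = np.array(['height', 'bmi', 'ldl'])
B = np.array(['bmi', 'ldl', 'hdl'])
inter = np.intersect1d(A, B)
union = np.union1d(A, B)
print(float(len(inter)) / float(len(union)))   # 2/4 = 0.5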
def _setup_for_IP(self):
    r"""
    Determines cluster labelling and condition for completion
    """
    self._clock_start = misc.tic()
    logger.debug('+='*25)
    logger.debug('INITIAL SETUP (STEP 1)')
    # if empty, add Pc_entry to throat_properties
    tdia = self._net['throat.'+self._throat_diameter_name]
    # calculate Pc_entry from diameters
    try:
        self['throat.inv_Pc'] = self._phase['throat.'+self._capillary_pressure_name]
    except:
        logger.error('Capillary pressure not assigned to invading phase '
                     + self._phase.name
                     + ', check for capillary pressure in defending phase '
                     + self._phase_def.name + ' instead')
        try:
            self['throat.inv_Pc'] = self._phase_def['throat.'+self._capillary_pressure_name]
            self._phase['throat.'+self._capillary_pressure_name] = self._phase_def['throat.'+self._capillary_pressure_name]
        except:
            logger.error('Capillary pressure neither assigned to defending phase '
                         + self._phase_def.name
                         + ' nor to invading phase ' + self._phase.name)
    if self._timing:
        # calculate Volume_coef for each throat
        self._Tvol_coef = tdia*tdia*tdia*np.pi/12/self['throat.inv_Pc']
    # Creating an array for invaded pores (Np long, 0 for uninvaded, cluster number for invaded)
    self['pore.cluster_final'] = 0
    self['pore.cluster_original'] = 0
    # Creating an array for invaded throats (Nt long, 0 for uninvaded, cluster number for invaded)
    self['throat.cluster_final'] = 0
    # Creating arrays for tracking invaded pores (Np long, 0 for uninvaded, sequence for invaded)
    self['pore.inv_seq'] = 0
    # Creating arrays for tracking invaded pores (Np long, 0 for uninvaded, pressure for invaded)
    self['pore.inv_pres'] = 0
    if self._timing:
        # Creating arrays for tracking invaded pores (Np long, -1 for uninvaded, simulation time for invaded)
        self['pore.inv_time'] = -1.
    # Creating arrays for tracking invaded throats (Nt long, 0 for uninvaded, sequence for invaded)
    self['throat.inv_seq'] = 0
    # Creating arrays for tracking invaded throats (Nt long, 0 for uninvaded, pressure for invaded)
    self['throat.inv_pres'] = 0
    if self._timing:
        # Creating arrays for tracking invaded throats (Nt long, -1 for uninvaded, simulation time for invaded)
        self['throat.inv_time'] = -1.
    # Iterator variables for sequences and cluster numbers
    clusterNumber = 1
    # Determine how many clusters there are
    self._clusterCount = 0
    for i in self._inlets:
        self._clusterCount += 1
    # Storage for cluster information
    self._cluster_data = {}
    if self._timing:
        self._cluster_data['flow_rate'] = np.ones((self._clusterCount), dtype=float)*self._inlet_flow
        self._cluster_data['haines_pressure'] = np.zeros((self._clusterCount), dtype=float)
        self._cluster_data['haines_time'] = np.zeros((self._clusterCount), dtype=float)
        self._cluster_data['vol_coef'] = np.zeros((self._clusterCount), dtype=float)
        self._cluster_data['cap_volume'] = np.zeros((self._clusterCount), dtype=float)
        self._cluster_data['pore_volume'] = np.zeros((self._clusterCount), dtype=float)
        self._cluster_data['throat_volume'] = np.zeros((self._clusterCount), dtype=float)
    self._cluster_data['haines_throat'] = np.zeros((self._clusterCount), dtype=int)
    self._cluster_data['active'] = np.ones((self._clusterCount), dtype=int)
    self._cluster_data['transform'] = np.zeros((self._clusterCount), dtype=int)
    for i in range(self._clusterCount):
        self._cluster_data['transform'][i] = i+1
    # Creating an empty list to store the list of potential throats for
    # invasion in each cluster. Its length is equal to the maximum number of
    # possible clusters.
    self._tlists = [[] for i in self._inlets]
    # Creating a list for each cluster to store both potential throat and corresponding throat value
    self._tpoints = [[] for i in self._inlets]
    # Initializing invasion percolation for each possible cluster
    self._pore_volumes = self._net['pore.'+self._pore_volume_name]
    self._throat_volumes = self._net['throat.'+self._throat_volume_name]
    for pores in self._inlets:
        if sp.shape(pores) == ():
            pores = [pores]
        # Label all invaded pores with their cluster
        self['pore.cluster_original'][pores] = clusterNumber
        # Label all inlet pores as invaded
        self['pore.inv_seq'][pores] = self._tseq
        self['pore.inv_pres'][pores] = 0
        if self._timing:
            self['pore.inv_time'][pores] = self._sim_time
        # Find all throats that border invaded pores
        interface_throat_numbers = self._net.find_neighbor_throats(pores)
        self.cluster_update(clusterNumber, pores, [], interface_throat_numbers)
        clusterNumber += 1
    if self._timing:
        logger.debug('pore volumes')
        logger.debug(self._cluster_data['pore_volume'])
        logger.debug('cap volumes')
        logger.debug(self._cluster_data['cap_volume'])
    logger.debug('haines_throats')
    logger.debug(self._cluster_data['haines_throat'])
    self._tseq += 1
    self._pseq += 1
    self._current_cluster = 0
    # Calculate the distance between the inlet and outlet pores
    self._outlet_position = np.average(self._net['pore.coords'][self._outlets], 0)
    if any([sp.shape(i) > () for i in self._inlets]):
        # Flatten possibly nested inlet pore lists into one unique index array
        inlets = []
        for i in self._inlets:
            inlets = sp.union1d(inlets, i)
        inlets = sp.array(inlets, int)
    else:
        inlets = self._inlets
    inlet_position = np.average(self._net['pore.coords'][inlets], 0)
    dist_sqrd = (self._outlet_position-inlet_position)*(self._outlet_position-inlet_position)
    self._initial_distance = np.sqrt(dist_sqrd[0]+dist_sqrd[1]+dist_sqrd[2])
    logger.debug('initial distance')
    logger.debug(self._initial_distance)
    self._current_distance = self._initial_distance
    self._percent_complete = np.round((self._initial_distance-self._current_distance)/self._initial_distance*100, decimals=1)
    logger.info('percent complete')
    logger.info(self._percent_complete)
    self._rough_complete = 0
    print(' IP algorithm at', np.int(self._rough_complete), '% completion at', np.round(misc.toc(quiet=True)), 'seconds')
    logger.debug('+='*25)