def findBurstsInTrainIFR(train, alpha=0.95):
    """
    Find bursts of high activity (high instantaneous firing rate) in a train of pulses with respect to a statistical threshold derived from alpha. Example:
    bursts = findBurstsInTrainIFR(train, alpha=0.95)
    """
    nSpikes = len(train)
    ifr = sc.zeros(nSpikes)
    isi = sc.zeros(nSpikes)
    dIFR = sc.zeros(nSpikes)
    isi[1:] = train[1:] - train[:-1]
    #dISI[1:] = isi[1:] /
    ifr[1:] = 1 / (train[1:] - train[:-1])
    dIFR[1:] = (ifr[1:] - ifr[:-1]) / isi[1:]
    ifrCDF, ifrCDFInverse = calcCDF(ifr, graph=0)
    isiCDF, isiCDFInverse = calcCDF(isi, graph=0)
    # Find spikes during high activity
    ifrUpThresh = calcThresholdsFromCDF(ifrCDFInverse, (alpha, ))
    ifrDnThresh = calcThresholdsFromCDF(ifrCDFInverse, (1 - alpha, ))
    isiThresh = calcThresholdsFromCDF(isiCDFInverse, (alpha, ))
    #rHighInds= sc.where( ifr>ifrThresh)[0]
    rHighInds = sc.where(ifr > ifrUpThresh)[0]
    lHighInds = rHighInds - 1
    highInds = sc.union1d(rHighInds, lHighInds)
    highSpikeTimes = train[highInds]
    #lowSpikeTimes= train[lowInds]
    aa = sc.zeros(len(highInds))
    aa[1:] = sc.diff(highInds)
    #bb= sc.zeros(len(lowInds))
    #bb[1:]= sc.diff(lowInds)
    startInds = sc.where(aa != 1)[0]
    burstStarts = highSpikeTimes[startInds]
    burstEnds = highSpikeTimes[startInds - 1]
    nBursts = len(burstStarts)
    burstStarts = burstStarts[:-1]
    burstEnds = burstEnds[1:]
    pBurst = sc.float32(nBursts) / nSpikes
    burstDurs = burstEnds - burstStarts
    bursts = {
        "train": train,
        "highInds": highInds,
        "highSpikeTimes": highSpikeTimes,
        "burstStarts": burstStarts,
        "burstEnds": burstEnds,
        "nBursts": nBursts,
        "nSpikes": nSpikes,
        "pBurst": pBurst,
        "burstDurs": burstDurs,
        "ifr": ifr,
        "ifrCDF": ifrCDF,
        "ifrCDFInverse": ifrCDFInverse,
        "alpha": alpha,
        "ifrThresh": ifrThresh,
        "isi": isi,
        "isiCDF": isiCDF,
        "isiCDFInverse": isiCDFInverse,
        "isiThresh": isiThresh,
        "dIFR": dIFR
    }
    return bursts
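A minimal usage sketch, assuming the helper functions calcCDF and calcThresholdsFromCDF from the same module are in scope; the synthetic train (uniform background spikes plus one dense hypothetical burst) is made up for illustration:

import numpy as sc  # the sc.* calls above (zeros, where, union1d, diff, float32) all exist in numpy

rng = sc.random.default_rng(0)
background = rng.uniform(0.0, 10.0, 200)   # sparse background spikes over 10 s
burst = 5.0 + 0.002 * sc.arange(50)        # 50 spikes packed into ~100 ms
train = sc.sort(sc.concatenate([background, burst]))
bursts = findBurstsInTrainIFR(train, alpha=0.95)
print(bursts["nBursts"], bursts["pBurst"], bursts["burstDurs"])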
Example #2
def make_unique_by_event(event_list):
    # function event_list = make_unique_by_event(event_list)
    #
    # This function removes all events that share the same alternative event
    # coordinates but differ in the flanking size. The longest of several
    # equal events is kept.

    rm_idx = []
    last_kept = 0
    for i in range(1, event_list.shape[0]):
        if i % 1000 == 0:
            print '.',
            if i % 10000 == 0:
                print '%i' % i

        old_coords = event_list[last_kept].get_inner_coords(trafo=True)
        curr_coords = event_list[i].get_inner_coords(trafo=True)

        if old_coords.shape[0] == curr_coords.shape[0] and sp.all(
                old_coords == curr_coords):

            ### assertion that we did everything right
            assert (event_list[last_kept].chr == event_list[i].chr)
            assert (event_list[last_kept].strand == event_list[i].strand)

            ### check, which event is longer -> keep longer event
            len1 = event_list[last_kept].get_len()
            len2 = event_list[i].get_len()

            if len1 > len2:
                keep_idx = last_kept
                not_keep_idx = i
            else:
                keep_idx = i
                not_keep_idx = last_kept

            ### check if we would lose strains
            idx = sp.where(~sp.in1d(event_list[not_keep_idx].strain,
                                    event_list[keep_idx].strain))[0]
            if idx.shape[0] > 0:
                event_list[keep_idx].strain = sp.r_[
                    event_list[keep_idx].strain,
                    event_list[not_keep_idx].strain[idx]]
                ### TODO !!!!!!!!!!!!! make sure that we keep different coordinates if the strains differ ...
                event_list[keep_idx].gene_name = sp.union1d(
                    event_list[keep_idx].gene_name,
                    event_list[not_keep_idx].gene_name)

            rm_idx.append(not_keep_idx)
            last_kept = keep_idx
        else:
            last_kept = i

    print 'events dropped: %i' % len(rm_idx)
    keep_idx = sp.where(~sp.in1d(sp.arange(event_list.shape[0]), rm_idx))[0]
    event_list = event_list[keep_idx]

    return event_list
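Note that the loop only ever compares event i to the most recently kept event, so it implicitly assumes the input is sorted such that events with identical inner coordinates are adjacent. A hypothetical pre-sort sketch (the byte-key trick is just one way to group identical coordinate vectors; it is not part of the original code):

import numpy as sp  # the sp.* calls above (in1d, where, r_, union1d, arange) are numpy functions

# hypothetical: order events so that equal inner-coordinate vectors end up adjacent
keys = [e.get_inner_coords(trafo=True).tobytes() for e in event_list]
event_list = event_list[sp.argsort(sp.array(keys, dtype=object))]
event_list = make_unique_by_event(event_list)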
Example #3
def reduce(PSI_l, Xl, coverage_threshold):
    """
    Computes set cover reduction to get the most relevant samples that define the class Xl.
    :param PSI_l: (Nl x 2) matrix containing both the scale and the shape of the weibull distribution
    :param Xl: (Nl x dimension_feature_vector) matrix containing the feature vectors of each instance of a class
    :param coverage_threshold: Probability above which we consider an instance not representative enough of its class
    :return: The indexes of the most representative samples of a class
    """
    #This matrix D is symmetric
    D = ppp_cosine_similarity(Xl, Xl)
    # Number of instances of the class
    Nl = np.shape(D)[0]

    S = []
    for i in range(Nl):
        Si = []
        for j in range(Nl):
            if (psi_i_dist(D[i, j], PSI_l[i, 0], PSI_l[i, 1]) >=
                    coverage_threshold):
                # Sample i is redundant with respect to j
                Si.append(j)
        S.append(Si)
    # Universe
    U = list(range(0, Nl))
    # Covered index
    C = []
    # Final indexes
    I = []

    #Set Cover Implementation
    S_orig = list(range(Nl))  # map positions in S back to original sample indices
    while (len(scipy.intersect1d(C, U)) != len(U)):
        # punct_ref is a counter to find the maximum in every iteration
        punct_ref = 0
        # ind represent the index that we will append to our index's list
        ind = 0
        index_s = 0
        for s in S:
            punct = 0
            relative_inclusion = scipy.isin(s, C)
            for eleme in relative_inclusion:
                if not eleme:  # count how many elements of s are not yet covered
                    punct += 1
            if punct > punct_ref:
                punct_ref = punct
                ind = index_s
            index_s += 1

        C = scipy.union1d(C, S[ind])
        I.append(S_orig[ind])
        S.pop(ind)        # pop by position (S.remove would match the first equal list)
        S_orig.pop(ind)
        if (len(S) == 0):
            break
    return I
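A toy invocation sketch. ppp_cosine_similarity and psi_i_dist are helpers from the same module (not shown here); this assumes they compute pairwise cosine similarity and a Weibull CDF, respectively, and that they are importable alongside reduce. The data below is fabricated:

import numpy as np

Nl, dim = 20, 8
rng = np.random.default_rng(1)
Xl = rng.normal(size=(Nl, dim))              # feature vectors of one class
PSI_l = np.column_stack([np.full(Nl, 1.0),   # Weibull scale per instance
                         np.full(Nl, 2.0)])  # Weibull shape per instance
representative_idx = reduce(PSI_l, Xl, coverage_threshold=0.5)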
Example #4
def trainBursts(train, aBinSize=0.05, maxTime=1.0, alpha=0.95):
    """
    Find bursts of high activity (high instantaneous firing rate) in a train of pulses with respect to a statistical threshold derived from alpha. Example:
    bursts= trainBursts(train, aBinSize=0.05, maxTime=1.0, alpha=0.95)
    """
    nSpikes=len(train)
    isi= sc.zeros(nSpikes)
    ifr= sc.zeros(nSpikes)
    dIFR= sc.zeros(nSpikes)
    isi[1:]= train[1:]-train[:-1]
    ifr[1:]= 1/isi[1:]
    # isi[0] is undefined (no preceding interval), so start the two-sided average at index 2
    dIFR[2:]= (((ifr[2:]-ifr[1:-1])/isi[2:]) + ((ifr[2:]-ifr[1:-1])/isi[1:-1]))/2.0
    ifrCDF,ifrCDFInverse= calcCDF(ifr,graph=0)
    #isiCDF,isiCDFInverse= calcCDF(isi,graph=0)
    # Find spikes during high activity
    ifrUpThresh= calcThresholdsFromCDF(ifrCDFInverse, (alpha,))
    ifrDnThresh= calcThresholdsFromCDF(ifrCDFInverse, (1-alpha,))
    #isiThresh= calcThresholdsFromCDF(isiCDFInverse, (alpha,))
    #rHighInds= sc.where( ifr>ifrThresh)[0]
    rHighInds= sc.where( ifr>ifrUpThresh)[0]
    lHighInds= rHighInds-1
    highInds= sc.union1d(rHighInds,lHighInds)
    highSpikeTimes= train[highInds]
    #lowSpikeTimes= train[lowInds]
    aa= sc.zeros(len(highInds))
    aa[1:]= sc.diff(highInds)
    #bb= sc.zeros(len(lowInds))
    #bb[1:]= sc.diff(lowInds)
    startInds=sc.where(aa!=1)[0]
    burstStarts= highSpikeTimes[startInds]
    burstEnds= highSpikeTimes[startInds-1]
    nBursts= len(burstStarts)
    burstStarts=burstStarts[:-1]
    burstEnds=burstEnds[1:]    
    pBurst = sc.float32(nBursts)/nSpikes
    burstDurs = burstEnds-burstStarts
    c,b= xcorr(train,train, aBinSize, maxTime, minTime=0)
    c[0]= 0.0  # drop the trivial zero-lag peak before converting to rate
    cHz = c/aBinSize
    bursts={"train": train, 
        "highInds":highInds, "highSpikeTimes":highSpikeTimes, 
        "burstStarts":burstStarts, "burstEnds":burstEnds, 
        "nBursts":nBursts, "nSpikes": nSpikes, "pBurst": pBurst, "burstDurs": burstDurs,
        "alpha":alpha,
        "ifr":ifr, "ifrCDF":ifrCDF, "ifrCDFInverse":ifrCDFInverse,"ifrThresh":ifrUpThresh,
        "isi":isi, #"isiCDF":isiCDF, "isiCDFInverse":isiCDFInverse,"isiThresh":isiThresh,
        "dIFR":dIFR, 
        "aCorrHz":cHz, "aCorrBins":b}
    return bursts
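Usage mirrors findBurstsInTrainIFR above, with two additions: dIFR is a two-sided average here, and an autocorrelogram is computed via the module's xcorr helper (assumed in scope, along with calcCDF and calcThresholdsFromCDF). Reusing the synthetic train from the earlier sketch:

bursts = trainBursts(train, aBinSize=0.05, maxTime=1.0, alpha=0.95)
print("burst fraction:", bursts["pBurst"])
print("bins:", bursts["aCorrBins"][:5], "rates (Hz):", bursts["aCorrHz"][:5])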
Example #5
def findDuplicateVectors(vec, tol=vTol, equivPM=False):
    """
    Find vectors in an array that are equivalent to within
    a specified tolerance

      USAGE:

          eqv, uid = findDuplicateVectors(vec, *tol)

      INPUT:

          1) vec is n x m, a double array of m horizontally concatenated
                           n-dimensional vectors.
         *2) tol is 1 x 1, a scalar tolerance.  If not specified, the default
                           tolerance is 1e-14.
         *3) set equivPM to True if vec and -vec are to be treated as equivalent

      OUTPUT:

          1) eqv is 1 x p, a list of p equivalence relationships.

          2) uid is 1 x q, a list of the column indices of unique vectors.

      NOTES:

          Each equivalence relationship is a 1 x q vector of indices that
          represent the locations of duplicate columns/entries in the array
          vec.  For example:

                | 1     2     2     2     1     2     7 |
          vec = |                                       |
                | 2     3     5     3     2     3     3 |

          eqv = [[1x2 double]    [1x3 double]], where

          eqv[0] = [0  4]
          eqv[1] = [1  3  5]
    """

    vlen = vec.shape[1]
    vlen0 = vlen
    orid = asarray(range(vlen), dtype="int")

    torid = orid.copy()
    tvec = vec.copy()

    eqv = []
    eqvTot = 0
    uid = 0

    ii = 1
    while vlen > 1 and ii < vlen0:
        dupl = tile(tvec[:, 0], (vlen, 1))

        if not equivPM:
            diff = abs(tvec - dupl.T).sum(0)
            match = abs(diff[1:]) <= tol  # logical to find duplicates
        else:
            diffn = abs(tvec - dupl.T).sum(0)
            matchn = abs(diffn[1:]) <= tol
            diffp = abs(tvec + dupl.T).sum(0)
            matchp = abs(diffp[1:]) <= tol
            match = matchn + matchp

        kick = hstack([True, match])  # pick self too

        if kick.sum() > 1:
            eqv += [torid[kick].tolist()]
            eqvTot = hstack([eqvTot, torid[kick]])
            uid = hstack([uid, torid[kick][0]])

        cmask = ones((vlen, ))
        cmask[kick] = 0
        cmask = cmask != 0

        tvec = tvec[:, cmask]

        torid = torid[cmask]

        vlen = tvec.shape[1]

        ii += 1

    if len(eqv) == 0:
        eqvTot = []
        uid = []
    else:
        eqvTot = eqvTot[1:].tolist()
        uid = uid[1:].tolist()

    # find all single-instance vectors
    singles = sort(setxor1d(eqvTot, range(vlen0)))

    # now construct list of unique vector column indices
    uid = int_(sort(union1d(uid, singles))).tolist()
    # make sure is a 1D list
    if not hasattr(uid, '__len__'):
        uid = [uid]

    return eqv, uid
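The docstring example, run end to end. This assumes the bare numpy names used in the function body (asarray, tile, hstack, ones, sort, setxor1d, union1d, int_) are in scope, e.g. via "from numpy import *", and passes tol explicitly since the vTol default is a module constant not shown here:

from numpy import array

vec = array([[1., 2., 2., 2., 1., 2., 7.],
             [2., 3., 5., 3., 2., 3., 3.]])
eqv, uid = findDuplicateVectors(vec, tol=1e-14)
print(eqv)  # [[0, 4], [1, 3, 5]]
print(uid)  # [0, 1, 2, 6] -- one representative per duplicate group plus singletons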
Example #6
assert len(deltas) == conf_idx.shape[0]
for i, c in enumerate(conf_idx):
    if i > 0 and i % 1000 == 0:
        sys.stdout.write('.')
        if i % 10000 == 0:
            sys.stdout.write('%i/%i\n' % (i, conf_idx.shape[0]))
        sys.stdout.flush()
    d = sp.array(deltas[i])
    #if (IN['event_pos'][c, 3] - IN['event_pos'][c, 2]) % 3 != 0:
    if sp.sum(d[1::2] - d[::2]) % 3 != 0:
        k2_idx.append(i)
k2_idx = sp.array(k2_idx, dtype='int')
print 'flagged %i of %i events that are out of frame' % (k2_idx.shape[0], conf_idx.shape[0])

### integrate k1 and k2 events
k_idx = sp.union1d(k1_idx, k2_idx)

conf_idx = conf_idx[k_idx]
#pos = pos[k_idx, :]
print 'retaining %i events' % (conf_idx.shape[0])

cPickle.dump(conf_idx, open(os.path.join(basedir, 'merge_graphs_%s_C%i.function_idx.cpickle' % (event_type, CONF)), 'w'), -1)

#print 'loading psi'
#psi = IN['psi'][:]
#print 'done'
#
#### remove all events that have nan as PSI in more than 10% of samples
#k_idx = []
#for i, c in enumerate(conf_idx):
#    if i > 0 and i % 1000 == 0:
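For reference, the in-frame test above in isolation: deltas holds alternating segment start/end coordinates, so d[1::2] - d[::2] are segment lengths, and an event is flagged as out of frame when their sum is not a multiple of 3. (sp stands for the scipy namespace used throughout these snippets; modern scipy dropped these numpy re-exports, so plain numpy behaves identically. The coordinates are made up.)

import numpy as sp

d = sp.array([100, 130, 200, 245])   # alternating segment starts/ends
lengths = d[1::2] - d[::2]           # -> [30, 45]
print(sp.sum(lengths) % 3 != 0)      # 75 is divisible by 3 -> in frame -> False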
Example #7
	if not os.path.exists(run_dir):
		os.makedirs(run_dir)

	#load data
	f = h5py.File(CFG['data_file'],'r')
	Y = f['LogNcountsQuartz'][:]
	tech_noise = f['LogVar_techQuartz_logfit'][:]
	genes_het_bool=f['genes_heterogen'][:]	 # indicator of heterogeneous genes
	geneID = f['gene_names_all'][:]			# gene names
	cellcyclegenes_filter = SP.unique(f['ccGO_gene_indices'][:].ravel() -1) # idx of cell cycle genes (GO)
	cellcyclegenes_filterCB600 = f['ccCBall_gene_indices'][:].ravel() -1		# idx of cell cycle genes (CycleBase)
   

	# filter cell cycle genes
	idx_cell_cycle = SP.union1d(cellcyclegenes_filter,cellcyclegenes_filterCB600)
	Ymean2 = Y.mean(0)**2>0
	idx_cell_cycle_noise_filtered = SP.intersect1d(idx_cell_cycle,SP.array(SP.where(Ymean2.ravel()>0)))
	Ycc = Y[:,idx_cell_cycle_noise_filtered]
	
	#Fit GPLVM to data 
	k = 1					 # number of latent factors
	file_name = CFG['panama_file']# name of the cache file
	recalc = True # recalculate X and Kconf
	sclvm = scLVM(Y)
	X,Kcc,varGPLVM = sclvm.fitGPLVM(idx=idx_cell_cycle_noise_filtered,k=1,out_dir='./cache',file_name=file_name,recalc=recalc)

	#3. load relevant dataset for analysis
	genes_het=SP.array(SP.where(f['genes_heterogen'][:].ravel()==1))

   # considers only heterogeneous genes
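The index bookkeeping above reduces to a union of two cell-cycle gene sets intersected with the expressed genes; a toy illustration (SP is the scipy namespace, and numpy provides the same union1d/intersect1d, so the fabricated indices below behave identically):

import numpy as SP

cc_go = SP.array([3, 7, 11])               # e.g. GO cell-cycle gene indices
cc_cb = SP.array([7, 20])                  # e.g. CycleBase gene indices
idx_cell_cycle = SP.union1d(cc_go, cc_cb)  # -> [ 3  7 11 20]
expressed = SP.array([0, 3, 7, 20, 33])    # genes with nonzero mean expression
print(SP.intersect1d(idx_cell_cycle, expressed))  # -> [ 3  7 20]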
Example #8
        print(i)
spike_samples_clean = pl.delete(spike_samples_clean, 0)
pl.save(os.path.join(memap_folder, 'spike_samples_clean.npy'), spike_samples_clean)

channels = np.empty(0)
for i in pl.arange(0, pl.size(spike_samples_clean)):
    data = np.array(data_probe_hp[:, spike_samples_clean[i]].tolist())
    channels = np.append(channels, np.argmax(data))
    if i%100==0:
        print(i)
channels_spikes_df = pd.DataFrame([(channels, spike_samples_clean)], columns=['Channels', 'Samples'])

spike_times_shaftA = channels_spikes_df.Samples[0][(channels_spikes_df.Channels[0] > 7) & (channels_spikes_df.Channels[0] < 16)]
spike_times_shaftB = channels_spikes_df.Samples[0][channels_spikes_df.Channels[0]>23]
spike_times_shaftD = channels_spikes_df.Samples[0][channels_spikes_df.Channels[0]<8]
spike_times_shaftC = sp.setxor1d(spike_samples_clean, sp.union1d(spike_times_shaftA, sp.union1d(spike_times_shaftB, spike_times_shaftD)))

pl.save(os.path.join(memap_folder, 'spike_times_shaftA.npy'), spike_times_shaftA)
pl.save(os.path.join(memap_folder, 'spike_times_shaftC.npy'), spike_times_shaftC)


# ----------Analysis---------------------
f_ecog = f_sampling/(int(f_sampling/f_subsample))
spike_times_shaftA_ecog = np.array(spike_times_shaftA * f_ecog / f_sampling, dtype='int')
spike_times_shaftC_ecog = np.array(spike_times_shaftC * f_ecog / f_sampling, dtype='int')
data_ecog_lp_ss_clean = np.delete(data_ecog_lp_ss, ecog_bad_channels, axis=0)


# Generate eMUA for each Shaft
time_around_spike = 2
time_points_around_spike = int(time_around_spike * f_sampling)
Example #9
        count_file_SV_maternal = os.path.join(out_SV_maternal,'chr%s_maternal' % chrom,result_file)
        count_file_SV_paternal = os.path.join(out_SV_paternal,'chr%s_paternal' % chrom,result_file)

        if ((not os.path.exists(count_file_GRCH37)) or
                (not os.path.exists(count_file_SNP_maternal)) or
                (not os.path.exists(count_file_SNP_paternal)) or
                (not os.path.exists(count_file_SV_maternal)) or
                (not os.path.exists(count_file_SV_paternal))):
            print "skip: %s" % element_id
            RV_file_exist.append([element_id,
                                  os.path.exists(count_file_GRCH37),
                                  os.path.exists(count_file_SNP_maternal),
                                  os.path.exists(count_file_SNP_paternal),
                                  os.path.exists(count_file_SV_maternal),
                                  os.path.exists(count_file_SV_paternal)])
            RV_file.append([element_id,
                            count_file_GRCH37,
                            count_file_SNP_maternal,
                            count_file_SNP_paternal,
                            count_file_SV_maternal,
                            count_file_SV_paternal])
            continue
        #1. load lists
        count_GRCH37 = cPickle.load(open(count_file_GRCH37,'rb'))
        count_SNP_maternal = cPickle.load(open(count_file_SNP_maternal,'rb'))
        count_SNP_paternal = cPickle.load(open(count_file_SNP_paternal,'rb'))
        count_SV_maternal = cPickle.load(open(count_file_SV_maternal,'rb'))
        count_SV_paternal = cPickle.load(open(count_file_SV_paternal,'rb'))
        
        count_SNP = SP.union1d(count_SNP_maternal,count_SNP_paternal)
        count_SV = SP.union1d(count_SV_maternal,count_SV_paternal)
        count_intersect_GRCH37_SNP  = SP.intersect1d(count_SNP,count_GRCH37)
        count_intersect_GRCH37_SV  = SP.intersect1d(count_SV,count_GRCH37)
        count_intersect_SNP_SV  = SP.intersect1d(count_SNP,count_SV)

        count_ex_GRCH37_SNP = SP.setdiff1d(count_GRCH37,count_SNP)
        count_ex_GRCH37_SV = SP.setdiff1d(count_GRCH37,count_SV)
        count_ex_SNP_GRCH37 = SP.setdiff1d(count_SNP,count_GRCH37)
        count_ex_SV_GRCH37 = SP.setdiff1d(count_SV,count_GRCH37)
        count_ex_SNP_SV = SP.setdiff1d(count_SNP,count_SV)
        count_ex_SV_SNP = SP.setdiff1d(count_SV,count_SNP)
    
        #store a couple of things
        rv = {'element_id': element_id,
              'count_ref': len(count_GRCH37),
              'count_SNP_maternal': len(count_SNP_maternal),
              'count_SNP_paternal': len(count_SNP_paternal),
              'count_SV_maternal': len(count_SV_maternal),
              'count_SV_paternal': len(count_SV_paternal),
              'count_SNP': len(count_SNP),
              'count_SV': len(count_SV),
              'count_intersect_GRCH37_SNP': len(count_intersect_GRCH37_SNP),
              'count_intersect_GRCH37_SV': len(count_intersect_GRCH37_SV),
              'count_intersect_SNP_SV': len(count_intersect_SNP_SV),
              'count_ex_GRCH37_SNP': len(count_ex_GRCH37_SNP),
              'count_ex_GRCH37_SV': len(count_ex_GRCH37_SV),
              'count_ex_SNP_GRCH37': len(count_ex_SNP_GRCH37),
              'count_ex_SV_GRCH37': len(count_ex_SV_GRCH37),
              'count_ex_SNP_SV': len(count_ex_SNP_SV),
              'count_ex_SV_SNP': len(count_ex_SV_SNP)}
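A quick sanity check of the set accounting above on fabricated read sets: the exclusive counts and the intersection partition the reference set, which is why storing all of them is redundant but convenient.

import numpy as SP  # stands in for the scipy namespace used in the snippet

count_GRCH37 = SP.array([1, 2, 3, 4])
count_SNP = SP.union1d(SP.array([2, 3]), SP.array([3, 9]))  # maternal | paternal
inter = SP.intersect1d(count_SNP, count_GRCH37)             # -> [2 3]
ex_ref = SP.setdiff1d(count_GRCH37, count_SNP)              # -> [1 4]
assert len(inter) + len(ex_ref) == len(count_GRCH37)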
Example #10
def load_data(CFG, is_Ens=True, gene_set='GOCB', het_only = True, het_onlyCB=True, pairs=False, filter_median = True, combine=False, filter_expressed = 0):
	f = h5py.File(CFG['train_file'],'r')
	Y = f['LogNcountsMmus'][:]
	labels = f['labels'][:].ravel()
	
	futil = h5py.File(CFG['util_file'],'r')
	Y_util = futil['LogNcountsQuartz'][:]
	
	ftst = h5py.File(CFG['test_file'],'r')
	if is_Ens ==True:
		genes = f['EnsIds'][:]
		genes_util = futil['gene_names_all'][:]
	else:
		genes = SP.char.lower(f['sym_names'][:])
		genes_util = SP.char.lower(futil['sym_namesQ'][:])

	#test file
	labels_util = futil['phase_vecS'][:]*2+futil['phase_vecG2M'][:]*3+futil['phase_vecG1'][:]
	if CFG['util_file']==CFG['test_file']:
		genes_tst = genes_util 
		YT = ftst['LogNcountsQuartz'][:]
		labels_tst = ftst['phase_vecS'][:]*2+ftst['phase_vecG2M'][:]*3+ftst['phase_vecG1'][:]
	elif is_Ens == False:
		ftst = h5py.File(CFG['test_file'],'r')
		YT = ftst['counts'][:]
		genes_tst = SP.char.lower(ftst['sym_names'][:])
		#genes_tst = ftst['ensIds'][:]
		#labels_tst = SP.array([1,1,1,1,1])#ftst['labels'][:].ravel() 
		labels_tst = ftst['labels'][:].ravel()
	elif is_Ens == True:
		ftst = h5py.File(CFG['test_file'],'r')
		YT = ftst['counts'][:]
		#genes_tst = ftst['sym_names'][:]
		genes_tst = ftst['ensIds'][:]
		#labels_tst = SP.array([1,1,1,1,1])#ftst['labels'][:].ravel() 
		labels_tst = ftst['labels'][:].ravel() 
	
	if 'class_labels' in ftst.keys():
		class_labels = ftst['class_labels'][:]
	else:
		class_labels = [i.astype('str') for i in labels_tst]
		class_labels = SP.sort(SP.unique(class_labels))
	heterogen_util = genes_util[SP.intersect1d(SP.where(Y_util.mean(0)>0)[0],SP.where(futil['genes_heterogen'][:]==1)[0])]
	heterogen_train = genes[SP.intersect1d(SP.where(Y.mean(0)>0)[0],SP.where(f['genes_heterogen'][:]==1)[0])]
	

	cellcyclegenes_GO = genes[SP.unique(f['cellcyclegenes_filter'][:].ravel() -1)] # idx of cell cycle genes
	cellcyclegenes_CB = genes[f['ccCBall_gene_indices'][:].ravel() -1]		# idxof cell cycle genes ...
	


	if SP.any(gene_set=='GOCB'):	
		cc_ens = SP.union1d(cellcyclegenes_GO,cellcyclegenes_CB)
	elif SP.any(gene_set=='GO'):
		cc_ens = cellcyclegenes_GO 
	elif SP.any(gene_set=='CB'):
		cc_ens = cellcyclegenes_CB 
	elif SP.any(gene_set=='all'):
		cc_ens = genes 
	else:
		#assert(gene_set in CFG.keys()), str(gene_set+' does not exist. Chose different gene set.')
		cc_ens = gene_set 

	
	if het_only==True:
		cc_ens = SP.intersect1d(cc_ens, heterogen_train)
		if pairs==True:
			Y = Y[:,SP.where(f['genes_heterogen'][:]==1)[0]]
			genes = genes[SP.where(f['genes_heterogen'][:]==1)[0]]
	if het_onlyCB==True:
		cc_ens = SP.intersect1d(cc_ens, heterogen_util)
	
	#filter_expressed = .2
	lod = 0
	if filter_expressed>0: 
		medY = SP.sum(Y>lod,0)*1.0
		idx_filter = (medY/SP.float_(Y.shape[0]))>filter_expressed
		Y = Y[:,idx_filter]
		genes = genes[idx_filter]
		
		#medY_tst = SP.sum(Y_tst>lod,0)
		#Y_tst = Y_tst[:,medY_tst>filter_expressed]
		#genes_tst = genes_tst[medY_tst>filter_expressed]		
		
		medY_util = SP.sum(Y_util>lod,0)
		idx_filter = (medY_util/SP.float_(Y_util.shape[0]))>filter_expressed
		Y_util = Y_util[:,idx_filter]
		genes_util = genes_util[idx_filter]		
	
	cc_ens = SP.intersect1d(cc_ens, genes)
	cc_ens = SP.intersect1d(cc_ens, genes_tst)
	cc_ens = SP.intersect1d(cc_ens, genes_util)
		
	if combine==True:
		genes = list(genes)
		genes_util = list(genes_util)
		genes_intersect = SP.intersect1d(genes,genes_util)
		cidx_tr = [ genes.index(x) for x in genes_intersect ]
		cidx_util = [genes_util.index(x) for x in genes_intersect]	
		genes = SP.array(genes)[cidx_tr]
		genes_util = SP.array(genes_util)[cidx_util]
		Y = SP.vstack([Y[:,cidx_tr],Y_util[:,cidx_util]])
		genes = genes_intersect
		labels = SP.hstack([labels, labels_util])				


	Y_tst = YT
	cc_data = {}
	cc_data['cc_ens'] = cc_ens
	cc_data['labels_tst'] = labels_tst	
	cc_data['labels'] = labels
	cc_data['genes_tst'] = genes_tst 
	cc_data['genes'] = genes 
	cc_data['Y'] = Y 
	cc_data['Y_test'] = Y_tst 
	cc_data['class_labels'] = class_labels 
	return cc_data
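A hypothetical invocation; the CFG keys mirror the ones the function reads ('train_file', 'util_file', 'test_file'), and the file names here are placeholders for real HDF5 datasets:

CFG = {'train_file': 'train.h5', 'util_file': 'util.h5', 'test_file': 'test.h5'}
cc_data = load_data(CFG, is_Ens=True, gene_set='GOCB', het_only=True, filter_expressed=0.2)
Y, genes, labels = cc_data['Y'], cc_data['genes'], cc_data['labels']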
Example #11
	KG2M = SP.zeros((Y.shape[0],Y.shape[0]))
	for iph in range(Y.shape[0]):
		for jph in range(Y.shape[0]):
			if SP.bitwise_and(phase_vec[iph]==phase_vec[jph], phase_vec[iph]==3):
				KG2M[iph,jph]=1

	#intra-phase variations in cell size
	sfCellSize = SP.log10(f['ratioEndo'][:])
	sfCellSize -= sfCellSize.mean()
	sfCellSize = sfCellSize.reshape(1,sfCellSize.shape[0])
	Ksize = SP.dot(sfCellSize.transpose(), sfCellSize)
	Ksize /= Ksize.diagonal().mean() 

	# filter cell cycle genes
	idx_cell_cycle = SP.union1d(cellcyclegenes_filter,cellcyclegenes_filterCB600)
	Ymean2 = Y.mean(0)**2>0
	idx_cell_cycle_noise_filtered = SP.intersect1d(idx_cell_cycle,SP.array(SP.where(Ymean2.ravel()>0)))
	Ycc = Y[:,idx_cell_cycle_noise_filtered]
	
	#Fit GPLVM to data 
	k = 1					 # number of latent factors
	file_name = CFG['panama_file']# name of the cache file
	recalc = True # recalculate X and Kconf
	sclvm = scLVM(Y)
	pdb.set_trace()
	X,Kcc,varGPLVM = sclvm.fitGPLVM(idx=idx_cell_cycle_noise_filtered,k=1,out_dir='./cache',file_name=file_name,recalc=recalc)

	#3. load relevant dataset for analysis
	genes_het=SP.array(SP.where(f['genes_heterogen'][:].ravel()==1))
	tech_noise=f['LogVar_techMmus'][:]
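The cell-size covariance built above, in isolation: an outer product of mean-centred log size factors, normalised so the diagonal has unit mean. The toy size ratios below are made up; in the snippet they come from f['ratioEndo'].

import numpy as SP

sfCellSize = SP.log10(SP.array([0.8, 1.0, 1.3, 0.9]))
sfCellSize -= sfCellSize.mean()
sfCellSize = sfCellSize.reshape(1, -1)
Ksize = SP.dot(sfCellSize.T, sfCellSize)   # rank-one covariance over cells
Ksize /= Ksize.diagonal().mean()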
Example #12
 def _setup_for_IP(self):
     r"""
     Determines cluster labelling and condition for completion
     """
     self._clock_start = misc.tic()
     self._logger.debug( '+='*25)
     self._logger.debug( 'INITIAL SETUP (STEP 1)')
     # if empty, add Pc_entry to throat_properties
     tdia = self._net['throat.'+self._throat_diameter_name]
     # calculate Pc_entry from diameters
     try:
         self['throat.inv_Pc'] = self._phase['throat.'+self._capillary_pressure_name]
     except:
         self._logger.error('Capillary pressure not assigned to '+self._phase.name)
     if self._timing:
         # calculate Volume_coef for each throat
         self._Tvol_coef = tdia*tdia*tdia*np.pi/12/self['throat.inv_Pc']
     # Creating an array for invaded Pores(Np long, 0 for uninvaded, cluster number for invaded)
     self['pore.cluster_final'] = 0
     self['pore.cluster_original'] = 0
     # Creating an array for invaded throats(Nt long, 0 for uninvaded, cluster number for invaded)
     self['throat.cluster_final'] = 0
     # Creating arrays for tracking invaded Pores(Np long, 0 for uninvaded, sequence for invaded)
     self['pore.inv_seq'] =0
     if self._timing:
         # Creating arrays for tracking invaded Pores(Np long, -1 for uninvaded, simulation time for invaded)
         self['pore.inv_time'] = -1.
     # Creating arrays for tracking invaded throats(Nt long, 0 for uninvaded, sequence for invaded)
     self['throat.inv_seq'] = 0
     if self._timing:
         # Creating arrays for tracking invaded throats(Nt long, -1 for uninvaded, simulation time for invaded)
         self['throat.inv_time'] = -1.
     # Iterator variables for sequences and cluster numbers
     clusterNumber = 1
     # Determine how many clusters there are
     self._clusterCount = 0
     for i in self._inlets:
         self._clusterCount += 1
     # Storage for cluster information
     self._cluster_data = {}
     if self._timing:
         self._cluster_data['flow_rate'] = np.ones((self._clusterCount),dtype=float)*self._inlet_flow
         self._cluster_data['haines_pressure'] = np.zeros((self._clusterCount),dtype=float)
         self._cluster_data['haines_time'] = np.zeros((self._clusterCount),dtype=float)
         self._cluster_data['vol_coef'] = np.zeros((self._clusterCount),dtype=float)
         self._cluster_data['cap_volume'] = np.zeros((self._clusterCount),dtype=float)
         self._cluster_data['pore_volume'] = np.zeros((self._clusterCount),dtype=float)
         self._cluster_data['throat_volume'] = np.zeros((self._clusterCount),dtype=float)
     self._cluster_data['haines_throat'] = np.zeros((self._clusterCount),dtype=int)
     self._cluster_data['active'] = np.ones((self._clusterCount),dtype=int)
     self._cluster_data['transform'] = np.zeros((self._clusterCount),dtype=int)
     for i in range(self._clusterCount):
         self._cluster_data['transform'][i] = i+1
     # Creating an empty list to store the list of potential throats for invasion in each cluster.
     # its length is equal to the maximum number of possible clusters.
     self._tlists = [[] for i in self._inlets]
     # Creating a list for each cluster to store both potential throat and corresponding throat value
     self._tpoints = [[] for i in self._inlets]
     # Initializing invasion percolation for each possible cluster
     self._pore_volumes = self._net['pore.'+self._pore_volume_name]
     self._throat_volumes = self._net['throat.'+self._throat_volume_name]
     for pores in self._inlets:
         if sp.shape(pores) == ():
             pores = [pores]
         # Label all invaded pores with their cluster
         self['pore.cluster_original'][pores] = clusterNumber
         # Label all inlet pores as invaded
         self['pore.inv_seq'][pores] = self._tseq
         if self._timing:
             self['pore.inv_time'][pores] = self._sim_time
         # Find all throats that border invaded pores
         interface_throat_numbers = self._net.find_neighbor_throats(pores)
         self.cluster_update(clusterNumber,pores,[],interface_throat_numbers)
         clusterNumber += 1
     if self._timing:
         self._logger.debug( 'pore volumes')
         self._logger.debug(self._cluster_data['pore_volume'])
         self._logger.debug( 'cap volumes')
         self._logger.debug( self._cluster_data['cap_volume'])
     self._logger.debug( 'haines_throats')
     self._logger.debug( self._cluster_data['haines_throat'])
     self._tseq += 1
     self._pseq += 1
     self._current_cluster = 0
     # Calculate the distance between the inlet and outlet pores
     self._outlet_position = np.average(self._net.get_data(prop='coords',pores='all')[self._outlets],0)
     if any([sp.shape(i) > () for i in self._inlets]): 
         inlets = []
         for i in self._inlets:
             inlets = sp.union1d(inlets,i)
         inlets = sp.array(inlets,int)
     else:
         inlets = self._inlets
     inlet_position = np.average(self._net.get_data(prop='coords',pores='all')[inlets],0)
     dist_sqrd = (self._outlet_position-inlet_position)*(self._outlet_position-inlet_position)
     self._initial_distance = np.sqrt(dist_sqrd[0]+dist_sqrd[1]+dist_sqrd[2])
     self._logger.debug( 'initial distance')
     self._logger.debug( self._initial_distance)
     self._current_distance = self._initial_distance
     self._percent_complete = np.round((self._initial_distance-self._current_distance)/self._initial_distance*100, decimals = 1)
     self._logger.info( 'percent complete')
     self._logger.info( self._percent_complete)
     self._rough_complete = 0
     print('     IP algorithm at',int(self._rough_complete),'% completion at',np.round(misc.toc(quiet=True)),'seconds')
     self._logger.debug( '+='*25)
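The inlet-flattening idiom near the end of the method, in isolation: union1d folds a ragged list of per-cluster inlet pore arrays into a single sorted, de-duplicated index array. The pore indices below are fabricated:

import numpy as sp  # sp.union1d / sp.array as used in the method

_inlets = [[0, 3], [3, 7, 9], [12]]
inlets = []
for i in _inlets:
    inlets = sp.union1d(inlets, i)
inlets = sp.array(inlets, int)   # -> [ 0  3  7  9 12]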
Example #13
        UniqGenes            = []
        TraitSetAtAlpha      = []
        for i in xrange(len(Traits)):
            GeneSetAtAlpha      = DataDict[Traits[i]]['GeneSetAtAlpha_'+str(Alpha)]
            NTotalGenesOfAlpha += len(GeneSetAtAlpha)
            UniqGenes.extend(GeneSetAtAlpha)
            if(len(GeneSetAtAlpha)>0):
                TraitSetAtAlpha.append(Traits[i])
        TraitSetAtAlpha  = scipy.array(TraitSetAtAlpha)
        GWIntersection   = scipy.intersect1d(ar1=TraitSetAtAlpha,
                                             ar2=GWSignTraits,
                                             assume_unique=False)
        GWMWIntersection = scipy.intersect1d(ar1=TraitSetAtAlpha,
                                             ar2=GWMWSignTraits,
                                             assume_unique=False)
        GWUnion          = scipy.union1d(ar1=TraitSetAtAlpha,
                                         ar2=GWSignTraits)
        GWMWUnion        = scipy.union1d(ar1=TraitSetAtAlpha,
                                         ar2=GWMWSignTraits)
        fw.write(str(Alpha)+'\t'+\
                 str(NTotalGenesOfAlpha)+'\t'+\
                 str(len(scipy.unique(scipy.array(UniqGenes))))+'\t'+\
                 str(len(TraitSetAtAlpha))+'\t'+\
                 str(len(GWSignTraits))+'\t'+\
                 str(len(GWIntersection))+'\t'+\
                 str(float(len(GWIntersection))/float(len(GWUnion)))+'\t'+\
                 str(len(TraitSetAtAlpha))+'\t'+\
                 str(len(GWMWSignTraits))+'\t'+\
                 str(len(GWMWIntersection))+'\t'+\
                 str(float(len(GWMWIntersection))/float(len(GWMWUnion)))+'\n')
    fw.close()
Example #14
 def _setup_for_IP(self):
     r"""
     Determines cluster labelling and condition for completion
     """
     self._clock_start = misc.tic()
     logger.debug( '+='*25)
     logger.debug( 'INITIAL SETUP (STEP 1)')
     # if empty, add Pc_entry to throat_properties
     tdia = self._net['throat.'+self._throat_diameter_name]
     # calculate Pc_entry from diameters
     try:
         self['throat.inv_Pc'] = self._phase['throat.'+self._capillary_pressure_name]
     except:
         logger.error('Capillary pressure not assigned to invading phase '+self._phase.name
             +', check for capillary pressure in defending phase '+self._phase_def.name +' instead')
         try:
             self['throat.inv_Pc'] = self._phase_def['throat.'+self._capillary_pressure_name]
             self._phase['throat.'+self._capillary_pressure_name] = self._phase_def['throat.'+self._capillary_pressure_name]
         except:
             logger.error('Capillary pressure neither assigned to defending phase '+self._phase_def.name
                 +' nor to invading phase '+self._phase.name)
             pass
     if self._timing:
         # calculate Volume_coef for each throat
         self._Tvol_coef = tdia*tdia*tdia*np.pi/12/self['throat.inv_Pc']
     # Creating an array for invaded Pores(Np long, 0 for uninvaded, cluster number for invaded)
     self['pore.cluster_final'] = 0
     self['pore.cluster_original'] = 0
     # Creating an array for invaded throats(Nt long, 0 for uninvaded, cluster number for invaded)
     self['throat.cluster_final'] = 0
     # Creating arrays for tracking invaded Pores(Np long, 0 for uninvaded, sequence for invaded)
     self['pore.inv_seq'] =0
     # Creating arrays for tracking invaded Pores(Np long, 0 for uninvaded, pressure for invaded)
     self['pore.inv_pres'] =0
     if self._timing:
         # Creating arrays for tracking invaded Pores(Np long, -1 for uninvaded, simulation time for invaded)
         self['pore.inv_time'] = -1.
     # Creating arrays for tracking invaded throats(Nt long, 0 for uninvaded, sequence for invaded)
     self['throat.inv_seq'] = 0
     # Creating arrays for tracking invaded throats(Nt long, 0 for uninvaded, pressure for invaded)
     self['throat.inv_pres'] = 0
     if self._timing:
         # Creating arrays for tracking invaded throats(Nt long, -1 for uninvaded, simulation time for invaded)
         self['throat.inv_time'] = -1.
     # Iterator variables for sequences and cluster numbers
     clusterNumber = 1
     # Determine how many clusters there are
     self._clusterCount = 0
     for i in self._inlets:
         self._clusterCount += 1
     # Storage for cluster information
     self._cluster_data = {}
     if self._timing:
         self._cluster_data['flow_rate'] = np.ones((self._clusterCount),dtype=float)*self._inlet_flow
         self._cluster_data['haines_pressure'] = np.zeros((self._clusterCount),dtype=float)
         self._cluster_data['haines_time'] = np.zeros((self._clusterCount),dtype=float)
         self._cluster_data['vol_coef'] = np.zeros((self._clusterCount),dtype=float)
         self._cluster_data['cap_volume'] = np.zeros((self._clusterCount),dtype=float)
         self._cluster_data['pore_volume'] = np.zeros((self._clusterCount),dtype=float)
         self._cluster_data['throat_volume'] = np.zeros((self._clusterCount),dtype=float)
     self._cluster_data['haines_throat'] = np.zeros((self._clusterCount),dtype=int)
     self._cluster_data['active'] = np.ones((self._clusterCount),dtype=int)
     self._cluster_data['transform'] = np.zeros((self._clusterCount),dtype=int)
     for i in range(self._clusterCount):
         self._cluster_data['transform'][i] = i+1
     # Creating an empty list to store the list of potential throats for invasion in each cluster.
     # its length is equal to the maximum number of possible clusters.
     self._tlists = [[] for i in self._inlets]
     # Creating a list for each cluster to store both potential throat and corresponding throat value
     self._tpoints = [[] for i in self._inlets]
     # Initializing invasion percolation for each possible cluster
     self._pore_volumes = self._net['pore.'+self._pore_volume_name]
     self._throat_volumes = self._net['throat.'+self._throat_volume_name]
     for pores in self._inlets:
         if sp.shape(pores) == ():
             pores = [pores]
         # Label all invaded pores with their cluster
         self['pore.cluster_original'][pores] = clusterNumber
         # Label all inlet pores as invaded
         self['pore.inv_seq'][pores] = self._tseq
         self['pore.inv_pres'][pores] = 0
         if self._timing:
             self['pore.inv_time'][pores] = self._sim_time
         # Find all throats that border invaded pores
         interface_throat_numbers = self._net.find_neighbor_throats(pores)
         self.cluster_update(clusterNumber,pores,[],interface_throat_numbers)
         clusterNumber += 1
     if self._timing:
         logger.debug( 'pore volumes')
         logger.debug(self._cluster_data['pore_volume'])
         logger.debug( 'cap volumes')
         logger.debug( self._cluster_data['cap_volume'])
         pass
     logger.debug( 'haines_throats')
     logger.debug( self._cluster_data['haines_throat'])
     self._tseq += 1
     self._pseq += 1
     self._current_cluster = 0
     # Calculate the distance between the inlet and outlet pores
     self._outlet_position = np.average(self._net['pore.coords'][self._outlets],0)
     if any([sp.shape(i) > () for i in self._inlets]):
         inlets = []
         for i in self._inlets:
             inlets = sp.union1d(inlets,i)
         inlets = sp.array(inlets,int)
     else:
         inlets = self._inlets
     inlet_position = np.average(self._net['pore.coords'][inlets],0)
     dist_sqrd = (self._outlet_position-inlet_position)*(self._outlet_position-inlet_position)
     self._initial_distance = np.sqrt(dist_sqrd[0]+dist_sqrd[1]+dist_sqrd[2])
     logger.debug( 'initial distance')
     logger.debug( self._initial_distance)
     self._current_distance = self._initial_distance
     self._percent_complete = np.round((self._initial_distance-self._current_distance)/self._initial_distance*100, decimals = 1)
     logger.info( 'percent complete')
     logger.info( self._percent_complete)
     self._rough_complete = 0
     print('     IP algorithm at',int(self._rough_complete),'% completion at',np.round(misc.toc(quiet=True)),'seconds')
     logger.debug( '+='*25)