Esempio n. 1
0
def balanceLabels(D,L):
    """
    Subsample the data set so that every class keeps the same number
    of samples (the size of the smallest class).
    @param D: The input data matrix, samples in rows
    @param L: The label vector, one entry per row of D
    @return: (D2,L2), where D2 holds an equal number of randomly
    chosen rows from each class and L2 holds the matching labels.
    Rows are grouped by class, in the order given by sp.unique(L).
    """
    # Row indexes belonging to each distinct label.
    groups = [sp.flatnonzero(L==label) for label in sp.unique(L)]
    keep = min([len(members) for members in groups])

    # Shuffle every group, then keep only the first `keep` members.
    chosen = [sp.random.permutation(members)[:keep] for members in groups]
    rows = sp.hstack(chosen)

    L2 = L[rows]
    D2 = D[rows,:]
    return (D2,L2)
def _select_rows(idxs, D, L, Time, Ids):
    '''
    Applies one row selection to the data matrix and to each of the
    per-row vectors, so that they stay aligned.
    @param idxs: sequence of integer row indexes, in the desired output order
    @param D: 2-D data matrix, samples in rows
    @param L: 1-D label vector
    @param Time: 1-D time index vector
    @param Ids: 1-D subject id vector
    @return: tuple (D,L,Time,Ids) restricted to the given rows
    '''
    # An explicit integer dtype keeps an empty selection valid as an index.
    rows = sp.asarray(idxs, dtype=int)
    # D is 2-D, the other three were flattened to 1-D by the caller and
    # therefore must be indexed with a single subscript.
    return (D[rows,:], L[rows], Time[rows], Ids[rows])


def load_flu_mat(matfile="Fluz_dat.mat", times=[14,15,16], id_filter=None):
    '''
    Loads the Duke flu data, only those rows where the response
    is for the given time indexes.
    @param matfile: A matlab file that contains duke influenza data,
    but converted to a format easily read by python/scipy. The .mat file
    should contain four matrices (not cell arrays), D, L, Time, ID,
    where D is the full data, L is the corresponding labels, Time is the time
    index of the data, and ID is the subject id for each row of data.
    @param times: A list of time indexes that is used to filter the
    data. Only those rows corresponding to these time indexes will be
    included, grouped in the order the times are listed. Specify None to
    include all time steps.
    @param id_filter: A list of ids to FILTER (remove) from the data.
    Specify None to include all subjects. When given, the remaining rows
    are grouped by subject id in ascending id order.
    @return: tuple (D,L,Time,ID) as numpy arrays with the filtered data.
    L, Time and ID are 1-D.
    @note: For Duke Fluz data, use times=[14,15,16] and id_filter=None. For
    Duke H1N1 data, use times=[14,15,16] and id_filter = [1,3,5,10,18]
    '''
    # Single-argument call form prints identically on Python 2 and 3.
    print("Loading data from %s in directory: %s"%(matfile, ifr.DUKE_PYDAT_DIR))
    fn = os.path.join(ifr.DUKE_PYDAT_DIR, matfile)
    tmp = spio.loadmat(fn)
    D = tmp['D']  #full data
    L = tmp['L'].flatten()  #all labels
    Time = tmp['Time'].flatten()  #indicates the time index for each row
    Ids = tmp['ID'].flatten() #subject id for each row

    if times is not None:
        idxs = []
        for t in times:
            idxs.extend(sp.flatnonzero(Time==t))
        (D,L,Time,Ids) = _select_rows(idxs, D, L, Time, Ids)

    if id_filter is not None:
        idxs = []
        for i in sp.unique(Ids):
            if i not in id_filter:
                idxs.extend(sp.flatnonzero(Ids==i))
        (D,L,Time,Ids) = _select_rows(idxs, D, L, Time, Ids)

    return (D,L,Time,Ids)
Esempio n. 3
0
 def share_slices(counts):
     """Split the positions of `counts` into `ncuts` consecutive groups.

     The running total of `counts` is binned against `ncuts` equal-width
     intervals spanning [0, total + 1]; the function returns one index
     array per interval, listing the positions whose running total falls
     in it. `ncuts` is taken from the enclosing scope.
     """
     running = scipy.cumsum(counts)
     edges = scipy.linspace(0, running[-1] + 1, ncuts + 1)
     # digitize is 1-based for values inside the edges; shift to 0-based.
     bin_of = scipy.digitize(running, edges) - 1
     assert ((bin_of >= 0) & (bin_of < ncuts)).all()
     slices = []
     for icut in range(ncuts):
         slices.append(scipy.flatnonzero(bin_of == icut))
     return slices
Esempio n. 4
0
def rand_vect_in_sphere(dim, n):
    """Return a (dim, n) array whose columns lie inside the unit sphere.

    Columns are drawn uniformly from the cube [-1, 1)^dim; every column
    whose Euclidean norm exceeds 1 is redrawn until none remain.
    """
    points = zeros((dim, n))
    pending = array(range(n))  # columns that still need a valid draw
    while len(pending) != 0:
        points[:, pending] = 2.0 * rand(dim, len(pending)) - 1.0
        norms = sqrt(sum(points[:, pending] ** 2.0, 0))
        # Keep only the columns that landed outside the sphere.
        pending = pending[flatnonzero(norms > 1.0)]
    return points
Esempio n. 5
0
def generate_ensemble(n, temp, F, mu, mol_I):
    """
    Draw an ensemble of n states by rejection sampling.

    The angular velocity is chosen by applying the Metropolis method.
    For the initial distribution we choose a vector uniformly distributed
    inside of a sphere of radius w_cf = A * sqrt( temp * kb_au / mean(mol_I) )
    where A is a cutoff factor, chosen so that the probability of
    encountering a vector longer than w_cf is negligible.

    Each candidate state is one column of 7 numbers: rows 0-3 come from
    rand_unit_vect(4, .) (presumably the orientation -- confirm against
    rand_unit_vect) and rows 4-6 are the angular velocity. A candidate is
    redrawn whenever a uniform random number exceeds
    exp(-ar.total_energy(state, F, mu, mol_I) / kT).

    n     -- number of states to generate
    temp  -- temperature; multiplied by the module-level kb_au to give kT
    F, mu, mol_I -- passed through unchanged to ar.total_energy; mol_I is
             also averaged to set the velocity cutoff

    Returns a (7, n) array with one accepted state per column.
    """
    kT = kb_au * temp
    w_cf = A * sqrt(kT / mean(mol_I))

    qs = zeros((7, n))
    rep = array(range(n))  # columns that still need an accepted draw
    while len(rep) > 0:
        qs[0:4, rep] = rand_unit_vect(4, len(rep))
        qs[4:7, rep] = w_cf * rand_vect_in_sphere(3, len(rep))
        energy = ar.total_energy(qs[:, rep], F, mu, mol_I)
        # Rejected columns stay in `rep` and are redrawn on the next pass.
        rep = rep[flatnonzero(rand(len(rep)) > exp(-energy / kT))]
    return qs
Esempio n. 6
0
def random_partition_idxs(L, frac=0.25, proportional_labels=True):
    '''
    Produces index lists for a random train/test split, either over the
    whole data set (ShuffleSplit) or separately within each class so the
    label proportions are kept (Stratified Shuffle).
    @param L: The label vector as a numpy array or list. Its length gives
    the number of samples, and its values define the classes when
    proportional_labels is True.
    @param frac: The fraction (0..1) of samples to place in the test
    partition; the count is truncated to an integer.
    @param proportional_labels: If true, frac is applied to each label
    separately. If false, frac is applied to the entire data set, ignoring
    labels. Only with large data sets having approximately equal labels
    should you NOT do proportional partitioning.
    @return: (train_idxs, test_idxs, p), both index lists sorted ascending,
    where p is the random permutation, or the list of permutations (one per
    class, in sorted label order) if proportional_labels is True.
    '''
    if not proportional_labels:
        total = len(L)
        shuffled = sp.random.permutation(range(total))
        cut = int(frac*total)
        held_out = sorted(shuffled[:cut])
        remaining = sorted(shuffled[cut:])
        return (remaining, held_out, shuffled)

    perms = []
    held_out = []
    for label in sp.unique(L):  #unique labels come back sorted
        members = sp.flatnonzero(L==label)  #positions in L carrying this label
        order = sp.random.permutation(range(len(members)))
        n_test = int(frac*len(members))
        held_out.extend(members[order[:n_test]])
        perms.append(order)
    held_out.sort()
    # Everything that was not held out goes to the training partition.
    remaining = sorted(set(range(len(L))) - set(held_out))
    return (remaining, held_out, perms)
Esempio n. 7
0
def getMinClassSize(L):
    '''
    Returns the number of samples in the least populated class.
    @param L: The label vector
    @return: the smallest per-label count, or sp.inf when L has no labels
    '''
    sizes = [len(sp.flatnonzero(L==label)) for label in sp.unique(L)]
    if not sizes:
        return sp.inf
    return min(sizes)
Esempio n. 8
0
   
    print "I = ", mol_I

    print "kinetic energy = (K)", 0.5 * dot(w0.T, mol_I*w0) / kB_au
    print "pF / kT = ", sqrt(sum(mu**2.0)) * F / 0.5 / dot(w0.T, mol_I*w0)  
    print "time at final time step is (ns) = ", qt[-1,0] / 1000.

    print "now computing asym_rotor_muz: "
    muzcomp = asym_rotor_muz(t0, t1, t2, 1000000, r_[q0, w0], F, mu, mol_I)
    print "muzcomp = ", muzcomp

    tsr = qt[:,0]

    # field in rk time steps
    ft =  F * (tsr - t0) / (t1 - t0)
    ft[flatnonzero(tsr < t0)] = 0.0;
    ft[flatnonzero(tsr > t1)] = F

    t, T, V, U, Jx, Jy, Jz, fsp, fspav = stats(qt, ft, mu, mol_I)
    #t, T, V, U = stats(qt, ft, mu, mol_I)
    
    pl.figure(figsize=(4,4))
    
    pl.subplot(311)
    pl.plot(t/1e3, T, 'g-', lw=0.5, alpha=0.5, label="T")
    pl.plot(t/1e3, V, 'b-', lw=0.5, alpha=0.5, label="V")
    pl.plot(t/1e3, U, 'r-', lw=1.5, label="U")
    
    pl.subplot(312)
    #pl.plot(t/1e3, sqrt(nm.sum(qt[:,1:5]**2.0,axis=1)), 'r-', label="|q|")
    pl.plot(t/1e3, ft, 'r-', label="|q|")
Esempio n. 9
0
def initialise_community(nrows, ncols, b_density, M, c, x, fix_abundance=False, species_richness=1, max_diversity=False):
	"""
	Set up the simulated community as a 3D integer array.

	The array has shape (nrows, ncols, depth). Each row is an altitudinal
	band, each [row, column] position is a cell, and the 3rd dimension holds
	the individuals of that cell: a positive integer is an individual's
	species identity, 0 marks an unused slot.

	The mountain is modelled as a cone's surface but stored as a square
	array, so going up the mountain each cell represents an increasingly
	narrow area and therefore holds fewer individuals.

	Parameters:
	- nrows, ncols: number of altitudinal bands and of positions per band.
	- b_density, M: individuals per m^2 are computed as b_density * M**-0.75
	  (M is presumably body mass - confirm with the caller).
	- c, x: constants of the cone-surface area formula used for cell areas
	  (their physical meaning is not visible here).
	- fix_abundance: if true, every cell gets the abundance of the middle
	  altitudinal band, index (nrows - 1) // 2.
	- species_richness: number of species to seed (ignored when
	  max_diversity is true). Each species gets at least one individual.
	- max_diversity: if true, every individual is a distinct species.

	Returns (community, cell_areas, cell_abundances); the last two are 1D
	arrays with one entry per altitudinal band.

	Exits the program via sys.exit if species_richness exceeds the number
	of individuals.
	"""
	# Number of individuals in 1 m^2.
	abundance_per_m2 = b_density * M**-0.75

	# For each altitudinal band, the area of one cell and the number of
	# individuals in it. Abundance becomes an array length, so it is rounded
	# to the nearest whole number.
	S_r = sc.arange(nrows)
	T_r, T_theta = nrows, ncols
	cell_areas = (sc.pi * c * ((((S_r + 1) * x) / T_r)**2 - ((S_r * x) / T_r)**2)) / T_theta
	cell_abundances = sc.around(abundance_per_m2 * cell_areas, 0).astype(sc.int64, copy=False)

	if fix_abundance:
		# Fixed abundances: use that of the middle altitudinal band. The
		# index is derived from nrows so that any number of bands works.
		middle_band = (nrows - 1) // 2
		community = sc.ones((nrows, ncols, cell_abundances[middle_band]), dtype=sc.int64)
	else:
		# Start from a 2D array of zeros, one row per band, as wide as the
		# largest cell abundance, and mark the first `cell_abundances[i]`
		# slots of row i as occupied.
		max_cell_abundance = sc.amax(cell_abundances)
		community = sc.zeros((nrows, max_cell_abundance), dtype=sc.int64)
		for i in range(nrows):
			community[i, :cell_abundances[i]] = 1
		# Replicate each band once per position along it, then restore the
		# (band, position, individual) layout.
		community = sc.repeat(community, ncols, axis=0).reshape(nrows, ncols, max_cell_abundance)

	# Flat indices of the occupied slots. Every value written below is >= 1,
	# so this set of positions never changes and is computed only once.
	# (The total differs slightly from the requested density, as cell
	# abundances are rounded.)
	occupied = sc.flatnonzero(community)
	community_size = occupied.size

	if species_richness > community_size:
		sys.exit("The number of species (`species_richness`) cannot exceed number of individuals (the system's size, which is `nrows` * `ncols` * `density`).")

	if max_diversity:
		# One species per individual; '+ 1' as 0 is not a species identity.
		community.flat[occupied] = sc.arange(community_size) + 1
	elif species_richness > 1:
		species = sc.arange(species_richness) + 1
		# There must be at least one individual per species.
		community.flat[occupied[:species_richness]] = species
		# The remaining individuals can take any species identity, i.e.
		# each species has random abundance.
		community.flat[occupied[species_richness:]] = sc.random.choice(species, size=community_size - species_richness, replace=True)

	# Shuffle identities among the occupied slots only (zeros stay in place),
	# so an individual is equally likely to take any species identity.
	# Without this the first `species_richness` individuals would always be
	# different species.
	community.flat[occupied] = sc.random.permutation(community.flat[occupied])

	return community, cell_areas, cell_abundances