def setup(self, conductance, quantity, super_pore_conductance): r""" This setup provides the initial data for the solver from the provided properties. It also creates the matrices A and b. """ # Assigning super_pore conductance for Neumann_group BC if super_pore_conductance is None: self.super_pore_conductance = [] else: self.super_pore_conductance = super_pore_conductance # Providing conductance values for the algorithm from the Physics name if sp.size(self._phase) == 1: self._conductance = 'throat.' + conductance.split('.')[-1] self._quantity = 'pore.' + quantity.split('.')[-1] # Check health of conductance vector if self._phase.check_data_health(props=self._conductance).health: self['throat.conductance'] = self._phase[self._conductance] else: raise Exception('The provided throat conductance has problems') else: raise Exception('The linear solver accepts just one phase.') # Checking for the linear terms to be added to the coeff diagonal/RHS diag_added_data = sp.zeros(self.Np) RHS_added_data = sp.zeros(self.Np) for label in self.labels(): if 'pore.source_' in label: source_name = 'pore.' + \ (label.split('.')[-1]).replace('source_', '') matching_physics = [phys for phys in self._phase._physics if source_name in phys.models.keys()] for phys in matching_physics: x = phys.models[source_name]['x'] if x != '' and type(x) == str: if x.split('.')[-1] != quantity.split('.')[-1]: raise Exception('The quantity(pore.' + x.split('.')[-1] + '), provided by source term(' + source_name + '), is different ' + 'from the main quantity(pore.' + quantity.split('.')[-1] + ') in ' + self.name + ' algorithm.') source_name = label.replace('pore.source_', '') if 'pore.source_linear_s1_' + source_name in self.props(): prop1 = 'pore.source_linear_s1_' + source_name pores = ~sp.isnan(self[prop1]) diag_added_data[pores] = diag_added_data[pores] + \ self[prop1][pores] prop2 = 'pore.source_linear_s2_' + source_name pores = ~sp.isnan(self[prop2]) RHS_added_data[pores] = RHS_added_data[pores] + \ self[prop2][pores] # Creating A and b based on the conductance values and new linear terms logger.info('Creating Coefficient matrix for the algorithm') d = diag_added_data self.A = self._build_coefficient_matrix(modified_diag_pores=self.Ps, diag_added_data=d) logger.info('Creating RHS matrix for the algorithm') self.b = self._build_RHS_matrix(modified_RHS_pores=self.Ps, RHS_added_data=-RHS_added_data)
def create_models(self): import scipy,cPickle from stellarpop import tools from stellarpop.ndinterp import ndInterp index = {} shape = [] axes = {} axes_index = 0 for key in self.axes_names: index[key] = {} shape.append(self.axes[key]['points'].size) axes[axes_index] = self.axes[key]['eval'] axes_index += 1 for i in range(self.axes[key]['points'].size): index[key][self.axes[key]['points'][i]] = i models = {} model = scipy.empty(shape)*scipy.nan for f in self.filter_names: models[f] = {} for z in self.redshifts: models[f][z] = model.copy() for file in self.files: f = open(file,'rb') data = cPickle.load(f) wave = cPickle.load(f) f.close() for key in data.keys(): obj = data[key] jj = key spec = obj['sed'] ind = [] for key in self.axes_names: try: ind.append([index[key][obj[key]]]) except: print key,index[key] print obj df for f in self.filter_names: for i in range(len(self.redshifts)): z = self.redshifts[i] # correction is the units correction factor correction = self.corrections[i] sed = [wave,spec*correction] mag = tools.ABFilterMagnitude(self.filters[f],sed,z) if scipy.isnan(mag)==True: df models[f][z][ind] = mag for f in self.filter_names: for z in self.redshifts: model = models[f][z].copy() if scipy.isnan(model).any(): models[f][z] = None else: models[f][z] = ndInterp(axes,model) return models
def zeroMeanUnitVarianz(data=None, x=True):
    if x:
        return (data - data.mean(axis=0)) / data.std(axis=0)
    else:
        mean = data[~sp.isnan(data)].mean(axis=0)
        std = data[~sp.isnan(data)].std(axis=0)
        return (data - mean) / std
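# Usage sketch for zeroMeanUnitVarianz (an addition, not part of the original
# snippet): assumes the module's usual `import scipy as sp` alias, with an older
# SciPy that still re-exports NumPy's isnan. In the x=False branch the mean/std
# are scalars computed from the non-NaN entries only.
import numpy as np

data = np.array([[1.0, 2.0], [3.0, np.nan], [5.0, 6.0], [7.0, 8.0]])

# Column-wise standardization on the NaN-free rows (x=True branch).
z = zeroMeanUnitVarianz(data[~np.isnan(data).any(axis=1)], x=True)

# NaN-aware branch (x=False): statistics come from the non-NaN entries only.
z_nan = zeroMeanUnitVarianz(data, x=False)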
def _do_one_outer_iteration(self, **kwargs): r""" One iteration of an outer iteration loop for an algorithm (e.g. time or parametric study) """ # Checking for the necessary values in Picard algorithm nan_tol = sp.isnan(self['pore.source_tol']) nan_max = sp.isnan(self['pore.source_maxiter']) self._tol_for_all = sp.amin(self['pore.source_tol'][~nan_tol]) self._maxiter_for_all = sp.amax(self['pore.source_maxiter'][~nan_max]) if self._guess is None: self._guess = sp.zeros(self._coeff_dimension) t = 1 step = 0 # The main Picard loop while t > self._tol_for_all and step <= self._maxiter_for_all: X, t, A, b = self._do_inner_iteration_stage(guess=self._guess, **kwargs) logger.info('tol for Picard source_algorithm in step ' + str(step) + ' : ' + str(t)) self._guess = X step += 1 # Check for divergence self._steps = step if t >= self._tol_for_all and step > self._maxiter_for_all: raise Exception('Iterative algorithm for the source term reached ' 'to the maxiter: ' + str(self._maxiter_for_all) + ' without achieving tol: ' + str(self._tol_for_all)) logger.info('Picard algorithm for source term converged!') self.A = A self.b = b self._tol_reached = t return X
def LDA_batch_normalization(dataset, sample_table, batch_col, output_folder, ncomps): # this is actually the batch normalization method tmp_output_folder = os.path.join(output_folder, 'tmp') if not os.path.isdir(tmp_output_folder): os.makedirs(tmp_output_folder) barcodes, filtered_conditions, filtered_matrix, conditions, matrix = dataset # Remove any remaining NaNs and Infs from the filtered matrix - they would screw # up the LDA. filtered_matrix[scipy.isnan(filtered_matrix)] = 0 filtered_matrix[scipy.isinf(filtered_matrix)] = 0 # For full matrix, also eliminate NaNs and Infs, BUT preserve the indices and values # so they can be added back into the matrix later (not implemented yet, and may never # be - there should no longer be NaNs and Infs in the dataset) # The NaNs and Infs will mess up the final step of the MATLAB LDA script, which uses # matrix multiplication to remove the specified number of components! matrix_nan_inds = scipy.isnan(matrix) matrix_nan_vals = matrix[matrix_nan_inds] matrix_inf_inds = scipy.isinf(matrix) matrix_inf_vals = matrix[matrix_inf_inds] matrix[matrix_nan_inds] = 0 matrix[matrix_inf_inds] = 0 # Save both the small matrix (for determining the components to remove) and the # full matrix for the matlab script filtered_matrix_tmp_filename = os.path.join(tmp_output_folder, 'nonreplicating_matrix.txt') full_matrix_tmp_filename = os.path.join(tmp_output_folder, 'full_matrix.txt') np.savetxt(filtered_matrix_tmp_filename, filtered_matrix) np.savetxt(full_matrix_tmp_filename, matrix) # Map the batch to integers for matlab, and write out to a file so matlab can read # Note that yes, the batch_classes should match up with the filtered matrix, not # the full matrix batch_classes = get_batch_classes(dataset = [barcodes, filtered_conditions, filtered_matrix], sample_table = sample_table, batch_col = batch_col) class_tmp_filename = os.path.join(tmp_output_folder, 'classes.txt') writeList(batch_classes, class_tmp_filename) output_tmp_filename = os.path.join(tmp_output_folder, 'full_matrix_lda_normalized.txt') runLDAMatlabFunc(filtered_matrix_filename = filtered_matrix_tmp_filename, \ matrix_filename = full_matrix_tmp_filename, \ class_filename = class_tmp_filename, \ ncomps = ncomps, \ output_filename = output_tmp_filename) # The X norm that is returned is the full matrix. In the future, we could add in # returning the components to remove so they can be visualized or applied to other # one-off datasets Xnorm = scipy.genfromtxt(output_tmp_filename) ## Dump the dataset out! #output_filename = os.path.join(mtag_effect_folder, 'scaleddeviation_full_mtag_lda_{}.dump.gz'.format(ncomps)) #of = gzip.open(output_filename, 'wb') #cPickle.dump([barcodes, conditions, Xnorm], of) #of.close() return [barcodes, conditions, Xnorm]
def ProcessData(data):
    data = data[::-1]
    n = 100
    growthOfThisData = 0
    fitPrice = FitPrice(data)
    if fitPrice == 0:
        return
    print("FitResult : " + str(fitPrice))
    for i in range(0, len(data) - n):
        if not (sp.isnan(data[i][1]) or sp.isnan(data[i][4]) or sp.isnan(data[i][5])):
            if data[i][5] > 0:
                maxPrice = MaxPriceInNextNDays(data, i, n, fitPrice)
                minPrice = MinPriceInNextNDays(data, i, n, fitPrice)
                currentPrice = data[i][4] / fitPrice
                key = (currentPrice // 0.05) * 0.05
                # dict.has_key() was removed in Python 3; use the `in` operator
                if key in maxPriceResult:
                    maxPriceResult[key] += maxPrice
                    numOfDataMax[key] += 1
                else:
                    maxPriceResult[key] = maxPrice
                    numOfDataMax[key] = 1
                if key in minPriceResult:
                    minPriceResult[key] += minPrice
                    numOfDataMin[key] += 1
                else:
                    minPriceResult[key] = minPrice
                    numOfDataMin[key] = 1
def _prepareICContents(self):
    allfilestr = ""
    topstr = "function ics_ = " + self.name + "_ics()\n"
    commentstr = "% Initial conditions for model " + self.name + \
        "\n% Generated by PyDSTool for ADMC++ target\n\n"
    bodystr = "ics_ = [ ...\n"
    if self.initialconditions:
        icnames = self.initialconditions.keys()
        icnames.sort()
        for i in range(len(icnames) - 1):
            if isnan(self.initialconditions[icnames[i]]):
                val = str(0.0)
            else:
                val = str(self.initialconditions[icnames[i]])
            bodystr += val + ", ... % " + icnames[i] + "\n"
        if isnan(self.initialconditions[icnames[len(icnames) - 1]]):
            val = str(0.0)
        else:
            # str() was missing here, which made the concatenation below fail
            val = str(self.initialconditions[icnames[len(icnames) - 1]])
        bodystr += val + " % " + icnames[len(icnames) - 1] + " ...\n"
    bodystr += "];\n"
    allfilestr = topstr + commentstr + bodystr
    return allfilestr
def main():
    data = sp.genfromtxt('./data/web_traffic.tsv', delimiter='\t')
    x = data[:, 0]
    y = data[:, 1]
    x = x[~sp.isnan(y)]
    y = y[~sp.isnan(y)]

    fp1 = sp.polyfit(x, y, 1)
    print('Model parameters for fp1 %s' % fp1)
    f1 = sp.poly1d(fp1)
    print('This is the error rate for fp1 %f' % error(f1, x, y))

    fp2 = sp.polyfit(x, y, 2)
    print('Model parameters for fp2 %s' % fp2)
    f2 = sp.poly1d(fp2)
    print('This is the error rate for fp2 %f' % error(f2, x, y))

    plt.scatter(x, y, color='pink')
    plt.title('My first impression')
    plt.xlabel('Time')
    plt.ylabel('#Hits')
    plt.xticks([w * 7 * 24 for w in range(10)],
               ['week %i' % w for w in range(10)])

    fx = sp.linspace(0, x[-1], 1000)
    plt.plot(fx, f1(fx), linewidth=3, color='cyan')
    plt.plot(fx, f2(fx), linewidth=3, linestyle='--', color='red')
    plt.legend(['d = %i' % f1.order, 'd = %i' % f2.order], loc='upper left')
    plt.autoscale(tight=True)
    plt.grid()
    plt.show()
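# The snippet above (and some loaders below) call an error() helper that is not
# shown here; a minimal sketch, assuming the usual sum-of-squared-residuals
# definition used alongside sp.polyfit models -- the original helper may differ.
import scipy as sp

def error(f, x, y):
    # Sum of squared residuals between the model f evaluated at x and the data y.
    return sp.sum((f(x) - y) ** 2)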
def init_and_cleanup_data(path, delimiter):
    data = sp.genfromtxt(path, delimiter=delimiter)
    hours = data[:, 0]  # contains the hours
    webhits = data[:, 1]  # contains the number of web hits at a particular hour
    hours = hours[~sp.isnan(webhits)]
    webhits = webhits[~sp.isnan(webhits)]
    return (hours, webhits)
def simulate(self, X):
    """
    @arguments
      X -- 2d array of [sample_i][var_i] : float
    @return
      y -- 1d array of [sample_i] : float
    """
    op = self.nonlin_op
    ok = True
    y_lin = self.simple_base.simulate(X)
    if op == OP_ABS:
        ya = numpy.abs(y_lin)
    elif op == OP_MAX0:
        ya = numpy.clip(y_lin, 0.0, INF)
    elif op == OP_MIN0:
        ya = numpy.clip(y_lin, -INF, 0.0)
    elif op == OP_LOG10:
        # safeguard against: log() on values <= 0.0
        mn, mx = min(y_lin), max(y_lin)
        if mn <= 0.0 or scipy.isnan(mn) or mx == INF or scipy.isnan(mx):
            ok = False
        else:
            ya = numpy.log10(y_lin)
    elif op == OP_GTH:
        ya = numpy.clip(self.thr - y_lin, 0.0, INF)
    elif op == OP_LTH:
        ya = numpy.clip(y_lin - self.thr, 0.0, INF)
    else:
        # string exceptions are invalid; raise a proper exception instead
        raise ValueError('Unknown op %d' % op)

    if ok:
        # could always do ** exp, but faster ways if exp is 0,1
        y = ya
    else:
        y = INF * numpy.ones(X.shape[0], dtype=float)

    return y
def getFluxes(val_mat, direction_mat, dist_mat, duxdy_mat, out_flux, inc): import scipy; import math; speed_factor = 1; angle_factor = 1; inc_factor = 1; dist_factor = 1; strain_factor = 1; duxdy_mat = duxdy_mat / (sum(duxdy_mat[~scipy.isnan(duxdy_mat)])); cell_angles = scipy.flipud(scipy.array([[-1 * math.pi / 4, -1 * math.pi / 2, -3 * math.pi / 4], [0, scipy.nan, math.pi], [math.pi / 4, math.pi / 2, 3 * math.pi / 4]])); # cell_angles = scipy.flipud(scipy.array([[3 * math.pi / 4, 1 * math.pi / 2, 1 * math.pi / 4], [math.pi, scipy.nan, 0], [-3 * math.pi / 4, -1 * math.pi / 2, -1 * math.pi / 4]])); cell_incs = scipy.array([[(inc**2 + inc**2)**0.5, inc, (inc**2 + inc**2)**0.5], [inc, scipy.nan, inc], [(inc**2 + inc**2)**0.5, inc, (inc**2 + inc**2)**0.5]]); cell_incs = (1 / cell_incs**inc_factor); cell_incs = cell_incs / sum(cell_incs[~scipy.isnan(cell_incs)]); vels_in = scipy.cos(cell_angles - direction_mat); vels_in[1,1] = scipy.nan; vels_in[vels_in < 0.00001] = scipy.nan; vels_in = vels_in**angle_factor * val_mat**speed_factor * dist_mat**dist_factor * (1 / duxdy_mat**strain_factor) * cell_incs; in_fluxes = (vels_in / sum(vels_in[~scipy.isnan(vels_in)]) * out_flux); return in_fluxes;
def __call__(self, x1, x2, d1=[sp.NaN], d2=[sp.NaN], gets=False):
    D1 = 0 if sp.isnan(d1[0]) else int(sum([8**x for x in d1]))
    D2 = 0 if sp.isnan(d2[0]) else int(sum([8**x for x in d2]))
    self.smodel = sp.empty(1)
    r = libGP.k(x1.ctypes.data_as(ctpd), x2.ctypes.data_as(ctpd),
                cint(D1), cint(D2), cint(self.dim),
                self.ihyp.ctypes.data_as(ctpd), cint(self.Kindex),
                self.smodel.ctypes.data_as(ctpd))
    if gets:
        return [r, self.smodel[0]]
    return r
def load_data():
    datas = sp.genfromtxt("web_traffic.tsv", delimiter='\t')
    print datas[:10]
    x = datas[:, 0]
    y = datas[:, 1]
    x = x[~sp.isnan(y)]
    y = y[~sp.isnan(y)]
    return x, y
def preProcess(self, periodF0=0.06,
               deltaF_div_F0=True,
               max_threshold=None,
               min_threshold=None,
               nan_to_zeros=True,
               detrend=False,
               #~ band_filter = None,
               gaussian_filter=None,
               f1=None,
               f2=None,
               **kargs):
    images = self.images

    if deltaF_div_F0:
        ind = self.t() <= self.t_start + periodF0
        m0 = mean(images[ind, :, :], axis=0)
        images = (images - m0) / m0 * 1000.

    if max_threshold is not None:
        #~ images[images>max_threshold] = max_threshold
        images[images > max_threshold] = nan

    if min_threshold is not None:
        #~ images[images<min_threshold] = min_threshold
        images[images < min_threshold] = nan

    if nan_to_zeros:
        images[isnan(images)] = 0.

    if detrend and not nan_to_zeros:
        m = any(isnan(images), axis=0)
        images[isnan(images)] = 0.
        images = signal.detrend(images, axis=0)
        images[:, m] = nan
    elif detrend and nan_to_zeros:
        images = signal.detrend(images, axis=0)

    if gaussian_filter is not None:
        images = ndimage.gaussian_filter(images,
                                         (0, gaussian_filter, gaussian_filter))

    if f1 is not None or f2 is not None:
        from ..computing.filter import fft_passband_filter
        if f1 is None:
            f1 = 0.
        if f2 is None:
            f2 = inf  # was "f1 = inf", which left the high cutoff undefined
        nq = self.sampling_rate / 2.
        images = fft_passband_filter(images, f_low=f1 / nq, f_high=f2 / nq, axis=0)

    return images
def get_data():
    data = sp.genfromtxt("input/web_traffic.tsv", delimiter="\t")
    x = data[:, 0]
    y = data[:, 1]
    x = x[~sp.isnan(y)]
    y = y[~sp.isnan(y)]
    return (x, y)
def test_returns_nan_if_one_spike_train_is_empty(self):
    empty = create_empty_spike_train()
    non_empty = neo.SpikeTrain(sp.array([1.0]) * pq.s, t_stop=2.0 * pq.s)
    k = sigproc.GaussianKernel()
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        actual = stm.schreiber_similarity((empty, non_empty), k)
    self.assertTrue(sp.isnan(actual[0, 0]))
    self.assertTrue(sp.isnan(actual[0, 1]))
    self.assertTrue(sp.isnan(actual[1, 0]))
def LDA_batch_normalization(dataset, sample_table, batch_col, output_folder, n_comps): # this is actually the batch normalization method tmp_output_folder = os.path.join(output_folder, 'tmp') if not os.path.isdir(tmp_output_folder): os.makedirs(tmp_output_folder) barcodes, filtered_conditions, filtered_matrix, conditions, matrix = dataset # Remove any remaining NaNs and Infs from the filtered matrix - they would screw # up the LDA. filtered_matrix[scipy.isnan(filtered_matrix)] = 0 filtered_matrix[scipy.isinf(filtered_matrix)] = 0 # For full matrix, also eliminate NaNs and Infs, BUT preserve the indices and values # so they can be added back into the matrix later (not implemented yet, and may never # be - there should no longer be NaNs and Infs in the dataset) # The NaNs and Infs will mess up the final step of the MATLAB LDA script, which uses # matrix multiplication to remove the specified number of components! matrix_nan_inds = scipy.isnan(matrix) matrix_nan_vals = matrix[matrix_nan_inds] matrix_inf_inds = scipy.isinf(matrix) matrix_inf_vals = matrix[matrix_inf_inds] matrix[matrix_nan_inds] = 0 matrix[matrix_inf_inds] = 0 # Save both the small matrix (for determining the components to remove) and the # full matrix for the matlab script filtered_matrix_tmp_filename = os.path.join(tmp_output_folder, 'nonreplicating_matrix.txt') full_matrix_tmp_filename = os.path.join(tmp_output_folder, 'full_matrix.txt') np.savetxt(filtered_matrix_tmp_filename, filtered_matrix) np.savetxt(full_matrix_tmp_filename, matrix) # Map batch classes to integers batch_classes = get_batch_classes(dataset = [barcodes, filtered_conditions, filtered_matrix], sample_table = sample_table, batch_col = batch_col) # Checks number of classes and limits ncomps a = [x > 0 for x in np.sum(np.absolute(filtered_matrix), axis=0)] classes = np.asarray([batch_classes[i] for i in range(len(batch_classes)) if a[i]]) n_samples = filtered_matrix.shape[0] n_classes = len(np.unique(classes)) if n_samples == n_classes: print "ERROR: The number of samples is equal to the number of classes. Exiting" if n_classes <= n_comps: print "Fewer classes, " + str(n_classes) + ", than components. Setting components to " + str(n_classes-1) n_comps = n_classes-1 # Runs LDA #Xnorm = scikit_lda(filtered_matrix, matrix, batch_classes, n_comps) Xnorm = outer_python_lda(filtered_matrix, matrix, batch_classes, n_comps) return [barcodes, conditions, Xnorm, n_comps]
def load_samples(fname):
    """ Load training sample dataset """
    data = sp.genfromtxt(fname, delimiter='\t')
    x = data[:, 0]
    y = data[:, 1]
    print('%i entries in total, %i of them invalid.'
          % (sp.shape(data)[0], sp.sum(sp.isnan(y))))
    x = x[~sp.isnan(y)]
    y = y[~sp.isnan(y)]
    return (x, y)
def get_cleaned_data():
    data = sp.genfromtxt(os.path.join(DATA_DIR, 'web_traffic.tsv'), delimiter='\t')
    x = data[:, 0]
    y = data[:, 1]
    print "Number of invalid entries: {}".format(sp.sum(sp.isnan(y)))
    print "Removing invalid entries."
    x = x[~sp.isnan(y)]
    y = y[~sp.isnan(y)]
    print "Number of invalid entries: {}".format(sp.sum(sp.isnan(y)))
    return x, y
def get_relative_prices(walking_time, smoothed_prices):
    x = walking_time.flatten()
    y = smoothed_prices.flatten()
    mask = sp.isnan(x) | sp.isnan(y)
    spline = sp.interpolate.UnivariateSpline(x[~mask], y[~mask], s=len(x))
    v = spline(x)
    rel = (y - v).reshape(walking_time.shape)
    return rel
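# Usage sketch for get_relative_prices with made-up grids (the real inputs are
# presumably rasters of walking times and smoothed prices); cells that are NaN
# in either grid are excluded from the spline fit. Added for illustration only.
import numpy as np
import scipy as sp
import scipy.interpolate  # makes sp.interpolate available for the call above

walking_time = np.array([[5.0, 10.0, 15.0],
                         [20.0, 25.0, 30.0]])
smoothed_prices = np.array([[300.0, 280.0, 260.0],
                            [240.0, 230.0, 220.0]])

rel = get_relative_prices(walking_time, smoothed_prices)
print(rel.shape)  # relative prices on the same grid as the inputs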
def terminate(self, maxsteps):
    """ Termination criteria """
    if maxsteps is not None:
        if self._num_updates >= maxsteps:
            return True
    if self.loss_target is not None:
        l = self.provider.currentLosses(self.bestParameters)
        if mean(l) <= self.loss_target:
            return True
    # the original summed isnan(self.parameters) twice; one check is enough
    if sum(isnan(self.parameters)) > 0:
        print 'Diverged'
        return True
    return False
def setXY(self):
    global x, y, xa, xb, ya, yb
    x = data[:, 0]
    y = data[:, 1]
    x = x[~sp.isnan(y)]
    y = y[~sp.isnan(y)]
    # slice indices must be integers; 3.5 * 7 * 24 is a float
    inflection = int(3.5 * 7 * 24)
    xa = x[:inflection]
    ya = y[:inflection]
    xb = x[inflection:]
    yb = y[inflection:]
def test_ransohoff_snapoff_verts(self): ws = op.Workspace() ws.clear() bp = sp.array([[0.25, 0.25, 0.25], [0.25, 0.75, 0.25], [0.75, 0.25, 0.25], [0.75, 0.75, 0.25], [0.25, 0.25, 0.75], [0.25, 0.75, 0.75], [0.75, 0.25, 0.75], [0.75, 0.75, 0.75]]) scale = 1e-4 sp.random.seed(1) p = (sp.random.random([len(bp), 3])-0.5)/1000 bp += p fiber_rad = 2e-6 bp = op.topotools.reflect_base_points(bp, domain_size=[1, 1, 1]) prj = op.materials.VoronoiFibers(fiber_rad=fiber_rad, resolution=1e-6, shape=[scale, scale, scale], points=bp*scale, name='test') net = prj.network del_geom = prj.geometries()['test_del'] vor_geom = prj.geometries()['test_vor'] f = op.models.physics.capillary_pressure.ransohoff_snap_off water = op.phases.GenericPhase(network=net) water['pore.surface_tension'] = 0.072 water['pore.contact_angle'] = 45 phys1 = op.physics.GenericPhysics(network=net, geometry=del_geom, phase=water) phys1.add_model(propname='throat.snap_off', model=f, wavelength=fiber_rad) phys1.add_model(propname='throat.snap_off_pair', model=f, wavelength=fiber_rad, require_pair=True) phys2 = op.physics.GenericPhysics(network=net, geometry=vor_geom, phase=water) phys2.add_model(propname='throat.snap_off', model=f, wavelength=fiber_rad) phys2.add_model(propname='throat.snap_off_pair', model=f, wavelength=fiber_rad, require_pair=True) ts = ~net['throat.interconnect'] assert ~sp.any(sp.isnan(water['throat.snap_off'][ts])) assert sp.any(sp.isnan(water['throat.snap_off_pair'][ts])) assert sp.any(~sp.isnan(water['throat.snap_off_pair'][ts]))
def load_dataset2(dataset_name):
    try:
        data_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "data")
    except NameError:
        data_dir = "../data"
    data = sp.genfromtxt(os.path.join(data_dir, "{0}.tsv".format(dataset_name)),
                         delimiter="\t")  # schema: 'features label'
    print(data[:10])  # print the first 10 rows for a peek at the data
    # all examples will have three classes in this file
    x = data[:, 0]   # take the first column
    y = data[:, -1]  # take the last column
    print("Number of invalid entries:", sp.sum(sp.isnan(y)))  # NaN values in y
    # clean the data: remove rows with NaN values
    x = x[~sp.isnan(y)]
    y = y[~sp.isnan(y)]
    return x, y  # the original snippet ended without returning the cleaned data
def test_interleave_data_float(self):
    net = OpenPNM.Network.Cubic(shape=[2, 2, 2])
    Ps = net.pores('top')
    geom1 = OpenPNM.Geometry.GenericGeometry(network=net, pores=Ps)
    Ps = net.pores('bottom')
    geom2 = OpenPNM.Geometry.GenericGeometry(network=net, pores=Ps)
    geom1['pore.blah'] = 1.0
    # Ensure floats are returned on geom1
    assert 'float' in geom1['pore.blah'].dtype.name
    # Ensure nans are returned on geom2
    assert sp.all(sp.isnan(geom2['pore.blah']))
    # Ensure interleaved array is float with nans
    assert 'float' in net['pore.blah'].dtype.name
    # Ensure missing values are floats
    assert sp.sum(sp.isnan(net['pore.blah'])) == 4
def main(): data = sp.genfromtxt("web_traffic.tsv", delimiter="\t") plt.xkcd() x = data[:, 0] y = data[:, 1] x = x[~sp.isnan(y)] y = y[~sp.isnan(y)] fp1, _, _, _, _ = sp.polyfit(x, y, 1, full=True) # Here we try 3 degrees of freedom fp2, _, _, _, _ = sp.polyfit(x, y, 3, full=True) f1 = sp.poly1d(fp1) f2 = sp.poly1d(fp2) # We have an obvious inflection point between 3rd and 4th week inflection_in_hours = int(3.5 * 7 * 24) x_before_inflection = x[:inflection_in_hours] x_after_inflection = x[inflection_in_hours:] y_after_inflection = y[inflection_in_hours:] f_after = sp.poly1d(sp.polyfit(x_after_inflection, y_after_inflection, 1)) fx = sp.linspace(0, x[-1], 1000) fx_after = sp.linspace(len(x_before_inflection)+1, x[-1], 1000) plt.scatter(x, y, s=5) plt.title("Web traffic over the last month.") plt.xlabel("Time") plt.ylabel("Hits/hour") plt.xticks([w * 7 * 24 for w in range(10)], ['week {}'.format(w) for w in range(10)]) plt.autoscale(tight=True) plt.plot(fx, f1(fx), linewidth=2) plt.plot(fx, f2(fx), linewidth=2) plt.plot(fx_after, f_after(fx_after), linewidth=3) plt.legend(["d={}".format(f1.order), "d={}".format(f2.order), "d after inflection"], loc="upper left") # plt.grid(True, linestyle="-", color='0.75') plt.show()
def conduit_lengths(network, throats=None, mode='pore'): r""" Return the respective lengths of the conduit components defined by the throat conns P1 - T - P2 Notes ----- mode = 'pore' - uses pore coordinates mode = 'centroid' uses pore and throat centroids """ if throats is None: throats = network.throats() Ps = network['throat.conns'] pdia = network['pore.diameter'] if mode == 'centroid': try: pcentroids = network['pore.centroid'] tcentroids = network['throat.centroid'] if _sp.sum(_sp.isnan(pcentroids)) + _sp.sum(_sp.isnan(tcentroids)) > 0: mode = 'pore' else: plen1 = _sp.sqrt(_sp.sum(_sp.square(pcentroids[Ps[:, 0]] - tcentroids), 1))-network['throat.length']/2 plen2 = _sp.sqrt(_sp.sum(_sp.square(pcentroids[Ps[:, 1]] - tcentroids), 1))-network['throat.length']/2 except KeyError: mode = 'pore' if mode == 'pore': # Find half-lengths of each pore pcoords = network['pore.coords'] # Find the pore-to-pore distance, minus the throat length lengths = _sp.sqrt(_sp.sum(_sp.square(pcoords[Ps[:, 0]] - pcoords[Ps[:, 1]]), 1)) - network['throat.length'] lengths[lengths < 0.0] = 2e-9 # Calculate the fraction of that distance from the first pore try: fractions = pdia[Ps[:, 0]]/(pdia[Ps[:, 0]] + pdia[Ps[:, 1]]) # Don't allow zero lengths # fractions[fractions == 0.0] = 0.5 # fractions[fractions == 1.0] = 0.5 except: fractions = 0.5 plen1 = lengths*fractions plen2 = lengths*(1-fractions) return _sp.vstack((plen1, network['throat.length'], plen2)).T[throats]
def buildAndTestPCAModel(self, noise): modelbuilder = statismo.PCAModelBuilder_vtkPD.Create() model = modelbuilder.BuildNewModel(self.dataManager.GetSampleDataStructure(), noise) self.assertTrue(model.GetNumberOfPrincipalComponents() <= len(self.datafiles)) # we cannot have negative eigenvalues self.assertTrue((model.GetPCAVarianceVector() >= 0).all() == True) self.assertTrue(isnan(model.GetPCAVarianceVector()).any() == False) # we project a dataset into the model and try to restore it. samples = self.dataManager.GetSampleDataStructure() sample = samples[0].GetSample() coeffs_sample = model.ComputeCoefficientsForDataset(sample) restored_sample = model.DrawSample(coeffs_sample) self.assertEqual(sample.GetNumberOfPoints(), restored_sample.GetNumberOfPoints()) self.checkPointsAlmostEqual(sample.GetPoints(), restored_sample.GetPoints(), 100, noise) # check if the scores can be used to restore the data in the datamanager scores = model.GetModelInfo().GetScoresMatrix() for i in xrange(0, scores.shape[1]): sample_from_scores = model.DrawSample(scores[:,i]) sample_from_dm = samples[i].GetSample() self.checkPointsAlmostEqual(sample_from_scores.GetPoints(), sample_from_dm.GetPoints(), 100, noise) return model
def run(self,phase=None): r''' ''' logger.warning('This algorithm can take some time...') graph = self._net.create_adjacency_matrix(data=self._net['throat.length'],sprsfmt='csr') if phase is not None: self._phase = phase if 'throat.occupancy' in self._phase.props(): temp = self._net['throat.length']*(self._phase['throat.occupancy']==1) graph = self._net.create_adjacency_matrix(data=temp,sprsfmt='csr',prop='temp') #self._net.tic() path = spgr.shortest_path(csgraph = graph, method='D', directed = False) #self._net.toc() Px = sp.array(self._net['pore.coords'][:,0],ndmin=2) Py = sp.array(self._net['pore.coords'][:,1],ndmin=2) Pz = sp.array(self._net['pore.coords'][:,2],ndmin=2) Cx = sp.square(Px.T - Px) Cy = sp.square(Py.T - Py) Cz = sp.square(Pz.T - Pz) Ds = sp.sqrt(Cx + Cy + Cz) temp = path/Ds #temp = path temp[sp.isnan(temp)] = 0 temp[sp.isinf(temp)] = 0 return temp
def nmse(yhat, y, min_y, max_y): """ @description Calculates the normalized mean-squared error. @arguments yhat -- 1d array or list of floats -- estimated values of y y -- 1d array or list of floats -- true values min_y, max_y -- float, float -- roughly the min and max; they do not have to be the perfect values of min and max, because they're just here to scale the output into a roughly [0,1] range @return nmse -- float -- normalized mean-squared error """ #base case: no entries if len(yhat) == 0: return 0.0 #base case: both yhat and y are constant, and same values if (max_y == min_y) and (max(yhat) == min(yhat) == max(y) == min(y)): return 0.0 #main case assert max_y > min_y, 'max_y=%g was not > min_y=%g' % (max_y, min_y) yhat_a, y_a = numpy.asarray(yhat), numpy.asarray(y) y_range = float(max_y - min_y) try: result = math.sqrt(numpy.mean(((yhat_a - y_a) / y_range) ** 2)) if scipy.isnan(result): return INF return result except: return INF
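# Worked example for nmse() above (illustrative numbers, added for clarity):
# with y = [0, 10], yhat = [1, 9] and a y-range of 10, each scaled residual is
# 0.1, so nmse = sqrt(mean([0.01, 0.01])) = 0.1.
yhat = [1.0, 9.0]
y = [0.0, 10.0]
print(nmse(yhat, y, min_y=0.0, max_y=10.0))  # -> 0.1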
def ycorrect(data): """ ycorrect(data) Inputs: data - a flatfield image of the mask Outputs: true_coeffs - A polynomial describing the transformation: y_straight = f(x_ccd,y_ccd) map_coeffs - A polynomial describing the transformation: y_ccd = f(x_cdd,y_straight) """ # Parameters SUMWIDTH = 41 # Width of summing over columns y_axis = data.shape[0] x_axis = data.shape[1] central = x_axis/2 x_min_orig = central - SUMWIDTH/2 x_max_orig = central + SUMWIDTH/2 # Find the 'holes' in the center of the mask to use as the reference # position. midcol = data[:,x_min_orig:x_max_orig].mean(axis=1) central_edges,threshold,star_cutoff = find_holes(midcol) # transform_table would be easier to use as a list.... transform_table = scipy.zeros((1,3),'f4') index = 0 for peak in central_edges: if index: transform_table.resize((index+1,3)) transform_table[index,0] = central transform_table[index,1] = peak transform_table[index,2] = peak index += 1 offset = scipy.zeros(len(central_edges)) x_min = x_min_orig x_max = x_max_orig current_column = central while current_column>SUMWIDTH + 20: current_column = current_column - SUMWIDTH - 10 x_min = x_min - SUMWIDTH - 10 x_max = x_max - SUMWIDTH - 10 comp_array = data[:,x_min:x_max].mean(axis=1) comp_array.clip(min=-1000.,max=star_cutoff) derivative = deriv_1d(comp_array) derivative = ndimage.gaussian_filter1d(derivative,3) derivative = abs(derivative) for i in range(offset.size): if scipy.isnan(offset[i]): continue ref = central_edges[i] + offset[i] start = int(ref) - 6 end = start + 13 if derivative[start:end].max()<threshold: offset[i] = scipy.nan continue fit = find_peak(derivative[start:end]) # If the fit has crazy parameters, skip it if(fit[2]<0 or fit[2]>13 or fit[3]<1 or fit[3]>6): offset[i] = scipy.nan continue peak = fit[2]+float(start) offset[i] = peak - central_edges[i] transform_table.resize((index+1,3)) transform_table[index,0] = current_column transform_table[index,1] = central_edges[i] transform_table[index,2] = peak index += 1 offset = scipy.zeros(offset.size) x_min = x_min_orig x_max = x_max_orig current_column = central while current_column<x_axis - SUMWIDTH - 19: current_column = current_column + SUMWIDTH + 10 x_min = x_min + SUMWIDTH + 10 x_max = x_max + SUMWIDTH + 10 comp_array = data[:,x_min:x_max].mean(axis=1) comp_array.clip(min=-1000.,max=star_cutoff) derivative = deriv_1d(comp_array) derivative = ndimage.gaussian_filter1d(derivative,3) derivative = abs(derivative) for i in range(offset.size): if scipy.isnan(offset[i]): continue ref = central_edges[i] + offset[i] start = int(round(ref)) - 6 end = start + 13 if derivative[start:end].max()<threshold: offset[i] = scipy.nan continue fit = find_peak(derivative[start:end]) if(fit[2]<0 or fit[2]>13 or fit[3]<1 or fit[3]>6): offset[i] = scipy.nan continue peak = fit[2]+float(start) offset[i] = peak - central_edges[i] transform_table.resize((index+1,3)) transform_table[index,0] = current_column transform_table[index,1] = central_edges[i] transform_table[index,2] = peak index += 1 true_coeffs = special_functions.lsqfit(transform_table,"chebyshev",4,4) temp = transform_table[:,1].copy() transform_table[:,1] = transform_table[:,2].copy() transform_table[:,2] = temp.copy() map_coeffs = special_functions.lsqfit(transform_table,"chebyshev",4,4) return true_coeffs,map_coeffs
def KLSampling(ratio, data): """ Computes KL(q||p) by using samples of q. Some very elementary checks are made to make sure the numbers returned make sense. The estimator used has the same properties (and issues) as when one is doing importance sampling, i.e., problems with estimating the ratio of partition functions. Arguments:: ratio: The ratio of the normalized (or un-normalized) pdfs or pmfs of the form p/q. data: Data to compute the estimator on. Returns:: est: estimate of the KL divergence. Examples:: The case of equal distributions. >>> from scipy import exp >>> rat = lambda x: exp(-(x)**2/2)/exp(-(x)**2/2) >>> data = scipy.randn(1000) # sample from normal with mean 1 >>> est = KLSampling(rat, data) >>> est<1e-7 True Different means, same deviations. >>> mu = 0.4 >>> rat = lambda x: exp(-(x-mu)**2/2)/exp(-(x-1.0)**2/2) >>> data = scipy.randn(100000)+1 >>> est = KLSampling(rat, data) >>> abs(est-KLNormal(mu,1.0,1,1.0))<1 True Testing if mapping works well for distributions of more than one parameter. >>> rat = lambda x: exp(-(x[0]+x[1])/2**2) >>> data = scipy.randn(100,2) >>> KLSampling(rat,data)<0.6 True """ dim = len(data.shape) est_a = 0.0 est_b = 0.0 if dim == 1: n = len(data) est_a = sum([log(1.0 / ratio(x)) for x in data]) est_b = sum([ratio(x) for x in data]) # for i in xrange(n): # val = ratio(data[i]) # est_a = est_a+log(1.0/val) # est_b = est_b+val if scipy.isnan(est_a): raise ValueError("est_a is nan") if scipy.isnan(est_b): raise ValueError("est_b is nan") est = est_a / n + log(est_b / n) # total estimate else: # nxm format assumed, where every row accounts for data # every column for variables n = scipy.size(data, 0) for i in xrange(n): val = ratio(data[i, :]) est_a = est_a + log(1.0 / val) est_b = est_b + val if scipy.isnan(est_a): raise ValueError("est_a is nan") if scipy.isnan(est_b): raise ValueError("est_b is nan") est = est_a / n + log(est_b / n) # total estimate if est < 0 or est_b < 0: raise ValueError("Insufficient data to converge.") return est
import sys
import scipy as sp

data = sp.genfromtxt(r"E:\python\data\ch01\data\web_traffic.tsv", delimiter="\t")
print(data[:10])

x = data[:, 0]
y = data[:, 1]
print(sp.sum(sp.isnan(y)))  # count the invalid entries before removing them
x = x[~sp.isnan(y)]
y = y[~sp.isnan(y)]

import matplotlib.pyplot as plt

plt.scatter(x, y)
plt.title("Web traffic over the last month")
plt.xlabel("Time")
plt.ylabel("Hits/hour")
plt.xticks([w * 7 * 24 for w in range(10)], ['week %i' % w for w in range(10)])
lst = os.listdir('/big_disk/ajoshi/HCP5') rho1 = 0 rho1rot = 0 rho2 = 0 rho2rot = 0 # lst = [lst[0]] diffbefore = 0 diffafter = 0 sub = lst[0] vrest1 = scipy.io.loadmat('/big_disk/ajoshi/coding_ground/epilepsy/\ NorthShoreLIJ/0019002/fmri_tnlm_5_reduce3_v2.mat') # h5py.File(fname1); data = vrest1['func_right'] indx = sp.isnan(data) data[indx] = 0 vrest = data m = np.mean(vrest, 1) vrest = vrest - m[:, None] s = np.std(vrest, 1) + 1e-116 vrest1 = vrest / s[:, None] rho1 = 0 rho1rot = 0 diffafter = 0 diffbefore = 0 lst = glob.glob('/big_disk/ajoshi/fcon_1000/Beijing/sub*') nsub = 0
def initZ(self, pmean, pvar, qmean, qvar, qE=None, qE2=None, covariates=None, scale_covariates=None): """Method to initialise the latent variables PARAMETERS ---------- pmean: pvar: qmean qvar qE qE2 covariates: nd array matrix of covariates with dimensions (nsamples,ncovariates) scale_covariates: """ # Initialise mean of the Q distribution if qmean is not None: if isinstance(qmean, str): if qmean == "random": # Random initialisation of latent variables qmean = stats.norm.rvs(loc=0, scale=1, size=(self.N, self.K)) elif qmean == "orthogonal": # Latent variables are initialised randomly but ensuring orthogonality pca = sklearn.decomposition.PCA(n_components=self.K, copy=True, whiten=True) pca.fit( stats.norm.rvs(loc=0, scale=1, size=(self.N, 9999)).T) qmean = pca.components_.T elif qmean == "pca": # Latent variables are initialised from PCA in the concatenated matrix pca = sklearn.decomposition.PCA(n_components=self.K, copy=True, whiten=True) pca.fit(s.concatenate(self.data, axis=0).T) qmean = pca.components_.T elif isinstance(qmean, s.ndarray): assert qmean.shape == (self.N, self.K) elif isinstance(qmean, (int, float)): qmean = s.ones((self.N, self.K)) * qmean else: print("Wrong initialisation for Z") exit() # Add covariates if covariates is not None: assert scale_covariates != None, "If you use covariates also define data_opts['scale_covariates']" # Select indices for covaraites idx_covariates = s.array(range(covariates.shape[1])) # Center and scale the covariates to match the prior distribution N(0,1) # to-do: this needs to be improved to take the particular mean and var into account # covariates[scale_covariates] = (covariates - covariates.mean(axis=0)) / covariates.std(axis=0) scale_covariates = s.array(scale_covariates) covariates[:, scale_covariates] = ( covariates[:, scale_covariates] - s.nanmean(covariates[:, scale_covariates], axis=0)) / s.nanstd( covariates[:, scale_covariates], axis=0) # Set to zero the missing values in the covariates covariates[s.isnan(covariates)] = 0. qmean[:, idx_covariates] = covariates else: idx_covariates = None # Initialise the node # self.Z = Constant_Node(dim=(self.N,self.K), value=qmean) self.Z = Z_Node(dim=(self.N, self.K), pmean=s.ones((self.N, self.K)) * pmean, pvar=s.ones((self.K, )) * pvar, qmean=s.ones((self.N, self.K)) * qmean, qvar=s.ones((self.N, self.K)) * qvar, qE=qE, qE2=qE2, idx_covariates=idx_covariates) self.nodes["Z"] = self.Z
def _intersections(x1, y1, x2, y2): """X0,Y0 = intersections(X1,Y1,X2,Y2) INTERSECTIONS Intersections of curves. Computes the (x,y) locations where two curves intersect. The curves can be broken with NaNs or have vertical segments. Example: [X0,Y0] = intersections(X1,Y1,X2,Y2); where X1 and Y1 are equal-length vectors of at least two points and represent curve 1. Similarly, X2 and Y2 represent curve 2. X0 and Y0 are column vectors containing the points at which the two curves intersect. The algorithm can return two additional vectors that indicate which segment pairs contain intersections and where they are: [X0,Y0,I,J] = intersections(X1,Y1,X2,Y2); For each element of the vector I, I(k) = (segment number of (X1,Y1)) + (how far along this segment the intersection is). For example, if I(k) = 45.25 then the intersection lies a quarter of the way between the line segment connecting (X1(45),Y1(45)) and (X1(46),Y1(46)). Similarly for the vector J and the segments in (X2,Y2). Version: 1.10, 25 February 2008 Converted to Python October 2010 by Jeffrey Bush [email protected] Author: Douglas M. Schwarz Email: dmschwarz=ieee*org, dmschwarz=urgrad*rochester*edu Real_email = regexprep(Email,{'=','*'},{'@','.'}) Theory of operation: Given two line segments, L1 and L2, L1 endpoints: (x1(1),y1(1)) and (x1(2),y1(2)) L2 endpoints: (x2(1),y2(1)) and (x2(2),y2(2)) we can write four equations with four unknowns and then solve them. The four unknowns are t1, t2, x0 and y0, where (x0,y0) is the intersection of L1 and L2, t1 is the distance from the starting point of L1 to the intersection relative to the length of L1 and t2 is the distance from the starting point of L2 to the intersection relative to the length of L2. So, the four equations are (x1(2) - x1(1))*t1 = x0 - x1(1) (x2(2) - x2(1))*t2 = x0 - x2(1) (y1(2) - y1(1))*t1 = y0 - y1(1) (y2(2) - y2(1))*t2 = y0 - y2(1) Rearranging and writing in matrix form, [x1(2)-x1(1) 0 -1 0; [t1; [-x1(1); 0 x2(2)-x2(1) -1 0; * t2; = -x2(1); y1(2)-y1(1) 0 0 -1; x0; -y1(1); 0 y2(2)-y2(1) 0 -1] y0] -y2(1)] Let's call that A*T = B. We can solve for T with T = A\B. Once we have our solution we just have to look at t1 and t2 to determine whether L1 and L2 intersect. If 0 <= t1 < 1 and 0 <= t2 < 1 then the two line segments cross and we can include (x0,y0) in the output. In principle, we have to perform this computation on every pair of line segments in the input data. This can be quite a large number of pairs so we will reduce it by doing a simple preliminary check to eliminate line segment pairs that could not possibly cross. The check is to look at the smallest enclosing rectangles (with sides parallel to the axes) for each line segment pair and see if they overlap. If they do then we have to compute t1 and t2 (via the A\B computation) to see if the line segments cross, but if they don't then the line segments cannot cross. In a typical application, this technique will eliminate most of the potential line segment pairs. """ # x1 and y1 must be vectors with same number of points (at least 2). if sp.sum(sp.size(x1) > 1) != 1 or sp.sum( sp.size(y1) > 1) != 1 or len(x1) != len(y1): raise ValueError( 'X1 and Y1 must be equal-length vectors of at least 2 points.') # x2 and y2 must be vectors with same number of points (at least 2). 
if sp.sum(sp.size(x2) > 1) != 1 or sp.sum( sp.size(y2) > 1) != 1 or len(x2) != len(y2): raise ValueError( 'X2 and Y2 must be equal-length vectors of at least 2 points.') # Compute number of line segments in each curve and some differences we'll # need later. n1 = len(x1) - 1 n2 = len(x2) - 1 xy1 = sp.column_stack((x1, y1)) xy2 = sp.column_stack((x2, y2)) dxy1 = sp.diff(xy1, axis=0) dxy2 = sp.diff(xy2, axis=0) # Determine the combinations of i and j where the rectangle enclosing the # i'th line segment of curve 1 overlaps with the rectangle enclosing the # j'th line segment of curve 2. i, j = sp.nonzero( sp.logical_and( sp.logical_and( sp.logical_and( sp.tile(sp.minimum(x1[0:-1], x1[1:]), (n2, 1)).T <= sp.tile(sp.maximum(x2[0:-1], x2[1:]), (n1, 1)), sp.tile(sp.maximum(x1[0:-1], x1[1:]), (n2, 1)).T >= sp.tile(sp.minimum(x2[0:-1], x2[1:]), (n1, 1))), sp.tile(sp.minimum(y1[0:-1], y1[1:]), (n2, 1)).T <= sp.tile(sp.maximum(y2[0:-1], y2[1:]), (n1, 1))), sp.tile(sp.maximum(y1[0:-1], y1[1:]), (n2, 1)).T >= sp.tile(sp.minimum(y2[0:-1], y2[1:]), (n1, 1)))) i = sp.copy(i) # make the arrays writable j = sp.copy(j) # Find segments pairs which have at least one vertex = NaN and remove them. # This line is a fast way of finding such segment pairs. We take # advantage of the fact that NaNs propagate through calculations, in # particular subtraction (in the calculation of dxy1 and dxy2, which we # need anyway) and addition. remove = sp.isnan(sp.sum(dxy1[i, :] + dxy2[j, :], axis=1)) i[remove] = [] j[remove] = [] # Initialize matrices. We'll put the T's and B's in matrices and use them # one column at a time. AA is a 3-D extension of A where we'll use one # plane at a time. n = len(i) T = sp.zeros((4, n)) AA = sp.zeros((4, 4, n)) AA[[0, 1], 2, :] = -1 AA[[2, 3], 3, :] = -1 AA[[0, 2], 0, :] = dxy1[i, :].T AA[[1, 3], 1, :] = dxy2[j, :].T B = -sp.array([x1[i], x2[j], y1[i], y2[j]]) # Loop through possibilities. Trap singularity warning and then use # lastwarn to see if that plane of AA is near singular. Process any such # segment pairs to determine if they are colinear (overlap) or merely # parallel. That test consists of checking to see if one of the endpoints # of the curve 2 segment lies on the curve 1 segment. This is done by # checking the cross product # # (x1(2),y1(2)) - (x1(1),y1(1)) x (x2(2),y2(2)) - (x1(1),y1(1)). # # If this is close to zero then the segments overlap. for k in sp.arange(n): L, U = lin.lu(AA[:, :, k], True) T[:, k] = lin.solve(U, lin.solve(L, B[:, k])) # Find where t1 and t2 are between 0 and 1 and return the corresponding # x0 and y0 values. in_range = sp.logical_and( sp.logical_and(sp.logical_and(T[0, :] >= 0, T[1, :] >= 0), T[0, :] < 1), T[1, :] < 1) x0 = T[2, in_range].T y0 = T[3, in_range].T return x0, y0
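# The "theory of operation" in the _intersections docstring can be checked on a
# single pair of segments. A self-contained sketch (not part of the original
# module) that solves the 4x4 system A*T = B for two crossing segments:
import numpy as np

# L1 runs (0,0)-(1,1); L2 runs (0,1)-(1,0); they cross at (0.5, 0.5).
x1, y1 = np.array([0.0, 1.0]), np.array([0.0, 1.0])
x2, y2 = np.array([0.0, 1.0]), np.array([1.0, 0.0])

# Unknowns T = [t1, t2, x0, y0], following the docstring's formulation.
A = np.array([[x1[1] - x1[0], 0.0,           -1.0,  0.0],
              [0.0,           x2[1] - x2[0], -1.0,  0.0],
              [y1[1] - y1[0], 0.0,            0.0, -1.0],
              [0.0,           y2[1] - y2[0],  0.0, -1.0]])
B = -np.array([x1[0], x2[0], y1[0], y2[0]])

t1, t2, x0, y0 = np.linalg.solve(A, B)
# 0 <= t1 < 1 and 0 <= t2 < 1, so the segments intersect at (x0, y0) = (0.5, 0.5).
print(t1, t2, x0, y0)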
data = sp.genfromtxt(
    os.path.expanduser(
        "~/Python_src/src/MachineLearningSystem/exdata/ch01/data/web_traffic.tsv"
    ))
print(data.shape)
print(data[:10])

x = data[:, 0]
y = data[:, 1]

# Handle the NaN values
x = x[~sp.isnan(y)]
y = y[~sp.isnan(y)]

plt.scatter(x, y)

# fp1 = intercept and coefficient, residuals = residual error; full=True also
# returns the residuals etc., and the 1 is the polynomial degree
fp1, residuals, rank, s, rcond = sp.polyfit(x, y, 1, full=True)
f1 = sp.poly1d(fp1)  # build the model function f1(x) from the model parameters
print(error(f1, x, y))
print("Model parameters: %s" % fp1)

# Fit a higher-order polynomial
f2p = sp.polyfit(x, y, 2)
print(f2p)

# Pyplot settings
def extract(data, varimg, width=WIDTH, nsig=NSIG, noise=NOISE): WIDTH = width NSIG = nsig NOISE = noise data = data.copy() spectra = [] # Replace nan with zero data[scipy.isnan(data)] = 0. varimg[scipy.isnan(varimg)] = 0. # Create model of real flux. We ignore the slit ends, which may have # artifacts from the resampling. slit = data[:, 8:-8].astype(scipy.float32) var = varimg[:, 8:-8] # OK...so negative-variance also isn't good; set these pixels to zero var[var < 0] = 0 # Create noise models sigmaimg = slit / scipy.sqrt(var) highpix = scipy.where(sigmaimg > 1.5, sigmaimg, 0.) source_columns = highpix.sum(axis=0) # MASKING DISABLED (this would take only columns with lotsa flux...) # mask = scipy.where(source_columns>4.,1.,scipy.nan) mask = source_columns * 0. # Condition 1, dealing with bad pixels if (var == 0).any(): cond = var == 0 var[cond] = scipy.nan slit[cond] = scipy.nan mask = scipy.where(cond, 0, 1) flux = scipy.nansum(slit / var, axis=1) / scipy.nansum(1. / var, axis=1) noise = scipy.sqrt(scipy.nansum(var, axis=1)) / mask.sum(axis=1) # Condition 2, no masking elif scipy.nansum(mask) == 0: flux = (slit / var).sum(axis=1) / (1. / var).sum(axis=1) noise = scipy.sqrt(var.sum(axis=1)) / mask.size # Condition 3, masking else: fluxmodel = slit * mask noisemodel = var * mask noise = scipy.sqrt(scipy.nansum(noisemodel, axis=1)) / scipy.nansum(mask) flux = stats.stats.nanmean(fluxmodel, axis=1) # A smooth S/N estimate for the slit # sig2noise = ndimage.gaussian_filter1d(flux,1)/noise row = scipy.arange(flux.size) model = flux.copy() nspec = 10 # Maximum number of attempts while nspec: nspec -= 1 # Fit a gaussian around the peak of the S/N model start = model.argmax() - WIDTH end = model.argmax() + WIDTH + 1 if start < 0: start = 0. if end > model.size: end = model.size fitarr = model[start:end] p = scipy.zeros(4) p[1] = fitarr.max() p[2] = fitarr.argmax() p[3] = 2. fit, val = special_functions.ngaussfit(fitarr, p) chi2 = val / (fitarr.size - 3) fit[2] += start # If the centroid doesn't lie on the slit, get use the edge pix midcol = fit[2].round() if midcol >= flux.size: midcol = flux.size - 1 elif midcol < 0: midcol = 0 # Require a reasonable S/N and width if fit[3] > fitarr.size / 2. or fit[3] < 0.85: break elif fit[0] > 0 and fit[1] < NOISE * noise[midcol]: break elif fit[0] < 0 and fit[1] - fit[0] < NOISE * noise[midcol]: break else: fit[1] += fit[0] fit[0] = 0. # Subtract away a model of the source source = special_functions.ngauss(row, fit) model -= scipy.where(source > noise, source, 0.) # Skip residuals! if fit[2] < flux.size and fit[1] < scipy.sqrt(flux[fit[2]]): continue fit[1] = 1. weight = special_functions.ngauss(row, fit) cond = (row > fit[2] - fit[3] * NSIG) & (row < fit[2] + fit[3] * NSIG) weight = scipy.where(cond, weight, 0) weight /= weight.sum() spec = weight * data.T spec = spec.sum(axis=1) varspec = weight * varimg.T varspec = varspec.sum(axis=1) spec[varspec == 0] = 0. smooth = signal.wiener(spec, FILTSIZE, varspec) smooth[scipy.isnan(smooth)] = 0. spectra.append([fit, spec, smooth, varspec]) return spectra
def bulk_diffusion(physics, phase, network, pore_molar_density='pore.molar_density', pore_diffusivity='pore.diffusivity', pore_area='pore.area', pore_diameter='pore.diameter', throat_area='throat.area', throat_length='throat.length', throat_diameter='throat.diameter', calc_pore_len=True, **kwargs): r""" Calculate the diffusive conductance of conduits in network, where a conduit is ( 1/2 pore - full throat - 1/2 pore ) based on the areas Parameters ---------- network : OpenPNM Network Object phase : OpenPNM Phase Object The phase of interest Notes ----- (1) This function requires that all the necessary phase properties already be calculated. (2) This function calculates the specified property for the *entire* network then extracts the values for the appropriate throats at the end. """ #Get Nt-by-2 list of pores connected to each throat Ps = network['throat.conns'] #Get properties in every pore in the network parea = network[pore_area] pdia = network[pore_diameter] #Get the properties of every throat tarea = network[throat_area] tlen = network[throat_length] #Interpolate pore phase property values to throats cp = phase[pore_molar_density] ct = phase.interpolate_data(data=cp) DABp = phase[pore_diffusivity] DABt = phase.interpolate_data(data=DABp) if calc_pore_len: lengths = misc.conduit_lengths(network, mode='centroid') plen1 = lengths[:, 0] plen2 = lengths[:, 2] else: plen1 = (0.5 * pdia[Ps[:, 0]]) plen2 = (0.5 * pdia[Ps[:, 1]]) #remove any non-positive lengths plen1[plen1 <= 0] = 1e-12 plen2[plen2 <= 0] = 1e-12 #Find g for half of pore 1 gp1 = ct * DABt * parea[Ps[:, 0]] / plen1 gp1[_sp.isnan(gp1)] = _sp.inf gp1[~(gp1 > 0)] = _sp.inf # Set 0 conductance pores (boundaries) to inf #Find g for half of pore 2 gp2 = ct * DABt * parea[Ps[:, 1]] / plen2 gp2[_sp.isnan(gp2)] = _sp.inf gp2[~(gp2 > 0)] = _sp.inf # Set 0 conductance pores (boundaries) to inf #Find g for full throat #remove any non-positive lengths tlen[tlen <= 0] = 1e-12 gt = ct * DABt * tarea / tlen value = (1 / gt + 1 / gp1 + 1 / gp2)**(-1) value = value[phase.throats(physics.name)] return value
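# Quick numeric illustration (made-up values) of the series-conductance formula
# used at the end of bulk_diffusion above: the conduit conductance
# g = (1/g_t + 1/g_p1 + 1/g_p2)**-1 always falls below its smallest element.
gp1, gt, gp2 = 2.0, 0.5, 3.0
g_conduit = (1.0 / gt + 1.0 / gp1 + 1.0 / gp2) ** -1
print(g_conduit)  # ~0.353, below the throat conductance of 0.5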
def bedTopo(east_grd_path, slope_grd_path, thickness_txt_path): import math import matplotlib import matplotlib.pyplot import os import scipy from scipy.io import netcdf from scipy.sparse import lil_matrix import subprocess assert os.path.exists( east_grd_path), "\n***** ERROR: " + east_grd_path + " does not exist\n" assert os.path.exists( slope_grd_path ), "\n***** ERROR: " + slope_grd_path + " does not exist\n" assert os.path.exists( thickness_txt_path ), "\n***** ERROR: " + thickness_txt_path + " does not exist\n" north_grd_path = east_grd_path.replace("east", "north") angles_grd_path = east_grd_path.replace("eastxyz", "angles") mag_grd_path = east_grd_path.replace("eastxyz", "mag") if not os.path.exists(angles_grd_path): cmd = "\ngrdmath " + north_grd_path + " " + east_grd_path + " ATAN2 --IO_NC4_CHUNK_SIZE=c = " + angles_grd_path + "\n" subprocess.call(cmd, shell=True) cmd = "\ngrdclip " + angles_grd_path + " -Sa0.7853981633974483/NaN -Sb0/NaN -Gone.grd\n" cmd += "\ngrdclip " + angles_grd_path + " -Sa1.5707963267948966/NaN -Sb0.7853981633974483/NaN -Gtwo.grd\n" cmd += "\ngrdclip " + angles_grd_path + " -Sa2.356194490192345/NaN -Sb1.5707963267948966/NaN -Gthree.grd\n" cmd += "\ngrdclip " + angles_grd_path + " -Sa3.141592653589793/NaN -Sb2.356194490192345/NaN -Gfour.grd\n" cmd += "\ngrdclip " + angles_grd_path + " -Sa-2.356194490192345/NaN -Gfive.grd\n" cmd += "\ngrdclip " + angles_grd_path + " -Sa-1.5707963267948966/NaN -Sb-2.356194490192345/NaN -Gsix.grd\n" cmd += "\ngrdclip " + angles_grd_path + " -Sa-0.7853981633974483/NaN -Sb-1.5707963267948966/NaN -Gseven.grd\n" cmd += "\ngrdclip " + angles_grd_path + " -Sa0/NaN -Sb-0.7853981633974483/NaN -Geight.grd\n" cmd += "\ngrdmath one.grd two.grd AND = u.grd\n" cmd += "\ngrdmath three.grd u.grd AND = u.grd\n" cmd += "\ngrdmath four.grd u.grd AND = u.grd\n" cmd += "\ngrdmath five.grd six.grd AND = d.grd\n" cmd += "\ngrdmath seven.grd d.grd AND = d.grd\n" cmd += "\ngrdmath eight.grd d.grd AND = d.grd\n" cmd += "\ngrdmath three.grd four.grd AND = l.grd\n" cmd += "\ngrdmath five.grd l.grd AND = l.grd\n" cmd += "\ngrdmath six.grd l.grd AND = l.grd\n" cmd += "\ngrdmath one.grd two.grd AND = r.grd\n" cmd += "\ngrdmath eight.grd r.grd AND = r.grd\n" cmd += "\ngrdmath seven.grd r.grd AND = r.grd\n" cmd += "\ngrdmath two.grd three.grd AND = ul.grd\n" cmd += "\ngrdmath four.grd ul.grd AND = ul.grd\n" cmd += "\ngrdmath five.grd ul.grd AND = ul.grd\n" cmd += "\ngrdmath one.grd two.grd AND = ur.grd\n" cmd += "\ngrdmath three.grd ur.grd AND = ur.grd\n" cmd += "\ngrdmath eight.grd ur.grd AND = ur.grd\n" cmd += "\ngrdmath four.grd five.grd AND = dl.grd\n" cmd += "\ngrdmath six.grd dl.grd AND = dl.grd\n" cmd += "\ngrdmath seven.grd dl.grd AND = dl.grd\n" cmd += "\ngrdmath one.grd six.grd AND = dr.grd\n" cmd += "\ngrdmath seven.grd dr.grd AND = dr.grd\n" cmd += "\ngrdmath eight.grd dr.grd AND = dr.grd\n" cmd += "\ngrdmath 1.5707963267948966 u.grd SUB = u.grd\n" cmd += "\ngrdmath u.grd ABS = u.grd\n" cmd += "\ngrdmath u.grd COS = u.grd\n" cmd += "\ngrdmath u.grd " + mag_grd_path + " MUL --IO_NC4_CHUNK_SIZE=c = u.grd" cmd += "\ngrdmath -1.5707963267948966 d.grd SUB = d.grd\n" cmd += "\ngrdmath d.grd ABS = d.grd\n" cmd += "\ngrdmath d.grd COS = d.grd\n" cmd += "\ngrdmath d.grd " + mag_grd_path + " MUL --IO_NC4_CHUNK_SIZE=c = d.grd" cmd += "\ngrdmath 3.141592653589793 l.grd SUB = l.grd\n" cmd += "\ngrdmath l.grd ABS = l.grd\n" cmd += "\ngrdmath l.grd COS = l.grd\n" cmd += "\ngrdmath l.grd " + mag_grd_path + " MUL --IO_NC4_CHUNK_SIZE=c = l.grd" cmd += "\ngrdmath 0 
r.grd SUB = r.grd\n" cmd += "\ngrdmath r.grd ABS = r.grd\n" cmd += "\ngrdmath r.grd COS = r.grd\n" cmd += "\ngrdmath r.grd " + mag_grd_path + " MUL --IO_NC4_CHUNK_SIZE=c = r.grd" cmd += "\ngrdmath 2.356194490192345 ul.grd SUB = ul.grd\n" cmd += "\ngrdmath ul.grd ABS = ul.grd\n" cmd += "\ngrdmath ul.grd COS = ul.grd\n" cmd += "\ngrdmath ul.grd " + mag_grd_path + " MUL --IO_NC4_CHUNK_SIZE=c = ul.grd" cmd += "\ngrdmath 0.7853981633974483 ur.grd SUB = ur.grd\n" cmd += "\ngrdmath ur.grd ABS = ur.grd\n" cmd += "\ngrdmath ur.grd COS = ur.grd\n" cmd += "\ngrdmath ur.grd " + mag_grd_path + " MUL --IO_NC4_CHUNK_SIZE=c = ur.grd" cmd += "\ngrdmath -2.356194490192345 dl.grd SUB = dl.grd\n" cmd += "\ngrdmath dl.grd ABS = dl.grd\n" cmd += "\ngrdmath dl.grd COS = dl.grd\n" cmd += "\ngrdmath dl.grd " + mag_grd_path + " MUL --IO_NC4_CHUNK_SIZE=c = dl.grd" cmd += "\ngrdmath -0.7853981633974483 dr.grd SUB = dr.grd\n" cmd += "\ngrdmath dr.grd ABS = dr.grd\n" cmd += "\ngrdmath dr.grd COS = dr.grd\n" cmd += "\ngrdmath dr.grd " + mag_grd_path + " MUL --IO_NC4_CHUNK_SIZE=c = dr.grd" subprocess.call(cmd, shell=True) f = netcdf.netcdf_file("u.grd", "r", False) x = f.variables["x"].data y = f.variables["y"].data u = f.variables["z"].data[:] f.close() f = netcdf.netcdf_file("d.grd", "r", False) x = f.variables["x"].data y = f.variables["y"].data d = f.variables["z"].data[:] f.close() f = netcdf.netcdf_file("l.grd", "r", False) x = f.variables["x"].data y = f.variables["y"].data l = f.variables["z"].data[:] f.close() f = netcdf.netcdf_file("r.grd", "r", False) x = f.variables["x"].data y = f.variables["y"].data r = f.variables["z"].data[:] f.close() f = netcdf.netcdf_file("ul.grd", "r", False) x = f.variables["x"].data y = f.variables["y"].data ul = f.variables["z"].data[:] f.close() f = netcdf.netcdf_file("ur.grd", "r", False) x = f.variables["x"].data y = f.variables["y"].data ur = f.variables["z"].data[:] f.close() f = netcdf.netcdf_file("dl.grd", "r", False) x = f.variables["x"].data y = f.variables["y"].data dl = f.variables["z"].data[:] f.close() f = netcdf.netcdf_file("dr.grd", "r", False) x = f.variables["x"].data y = f.variables["y"].data dr = f.variables["z"].data[:] f.close() f = netcdf.netcdf_file(mag_grd_path, "r", False) x = f.variables["x"].data y = f.variables["y"].data speeds = f.variables["z"].data[:] f.close() f = netcdf.netcdf_file(slope_grd_path, "r", False) x = f.variables["x"].data y = f.variables["y"].data slopes = f.variables["z"].data[:] f.close() f = netcdf.netcdf_file(angles_grd_path, "r", False) x = f.variables["x"].data y = f.variables["y"].data angles = f.variables["z"].data[:] f.close() width = f.dimensions["x"] length = f.dimensions["y"] min_x = min(x) max_x = max(x) min_y = min(y) max_y = max(y) inc = int((max(x) - min(x)) / (width - 1)) # Read in ice-only pixels # f = netcdf.netcdf_file("ice_only.grd","r",False); f = netcdf.netcdf_file(east_grd_path, "r", False) x = f.variables["x"].data y = f.variables["y"].data ice_vals = f.variables["z"].data[:] f.close() # Read in thicknesses, initialize fluxes thicknesses = {} f_lons = {} f_lats = {} dr_stresses = {} basal_drags = {} fluxes = scipy.zeros((length, width)) locked = scipy.zeros((length, width)) infile = open(thickness_txt_path, "r") for line in infile: utm_x, utm_y, thickness = line.strip().split() # j = str(int(math.floor((float(utm_x) - float(min_x)) / int(inc)))); # i = str(int(math.floor((float(utm_y) - float(min_y)) / int(inc)))); j = str(int(round((float(utm_x) - float(min_x)) / int(inc)))) i = str(int(round((float(utm_y) - 
float(min_y)) / int(inc)))) thicknesses[i + " " + j] = float(thickness) fluxes[int(i), int(j)] = speeds[int(i), int(j)] * float(thickness) locked[int(i), int(j)] = 1 # print(str(int(j) * int(inc) + float(min_x)) + " " + str(int(i) * int(inc) + float(min_y)) + " " + thickness); infile.close() # Iteratively calculate fluxes, thicknesses max_iterations = 50 cur_iteration = 0 cs1 = 0.0 cs2 = 0.0 todo = thicknesses.keys() while cur_iteration < max_iterations: tolock = {} inputs = {} outputs = {} for coord in todo: str_i, str_j = coord.split() y_i = int(str_i) x_i = int(str_j) cs1 += fluxes[y_i, x_i] in_total = 0.0 out_total = 0.0 cs3 = 0.0 factor = 4 # Calculate input fluxes if locked[y_i - 1, x_i] < 1 and not scipy.isnan( ice_vals[y_i - 1, x_i]) and not scipy.isnan(u[y_i - 1, x_i]): in_total += u[y_i - 1, x_i]**factor if locked[y_i + 1, x_i] < 1 and not scipy.isnan( ice_vals[y_i + 1, x_i]) and not scipy.isnan(d[y_i + 1, x_i]): in_total += d[y_i + 1, x_i]**factor if locked[y_i, x_i + 1] < 1 and not scipy.isnan( ice_vals[y_i, x_i + 1]) and not scipy.isnan(l[y_i, x_i + 1]): in_total += l[y_i, x_i + 1]**factor if locked[y_i, x_i - 1] < 1 and not scipy.isnan( ice_vals[y_i, x_i - 1]) and not scipy.isnan(r[y_i, x_i - 1]): in_total += r[y_i, x_i - 1]**factor if locked[y_i - 1, x_i + 1] < 1 and not scipy.isnan( ice_vals[y_i - 1, x_i + 1]) and not scipy.isnan( ul[y_i - 1, x_i + 1]): in_total += ul[y_i - 1, x_i + 1]**factor if locked[y_i - 1, x_i - 1] < 1 and not scipy.isnan( ice_vals[y_i - 1, x_i - 1]) and not scipy.isnan( ur[y_i - 1, x_i - 1]): in_total += ur[y_i - 1, x_i - 1]**factor if locked[y_i + 1, x_i + 1] < 1 and not scipy.isnan( ice_vals[y_i + 1, x_i + 1]) and not scipy.isnan( dl[y_i + 1, x_i + 1]): in_total += dl[y_i + 1, x_i + 1]**factor if locked[y_i + 1, x_i - 1] < 1 and not scipy.isnan( ice_vals[y_i + 1, x_i - 1]) and not scipy.isnan( dr[y_i + 1, x_i - 1]): in_total += dr[y_i + 1, x_i - 1]**factor if locked[y_i - 1, x_i] < 1 and not scipy.isnan( ice_vals[y_i - 1, x_i]) and not scipy.isnan(u[y_i - 1, x_i]): fluxes[y_i - 1, x_i] += fluxes[y_i, x_i] * (u[y_i - 1, x_i]**factor / in_total) tolock[str(y_i - 1) + " " + str(x_i)] = True inputs[str(y_i - 1) + " " + str(x_i)] = True cs3 += (u[y_i - 1, x_i]**factor / in_total) if locked[y_i + 1, x_i] < 1 and not scipy.isnan( ice_vals[y_i + 1, x_i]) and not scipy.isnan(d[y_i + 1, x_i]): fluxes[y_i + 1, x_i] += fluxes[y_i, x_i] * (d[y_i + 1, x_i]**factor / in_total) tolock[str(y_i + 1) + " " + str(x_i)] = True inputs[str(y_i + 1) + " " + str(x_i)] = True cs3 += (d[y_i + 1, x_i]**factor / in_total) if locked[y_i, x_i + 1] < 1 and not scipy.isnan( ice_vals[y_i, x_i + 1]) and not scipy.isnan(l[y_i, x_i + 1]): fluxes[y_i, x_i + 1] += fluxes[y_i, x_i] * (l[y_i, x_i + 1]**factor / in_total) tolock[str(y_i) + " " + str(x_i + 1)] = True inputs[str(y_i) + " " + str(x_i + 1)] = True cs3 += (l[y_i, x_i + 1]**factor / in_total) if locked[y_i, x_i - 1] < 1 and not scipy.isnan( ice_vals[y_i, x_i - 1]) and not scipy.isnan(r[y_i, x_i - 1]): fluxes[y_i, x_i - 1] += fluxes[y_i, x_i] * (r[y_i, x_i - 1]**factor / in_total) tolock[str(y_i) + " " + str(x_i - 1)] = True inputs[str(y_i) + " " + str(x_i - 1)] = True cs3 += (r[y_i, x_i - 1]**factor / in_total) if locked[y_i - 1, x_i + 1] < 1 and not scipy.isnan( ice_vals[y_i - 1, x_i + 1]) and not scipy.isnan( ul[y_i - 1, x_i + 1]): fluxes[y_i - 1, x_i + 1] += fluxes[y_i, x_i] * (ul[y_i - 1, x_i + 1]**factor / in_total) tolock[str(y_i - 1) + " " + str(x_i + 1)] = True inputs[str(y_i - 1) + " " + str(x_i + 1)] = True cs3 += 
(ul[y_i - 1, x_i + 1]**factor / in_total) if locked[y_i - 1, x_i - 1] < 1 and not scipy.isnan( ice_vals[y_i - 1, x_i - 1]) and not scipy.isnan( ur[y_i - 1, x_i - 1]): fluxes[y_i - 1, x_i - 1] += fluxes[y_i, x_i] * (ur[y_i - 1, x_i - 1]**factor / in_total) tolock[str(y_i - 1) + " " + str(x_i - 1)] = True inputs[str(y_i - 1) + " " + str(x_i - 1)] = True cs3 += (ur[y_i - 1, x_i - 1]**factor / in_total) if locked[y_i + 1, x_i + 1] < 1 and not scipy.isnan( ice_vals[y_i + 1, x_i + 1]) and not scipy.isnan( dl[y_i + 1, x_i + 1]): fluxes[y_i + 1, x_i + 1] += fluxes[y_i, x_i] * (dl[y_i + 1, x_i + 1]**factor / in_total) tolock[str(y_i + 1) + " " + str(x_i + 1)] = True inputs[str(y_i + 1) + " " + str(x_i + 1)] = True cs3 += (dl[y_i + 1, x_i + 1]**factor / in_total) if locked[y_i + 1, x_i - 1] < 1 and not scipy.isnan( ice_vals[y_i + 1, x_i - 1]) and not scipy.isnan( dr[y_i + 1, x_i - 1]): fluxes[y_i + 1, x_i - 1] += fluxes[y_i, x_i] * (dr[y_i + 1, x_i - 1]**factor / in_total) tolock[str(y_i + 1) + " " + str(x_i - 1)] = True inputs[str(y_i + 1) + " " + str(x_i - 1)] = True cs3 += (dr[y_i + 1, x_i - 1]**factor / in_total) # Calculate output fluxes """ if locked[y_i-1,x_i] < 1 and not scipy.isnan(ice_vals[y_i-1,x_i]) and not scipy.isnan(d[y_i,x_i]) and (str(y_i-1) + " " + str(x_i)) not in inputs: out_total += d[y_i,x_i]**factor; if locked[y_i+1,x_i] < 1 and not scipy.isnan(ice_vals[y_i+1,x_i]) and not scipy.isnan(u[y_i,x_i]) and (str(y_i+1) + " " + str(x_i)) not in inputs: out_total += u[y_i,x_i]**factor; if locked[y_i,x_i+1] < 1 and not scipy.isnan(ice_vals[y_i,x_i+1]) and not scipy.isnan(r[y_i,x_i]) and (str(y_i) + " " + str(x_i+1)) not in inputs: out_total += r[y_i,x_i]**factor; if locked[y_i,x_i-1] < 1 and not scipy.isnan(ice_vals[y_i,x_i-1]) and not scipy.isnan(l[y_i,x_i]) and (str(y_i) + " " + str(x_i-1)) not in inputs: out_total += l[y_i,x_i]**factor; if locked[y_i-1,x_i+1] < 1 and not scipy.isnan(ice_vals[y_i-1,x_i+1]) and not scipy.isnan(dr[y_i,x_i]) and (str(y_i-1) + " " + str(x_i+1)) not in inputs: out_total += dr[y_i,x_i]**factor; if locked[y_i-1,x_i-1] < 1 and not scipy.isnan(ice_vals[y_i-1,x_i-1]) and not scipy.isnan(dl[y_i,x_i]) and (str(y_i-1) + " " + str(x_i-1)) not in inputs: out_total += dl[y_i,x_i]**factor; if locked[y_i+1,x_i+1] < 1 and not scipy.isnan(ice_vals[y_i+1,x_i+1]) and not scipy.isnan(ur[y_i,x_i]) and (str(y_i+1) + " " + str(x_i+1)) not in inputs: out_total += ur[y_i,x_i]**factor; if locked[y_i+1,x_i-1] < 1 and not scipy.isnan(ice_vals[y_i+1,x_i-1]) and not scipy.isnan(ul[y_i,x_i]) and (str(y_i+1) + " " + str(x_i-1)) not in inputs: out_total += ul[y_i,x_i]**factor; if locked[y_i-1,x_i] < 1 and not scipy.isnan(ice_vals[y_i-1,x_i]) and not scipy.isnan(d[y_i,x_i]) and (str(y_i-1) + " " + str(x_i)) not in inputs: fluxes[y_i-1,x_i] += fluxes[y_i,x_i] * (d[y_i,x_i]**factor / out_total); tolock[str(y_i-1) + " " + str(x_i)] = True; if locked[y_i+1,x_i] < 1 and not scipy.isnan(ice_vals[y_i+1,x_i]) and not scipy.isnan(u[y_i,x_i]) and (str(y_i+1) + " " + str(x_i)) not in inputs: fluxes[y_i+1,x_i] += fluxes[y_i,x_i] * (u[y_i,x_i]**factor / out_total); tolock[str(y_i+1) + " " + str(x_i)] = True; if locked[y_i,x_i+1] < 1 and not scipy.isnan(ice_vals[y_i,x_i+1]) and not scipy.isnan(r[y_i,x_i]) and (str(y_i) + " " + str(x_i+1)) not in inputs: fluxes[y_i,x_i+1] += fluxes[y_i,x_i] * (r[y_i,x_i]**factor / out_total); tolock[str(y_i) + " " + str(x_i+1)] = True; if locked[y_i,x_i-1] < 1 and not scipy.isnan(ice_vals[y_i,x_i-1]) and not scipy.isnan(l[y_i,x_i]) and (str(y_i) + " " + str(x_i-1)) not 
in inputs:
        fluxes[y_i,x_i-1] += fluxes[y_i,x_i] * (l[y_i,x_i]**factor / out_total); tolock[str(y_i) + " " + str(x_i-1)] = True;
    if locked[y_i-1,x_i+1] < 1 and not scipy.isnan(ice_vals[y_i-1,x_i+1]) and not scipy.isnan(dr[y_i,x_i]) and (str(y_i-1) + " " + str(x_i+1)) not in inputs:
        fluxes[y_i-1,x_i+1] += fluxes[y_i,x_i] * (dr[y_i,x_i]**factor / out_total); tolock[str(y_i-1) + " " + str(x_i+1)] = True;
    if locked[y_i-1,x_i-1] < 1 and not scipy.isnan(ice_vals[y_i-1,x_i-1]) and not scipy.isnan(dl[y_i,x_i]) and (str(y_i-1) + " " + str(x_i-1)) not in inputs:
        fluxes[y_i-1,x_i-1] += fluxes[y_i,x_i] * (dl[y_i,x_i]**factor / out_total); tolock[str(y_i-1) + " " + str(x_i-1)] = True;
    if locked[y_i+1,x_i+1] < 1 and not scipy.isnan(ice_vals[y_i+1,x_i+1]) and not scipy.isnan(ur[y_i,x_i]) and (str(y_i+1) + " " + str(x_i+1)) not in inputs:
        fluxes[y_i+1,x_i+1] += fluxes[y_i,x_i] * (ur[y_i,x_i]**factor / out_total); tolock[str(y_i+1) + " " + str(x_i+1)] = True;
    if locked[y_i+1,x_i-1] < 1 and not scipy.isnan(ice_vals[y_i+1,x_i-1]) and not scipy.isnan(ul[y_i,x_i]) and (str(y_i+1) + " " + str(x_i-1)) not in inputs:
        fluxes[y_i+1,x_i-1] += fluxes[y_i,x_i] * (ul[y_i,x_i]**factor / out_total); tolock[str(y_i+1) + " " + str(x_i-1)] = True;
    """
            # print(x[x_i],y[y_i]);
            # print(x[x_i-1],y[y_i]);
            # print(x[x_i+1],y[y_i]);
            # print(x[x_i],y[y_i-1]);
            # print(x[x_i],y[y_i+1]);
            # print(x[x_i+1],y[y_i+1]);
            # print(x[x_i+1],y[y_i-1]);
            # print(x[x_i-1],y[y_i+1]);
            # print(x[x_i-1],y[y_i-1]);
            # return;

        # Lock the cells that just received flux and assign them a thickness.
        for coord in tolock:
            str_i, str_j = coord.split()
            i = int(str_i)
            j = int(str_j)
            cs2 += fluxes[i, j]
            thicknesses[coord] = fluxes[i, j] / speeds[i, j]
            locked[i, j] = 1
        todo = tolock.keys()
        # print(cs1, cs2, cs3);
        cur_iteration += 1

    for coord in thicknesses:
        # The keys store the indices as strings; convert them back to integers
        # before using them to index the arrays (indexing with the raw strings
        # would fail).
        str_i, str_j = coord.split()
        i = int(str_i)
        j = int(str_j)
        angle = angles[i, j]
        sub_speeds = speeds[i - 1:i + 2, j - 1:j + 2]
        sub_angles = angles[i - 1:i + 2, j - 1:j + 2]
        indices_x = [1, 2, 2, 2, 1, 0, 0, 0]
        indices_y = [0, 2, 0, 1, 2, 0, 2, 1]
        if angle >= math.pi / 4 and angle < math.pi / 2:
            indices_x = [2, 2, 2, 1, 0, 0, 0, 1]
            indices_y = [1, 2, 0, 2, 1, 0, 2, 0]
        elif angle >= math.pi / 2 and angle < 3 * math.pi / 4:
            indices_x = [2, 1, 2, 0, 0, 1, 0, 2]
            indices_y = [1, 2, 2, 2, 1, 0, 0, 0]
        elif angle >= 3 * math.pi / 4 and angle <= math.pi:
            indices_x = [2, 0, 1, 0, 0, 2, 1, 2]
            indices_y = [2, 2, 2, 1, 0, 0, 0, 1]
        elif angle < 0 and angle >= -1 * math.pi / 4:
            indices_x = [1, 2, 0, 2, 1, 0, 2, 0]
            indices_y = [0, 0, 0, 1, 2, 2, 2, 1]
        elif angle < -1 * math.pi / 4 and angle >= -1 * math.pi / 2:
            indices_x = [0, 2, 0, 1, 2, 0, 2, 1]
            indices_y = [0, 0, 1, 0, 2, 2, 1, 2]
        elif angle < -1 * math.pi / 2 and angle >= -3 * math.pi / 4:
            indices_x = [0, 1, 0, 0, 2, 1, 2, 2]
            indices_y = [1, 0, 2, 0, 1, 2, 0, 2]
        elif angle < -3 * math.pi / 4 and angle >= -1 * math.pi:
            indices_x = [0, 0, 1, 0, 2, 2, 1, 2]
            indices_y = [2, 0, 2, 1, 0, 1, 0, 2]
        # dirDeriv() is defined with the signature (val_mat, direction_mat, inc)
        # and returns all four derivatives at once; the earlier per-component
        # calls passed an extra indices argument that the function does not
        # accept. indices_x/indices_y are kept above but are not used by the
        # current dirDeriv().
        dux_dx, dux_dy, duy_dx, duy_dy = dirDeriv(sub_speeds, sub_angles, inc)
        out_str = str(x[int(j)]) + " " + str(y[int(i)]) + " " + str(
            thicknesses[coord]) + " " + str(fluxes[i, j])
        if coord in inputs:
            out_str += " input"
        elif coord in outputs:
            out_str += " output"
        print(out_str)
    os.remove("one.grd")
    os.remove("two.grd")
    os.remove("three.grd")
    os.remove("four.grd")
    os.remove("five.grd")
    os.remove("six.grd")
    os.remove("seven.grd")
    os.remove("eight.grd")
    os.remove("u.grd")
    os.remove("d.grd")
    os.remove("l.grd")
    os.remove("r.grd")
os.remove("ul.grd") os.remove("ur.grd") os.remove("dl.grd") os.remove("dr.grd") return
def dirDeriv(val_mat, direction_mat, inc):
    import scipy
    import scipy.linalg
    import math
    cell_angles = scipy.array(
        [[3 * math.pi / 4, math.pi / 2, math.pi / 4],
         [math.pi, scipy.nan, 0],
         [-3 * math.pi / 4, -1 * math.pi / 2, -1 * math.pi / 4]])
    cell_incs = scipy.array(
        [[(inc**2 + inc**2)**0.5, inc, (inc**2 + inc**2)**0.5],
         [inc, scipy.nan, inc],
         [(inc**2 + inc**2)**0.5, inc, (inc**2 + inc**2)**0.5]])
    angle = direction_mat[1, 1]
    vals_x = scipy.cos(angle - direction_mat) * val_mat
    vals_y = scipy.sin(angle - direction_mat) * val_mat
    cell_cosines_f = scipy.cos(angle - cell_angles)
    cell_cosines_b = scipy.cos(angle - cell_angles)
    cell_sines_f = scipy.sin(angle - cell_angles)
    cell_sines_b = scipy.sin(angle - cell_angles)
    cell_cosines_f[cell_cosines_f < 0.00001] = scipy.nan
    cell_cosines_f = cell_cosines_f**2
    cell_cosines_f = cell_cosines_f / sum(
        cell_cosines_f[~scipy.isnan(cell_cosines_f)])
    cell_cosines_b[cell_cosines_b > -0.00001] = scipy.nan
    cell_cosines_b = cell_cosines_b**2
    cell_cosines_b = cell_cosines_b / sum(
        cell_cosines_b[~scipy.isnan(cell_cosines_b)])
    cell_sines_f[cell_sines_f < 0.00001] = scipy.nan
    cell_sines_f = cell_sines_f**2
    cell_sines_f = cell_sines_f / sum(cell_sines_f[~scipy.isnan(cell_sines_f)])
    cell_sines_b[cell_sines_b > -0.00001] = scipy.nan
    cell_sines_b = cell_sines_b**2
    cell_sines_b = cell_sines_b / sum(cell_sines_b[~scipy.isnan(cell_sines_b)])
    temp = vals_x * cell_cosines_f
    ux_x_f = sum(temp[~scipy.isnan(temp)])
    temp = vals_x * cell_cosines_b
    ux_x_b = sum(temp[~scipy.isnan(temp)])
    temp = vals_x * cell_sines_f
    ux_y_f = sum(temp[~scipy.isnan(temp)])
    temp = vals_x * cell_sines_b
    ux_y_b = sum(temp[~scipy.isnan(temp)])
    temp = vals_y * cell_cosines_f
    uy_x_f = sum(temp[~scipy.isnan(temp)])
    temp = vals_y * cell_cosines_b
    uy_x_b = sum(temp[~scipy.isnan(temp)])
    temp = vals_y * cell_sines_f
    uy_y_f = sum(temp[~scipy.isnan(temp)])
    temp = vals_y * cell_sines_b
    uy_y_b = sum(temp[~scipy.isnan(temp)])
    ux_x = scipy.array([ux_x_b, val_mat[1, 1], ux_x_f])
    ux_y = scipy.array([ux_y_b, val_mat[1, 1], ux_y_f])
    uy_x = scipy.array([uy_x_b, 0, uy_x_f])
    uy_y = scipy.array([uy_y_b, 0, uy_y_f])
    xs = scipy.array([-1 * int(inc), 0, int(inc)])
    A = scipy.vstack([xs, scipy.ones(len(xs))]).T
    dux_dx, intercept = scipy.linalg.lstsq(A, ux_x)[0]
    dux_dy, intercept = scipy.linalg.lstsq(A, ux_y)[0]
    duy_dx, intercept = scipy.linalg.lstsq(A, uy_x)[0]
    duy_dy, intercept = scipy.linalg.lstsq(A, uy_y)[0]
    return dux_dx, dux_dy, duy_dx, duy_dy
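# dirDeriv() estimates directional derivatives of the speed field from a 3x3
# window: each neighbor value is projected onto the center cell's flow
# direction, cosine/sine weights form forward and backward averages, and a
# least-squares line through (backward, center, forward) gives the slope per
# grid increment. A hedged usage sketch with made-up numbers (not data from
# the original grids):
def dirDeriv_example():
    import scipy
    speeds_win = scipy.array([[95.0, 100.0, 105.0],
                              [98.0, 101.0, 104.0],
                              [96.0, 99.0, 103.0]])
    angles_win = scipy.zeros((3, 3))  # flow everywhere along +x
    # 100 m grid spacing, matching the 'inc' argument of dirDeriv().
    return dirDeriv(speeds_win, angles_win, 100)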
def formatdata(data):
    import scipy as sp

    # Split the two columns and drop rows where the y value is NaN.
    x = data[:, 0]
    y = data[:, 1]
    x = x[~sp.isnan(y)]
    y = y[~sp.isnan(y)]
    return x, y
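# formatdata() keeps only the (x, y) pairs whose y value is finite. A tiny
# illustration with invented numbers (not data from the surrounding code):
def formatdata_example():
    import scipy as sp
    data = sp.array([[0.0, 1.5], [1.0, sp.nan], [2.0, 3.0]])
    # Returns (array([0., 2.]), array([1.5, 3.]))
    return formatdata(data)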
def plotStrip(bp, at, ps, crop): """ Plot a strip of colored polygons along a trace of GPS coordinates (deg). Extension of the strip outward from the line to either side is specified by ps.sideRange, in meters. Parameters ---------- bp.polyList: master list of polygons, all lines included bp.colorList: master list of colors for each polygon bp.lineList: master list of survey lines at.fix : float (deg), (pktCount)x2 array [longitude, latitude] coordinates of ship, rows are packets in order. at.depth : float (m), array length pktCount Water depth beneath the ship at each fix. Used with extended lead-in length of cable to estimate sensor position by layback calculation. at.leadin : float (m) at.color : float (m), array length pktCount List of numbers for each position indicating the color to plot representing IP data results. """ # Start by transforming the fix points into a local azimuthal equidistant # reference system. Units along x and y are meters. ptList = [point(tuple(row)) for row in at.fix] dfPt = gpd.GeoDataFrame({'geometry': ptList}) # Assign the WGS84 latitude-longitude Coordinate Reference System (CRS). dfPt.crs = ps.crsWGS84 # Transform to the azimuthal equidistant reference. dfPt = dfPt.to_crs(ps.crsAzEq) # Extract the transformed coordinates into an array. flatFix = sp.zeros_like(at.fix, dtype=float) for p in range(len(flatFix)): flatFix[p, :] = sp.array(dfPt.geometry[p].coords) # (m) # Track vectors between each pair of consecutive GPS fixes. vParSeg = flatFix[1:, :] - flatFix[0:-1, :] # Length of each trach vector. segLen = sp.sqrt(vParSeg[:, 0]**2 + vParSeg[:, 1]**2) # (m) # Cumulative sum along the track line. sumLen = sp.hstack((0, sp.cumsum(segLen))) # Print the total line length (m). # print('%.1f m along line.' % (sumLen[-1])) # Distance between start and endpoints. startFinDist = mm.norm(flatFix[0, :] - flatFix[-1, :]) # print('%.1f m distance from start point to finish point.' % startFinDist) # Time elapsed on the line. lineTime = (at.cpuDT[-1] - at.cpuDT[0]).total_seconds() # print('%.0f s elapsed.' % lineTime) lineSpeed = startFinDist / lineTime # (m/s) lineSpeed *= 1.94384 # (kt) # print('%.1f kt average speed' % lineSpeed) # Interpolate a laidback fix location on the track line. # Layback the extra length at the start of the line according to # the boat's heading for the first few meters twice the length of # the cable lead in. newFix = sp.zeros_like(flatFix, dtype=float) linLoc = 2 * at.leadin closeIdx = sp.argmin(abs(sumLen - linLoc)) # If the line is at least as long as twice the lead in. if sumLen[-1] > linLoc: if linLoc >= sumLen[closeIdx]: idx1 = closeIdx idx2 = closeIdx + 1 else: idx1 = closeIdx - 1 idx2 = closeIdx l1 = sumLen[idx1] l2 = sumLen[idx2] startHeadingFix = flatFix[idx1, :] + ( flatFix[idx2, :] - flatFix[idx1, :]) * (linLoc - l1) / (l2 - l1) else: # Else just use the heading of the whole line. startHeadingFix = flatFix[-1, :] startHeadingVec = mm.unit(startHeadingFix - flatFix[0, :]) for p in range(len(flatFix)): linLoc = sumLen[p] - mm.cableRange(at.leadin, at.depth[p]) if linLoc >= 0: closeIdx = sp.argmin(abs(sumLen - linLoc)) if linLoc >= sumLen[closeIdx]: idx1 = closeIdx idx2 = closeIdx + 1 else: idx1 = closeIdx - 1 idx2 = closeIdx l1 = sumLen[idx1] l2 = sumLen[idx2] if l1 != l2: newFix[p, :] = flatFix[idx1, :] + (flatFix[idx2, :] - flatFix[ idx1, :]) * (linLoc - l1) / (l2 - l1) else: # Case of interpolation between two repeated locations. 
newFix[p, :] = flatFix[idx1, :] else: newFix[p, :] = flatFix[0, :] + linLoc * startHeadingVec # Overwrite. flatFix = newFix # Reevaluate track vectors between each pair of consecutive GPS fixes. vParSeg = flatFix[1:, :] - flatFix[0:-1, :] # Track vectors at each point, found from points before and after. vParPt = flatFix[2:, :] - flatFix[0:-2, :] # Include segment parallels for the boundary fix points. vParPt = sp.vstack((vParSeg[0, :], vParPt, vParSeg[-1, :])) # Midpoints along the sequence of GPS fixes. midPts = (flatFix[1:, :] + flatFix[0:-1, :]) / 2 # Perpendicular vectors at each segment and fix point. # Vector lengths are set to sideRange. vPerpSeg = ps.sideRange * mm.unit(mm.perp(vParSeg)) # (m) vPerpPt = ps.sideRange * mm.unit(mm.perp(vParPt)) # (m) # If cropping, only include fix points where asked. plottedPkts = sp.array(range(len(at.pkt))) if crop and ps.plotThis != 'crop': plottedPkts = plottedPkts[at.cropLogic] lastGoodVerts = sp.zeros((4, 2)) # Polygon patches for each packet. for p in plottedPkts: # Perpendicular displacement, length sideRange, at the first midpoint. if p != 0: # Identify a trailing midpoint which is different from the # present fix location. (Not between duplicate fixes.) pPrior = p - 1 while pPrior >= 0 and all(midPts[pPrior, :] == flatFix[p, :]): pPrior -= 1 vert01 = sp.vstack((midPts[pPrior, :] - vPerpSeg[pPrior, :], midPts[pPrior, :] + vPerpSeg[pPrior, :])) else: vert01 = sp.zeros((0, 2)) # Polygon points offset from the flat fix points themselves. vert2 = flatFix[p, :] + vPerpPt[p, :] vert5 = flatFix[p, :] - vPerpPt[p, :] if p != len(flatFix) - 1: # at the second midpoint. vert34 = sp.vstack( (midPts[p, :] + vPerpSeg[p, :], midPts[p, :] - vPerpSeg[p, :])) else: vert34 = sp.zeros((0, 2)) # Polygon vertices. verts = sp.vstack((vert01, vert2, vert34, vert5)) # In the case where IP packets come in at a higher rate than the GPS # fixes are updated, consecutive packets have the same position at # times. In this case, reuse the last useable polygon. This will plot # on top of the reused position. if sp.isnan(verts).any(): verts = lastGoodVerts.copy() else: lastGoodVerts = verts.copy() # Vertices as tuples in a list. vertList = [tuple(row) for row in verts] # Append the latest polygon vertices to the list of polygons. bp.polyList.append(polygon(vertList)) bp.colorList = sp.hstack((bp.colorList, at.color[plottedPkts])) # Include each segment between the fix coordinates as its own line object. for p in plottedPkts: if p < len(flatFix) - 1: endPts = [tuple(row) for row in flatFix[p:p + 2, :]] if at.xmitFund == 8: bp.lineList.append(lineStr(endPts)) if ps.saveTxt: # Pseudocolor plots. txtName = 'ch%d_H%d_%s_%s_%d.txt' % ( ps.ch, ps.h, ps.plotThis, at.fileDateStr, at.fileNum, ) txtPath = os.path.join(ps.folderPath, 'plotData', ps.plotThis, txtName) with open(txtPath, 'w') as f: for p in range(at.pktCount): # longi (deg), lat (deg), color (?) wStr = (str(dfPt.geometry[p].x) + ',' + str(dfPt.geometry[p].y) + ',' + str(at.color[p]) + '\n') f.write(wStr)
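# plotStrip() places each laid-back fix by walking a target distance linLoc
# along the cumulative track length and interpolating linearly between the two
# bracketing GPS fixes. A minimal stand-alone sketch of that interpolation
# step; the function name and argument layout are illustrative, not part of
# the original module:
def interp_along_track(flatFix, sumLen, linLoc):
    import scipy as sp
    # flatFix: (n, 2) array of projected fix coordinates in meters.
    # sumLen: length-n cumulative distance along the track, starting at 0.
    closeIdx = sp.argmin(abs(sumLen - linLoc))
    if linLoc >= sumLen[closeIdx]:
        idx1, idx2 = closeIdx, closeIdx + 1
    else:
        idx1, idx2 = closeIdx - 1, closeIdx
    l1, l2 = sumLen[idx1], sumLen[idx2]
    if l1 == l2:
        # Repeated fixes at the same position: fall back to the earlier point.
        return flatFix[idx1, :]
    return flatFix[idx1, :] + (flatFix[idx2, :] - flatFix[idx1, :]) * \
        (linLoc - l1) / (l2 - l1)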
def parse_cegs_drosophila_phenotypes( phenotype_file='/Users/bjarnivilhjalmsson/data/cegs_lehmann/allphenotypes_5.0_cleaned.tab.reps.hdf5', ): """ Parser for CEGS Drosophila phenotype data """ import pylab #Load phenotypes... ph5f = h5py.File(phenotype_file) #Now take the median and mean of all values for all individuals. phen_dict = {} for phen in ph5f.keys(): #First mated Y_mated = ph5f[phen]['Y_mated'][...] Z_mated = ph5f[phen]['Z_mated'][...] sample_filter = sp.negative(sp.isnan(Y_mated)) Ys_sum = sp.dot(Y_mated[sample_filter], Z_mated[sample_filter]) rep_count = sp.dot(sp.ones(sum(sample_filter)), Z_mated[sample_filter]) Y_means = Ys_sum / rep_count #Now calculate medians by iteration. phen_vals_list = [[] for i in range(216)] for i in range(len(Y_mated)): ind_i = sp.where(1 == Z_mated[i])[0][0] phen_vals_list[ind_i].append(Y_mated[i]) medians = sp.zeros(216) for i, pl in enumerate(phen_vals_list): if len(pl) > 0: medians[i] = sp.median(pl) else: medians[i] = sp.nan ind_filter = sp.negative(sp.isnan(Y_means)) if phen == 'Triglyceride': ind_filter = (Y_means > 0) * ind_filter phen_dict[phen] = { 'mated': { 'Y_means': Y_means, 'rep_count': rep_count, 'ind_filter': ind_filter, 'Y_medians': medians } } print 'Plotting phenotype histograms for %s, %s' % (phen, 'mated') mated_filtered_means = Y_means[ind_filter] pylab.hist(mated_filtered_means) pylab.savefig( '/Users/bjarnivilhjalmsson/data/tmp/cegs_hist_%s_mated_means.png' % (phen)) pylab.clf() mated_filtered_medians = medians[ind_filter] pylab.hist(mated_filtered_medians) pylab.savefig( '/Users/bjarnivilhjalmsson/data/tmp/cegs_hist_%s_mated_medians.png' % (phen)) pylab.clf() #Then virgin Y_virgin = ph5f[phen]['Y_virgin'][...] Z_virgin = ph5f[phen]['Z_virgin'][...] sample_filter = sp.negative(sp.isnan(Y_virgin)) Ys_sum = sp.dot(Y_virgin[sample_filter], Z_virgin[sample_filter]) rep_count = sp.dot(sp.ones(sum(sample_filter)), Z_virgin[sample_filter]) Y_means = Ys_sum / rep_count #Now calculate medians by iteration. phen_vals_list = [[] for i in range(216)] for i in range(len(Y_virgin)): ind_i = sp.where(1 == Z_virgin[i])[0][0] phen_vals_list[ind_i].append(Y_virgin[i]) medians = sp.zeros(216) for i, pl in enumerate(phen_vals_list): if len(pl) > 0: medians[i] = sp.median(pl) else: medians[i] = sp.nan ind_filter = sp.negative(sp.isnan(Y_means)) if phen == 'Triglyceride': ind_filter = (Y_means > 0) * ind_filter phen_dict[phen]['virgin'] = { 'Y_means': Y_means, 'rep_count': rep_count, 'ind_filter': ind_filter, 'Y_medians': medians } print 'Plotting phenotype histograms for %s, %s' % (phen, 'virgin') virgin_filtered_means = Y_means[ind_filter] pylab.hist(virgin_filtered_means) pylab.savefig( '/Users/bjarnivilhjalmsson/data/tmp/cegs_hist_%s_virgin_means.png' % (phen)) pylab.clf() virgin_filtered_medians = medians[ind_filter] pylab.hist(virgin_filtered_medians) pylab.savefig( '/Users/bjarnivilhjalmsson/data/tmp/cegs_hist_%s_virgin_medians.png' % (phen)) pylab.clf() means_corr = sp.corrcoef(mated_filtered_means, virgin_filtered_means)[0, 1] medians_corr = sp.corrcoef(mated_filtered_medians, virgin_filtered_medians)[0, 1] print 'Correlation between mated and virgin flies, means: %0.2f, medians: %0.2f' % ( means_corr, medians_corr) phen_dict[phen]['corrs'] = { 'means': means_corr, 'medians': medians_corr } return phen_dict
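# In the parser above, Z_mated and Z_virgin are 0/1 indicator matrices mapping
# each replicate measurement to one of the 216 lines, so dot(Y, Z) / dot(ones, Z)
# gives the per-line mean over the non-missing replicates. A compact
# illustration with a toy indicator matrix (values invented for the example):
def per_line_means_example():
    import scipy as sp
    Y = sp.array([1.0, 3.0, 2.0, 4.0])              # four replicate measurements
    Z = sp.array([[1, 0], [1, 0], [0, 1], [0, 1]])  # two lines, two reps each
    rep_count = sp.dot(sp.ones(len(Y)), Z)          # replicates per line: [2, 2]
    return sp.dot(Y, Z) / rep_count                 # line means: [2.0, 3.0]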
def coordinate_cegs_genotype_phenotype( phen_dict, phenotype='Protein', env='mated', k_thres=0.8, ind_missing_thres=0.5, snp_missing_thres=0.05, maf_thres=0.1, genotype_file='/Users/bjarnivilhjalmsson/data/cegs_lehmann/CEGS.216.lines.NO_DPGP4.GATK.SNP.HETS.FILTERED.Filter_imputed.hdf5' ): """ Parse genotypes and coordinate with phenotype, and ready data for analysis. """ gh5f = h5py.File(genotype_file) p_dict = phen_dict[phenotype][env] print 'Loading SNPs' snps = sp.array(gh5f['gt'][...], dtype='single') snps = snps[:, p_dict['ind_filter']] positions = gh5f['pos'][...] m, n = snps.shape print 'Loaded %d SNPs for %d individuals' % (m, n) print 'Filtering individuals with missing rates >%0.2f' % ind_missing_thres missing_mat = sp.isnan(snps) ind_missing_rates = sp.sum(missing_mat, 0) / float(m) ind_filter = ind_missing_rates < ind_missing_thres snps = snps[:, ind_filter] n = sp.sum(ind_filter) print 'Filtered %d individuals due to high missing rates' % sp.sum( sp.negative(ind_filter)) gt_ids = gh5f['gt_ids'][p_dict['ind_filter']] gt_ids = gt_ids[ind_filter] Y_means = p_dict['Y_means'][p_dict['ind_filter']] Y_means = Y_means[ind_filter] Y_medians = p_dict['Y_medians'][p_dict['ind_filter']] Y_medians = Y_medians[ind_filter] rep_count = p_dict['rep_count'][p_dict['ind_filter']] rep_count = rep_count[ind_filter] print 'Now removing "bad" genotypes.' bad_genotypes = [ 'Raleigh_272', 'Raleigh_378', 'Raleigh_554', 'Raleigh_591', 'Raleigh_398', 'Raleigh_138', 'Raleigh_208', 'Raleigh_336', 'Raleigh_370', 'Raleigh_373', 'Raleigh_374', 'Raleigh_799', 'Raleigh_821', 'Raleigh_822', 'Raleigh_884', 'Raleigh_335' ] ind_filter = sp.negative(sp.in1d(gt_ids, bad_genotypes)) gt_ids = gt_ids[ind_filter] Y_means = Y_means[ind_filter] Y_medians = Y_medians[ind_filter] rep_count = rep_count[ind_filter] snps = snps[:, ind_filter] print 'Removed %d "bad" genotypes' % sp.sum(sp.negative(ind_filter)) n = len(snps[0]) print 'Filtering SNPs with missing rate >%0.2f' % snp_missing_thres missing_mat = sp.isnan(snps) snp_missing_rates = sp.sum(missing_mat, 1) / float(n) snps_filter = snp_missing_rates < snp_missing_thres snps = snps[snps_filter] positions = positions[snps_filter] m = sp.sum(snps_filter) print 'Filtered %d SNPs due to high missing rate' % sp.sum( sp.negative(snps_filter)) print 'Now imputing (w mean)' missing_mat = sp.isnan(snps) ok_counts = n - sp.sum(missing_mat, 1) snps[missing_mat] = 0 snp_means = sp.sum(snps, 1) / ok_counts # print snp_means.shape # print snp_means[:10] # import pdb # pdb.set_trace() for i in range(len(snps)): snps[i, missing_mat[i]] = snp_means[i] print 'And filtering SNPs with MAF<%0.2f' % maf_thres snp_means = sp.mean(snps, 1) snp_mafs = sp.minimum(snp_means, 1 - snp_means) snps_filter = snp_mafs > maf_thres snps = snps[snps_filter] positions = positions[snps_filter] print 'Filtered %d SNPs with low MAFs' % sp.sum(sp.negative(snps_filter)) print 'Filtering based on kinship w threshold:', k_thres import kinship K = kinship.calc_ibd_kinship(snps) print '\nKinship calculated' K_ind_filter = [] for i in range(n): K_ind_filter.append(not sp.any(K[i, i + 1:n] > k_thres)) if sum(K_ind_filter) == n: print 'No individuals were filtered based on kinship..' else: print 'Filtering %d individuals based on kinship.' 
% ( n - sum(K_ind_filter)) K_ind_filter = sp.array(K_ind_filter) gt_ids = gt_ids[K_ind_filter] Y_means = Y_means[K_ind_filter] Y_medians = Y_medians[K_ind_filter] rep_count = rep_count[K_ind_filter] snps = snps[:, K_ind_filter] print 'Again filtering SNPs with MAF<%0.2f' % maf_thres snp_means = sp.mean(snps, 1) snp_mafs = sp.minimum(snp_means, 1 - snp_means) snps_filter = snp_mafs > maf_thres snps = snps[snps_filter] positions = positions[snps_filter] print 'Filtered %d additional SNPs with low MAFs' % sp.sum( sp.negative(snps_filter)) print 'All filtering done.' m, n = snps.shape print 'In all there are %d SNPs remaining, for %d individuals.' % (m, n) ret_dict = { 'Y_means': Y_means, 'Y_medians': Y_medians, 'rep_count': rep_count, 'gt_ids': gt_ids, 'positions': positions, 'snps': snps } return ret_dict
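# The genotype coordination above imputes missing SNP calls with the per-SNP
# mean of the observed calls before the MAF filtering steps. A small
# self-contained sketch of that imputation step; array layout follows the snps
# matrix above (SNPs in rows, individuals in columns) and the values are
# invented for the example:
def impute_snp_means_example():
    import scipy as sp
    snps = sp.array([[0.0, 1.0, sp.nan, 1.0],
                     [sp.nan, 0.0, 0.0, 1.0]])
    missing_mat = sp.isnan(snps)
    ok_counts = snps.shape[1] - sp.sum(missing_mat, 1)
    snps[missing_mat] = 0
    snp_means = sp.sum(snps, 1) / ok_counts
    for i in range(len(snps)):
        snps[i, missing_mat[i]] = snp_means[i]
    return snps  # row 0 missing value -> 2/3, row 1 -> 1/3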