def _input_checks(self):
    """
    Validate the attributes this object was configured with.

    Checks performed, in order:
      * data_df: not None, a pandas DataFrame, and left unchanged by
        qc.validate_dataset.
      * model_df: not None, a pandas DataFrame, and left unchanged by
        qc.validate_model.
      * start: an int that is >= 0.
      * end: an int, whenever it is not None.
      * err: a bool.
      * coarse_graining_level: an int.

    Raises:
        ControlledError: if data_df or model_df is None. Every other
            failure is reported through check(), which is defined
            elsewhere (presumably it raises ControlledError as well --
            confirm against its definition).
    """
    # data_df validation
    if self.data_df is None:
        raise ControlledError(
            " The Predictive Info class requires pandas dataframe as input dataframe. Entered data_df was 'None'."
        )
    check(
        isinstance(self.data_df, pd.DataFrame),
        'type(data_df) = %s; must be a pandas dataframe ' % type(self.data_df))
    # The dataframe passes QC only if validation returns it unchanged.
    check(
        pd.DataFrame.equals(self.data_df, qc.validate_dataset(self.data_df)),
        " Input dataframe fails quality control, please ensure input dataframe has the correct format of an mpathic dataframe "
    )

    # model validation
    if self.model_df is None:
        raise ControlledError(
            " The Predictive info class requires pandas dataframe as input model dataframe. Entered model_df was 'None'."
        )
    check(
        isinstance(self.model_df, pd.DataFrame),
        'type(model_df) = %s; must be a pandas dataframe ' % type(self.model_df))
    # The message used to contain a line-continuation backslash inside the
    # literal, which leaked into the text shown to the user.
    check(
        pd.DataFrame.equals(self.model_df, qc.validate_model(self.model_df)),
        " Model dataframe failed quality control, please ensure input model dataframe has the correct format of an mpathic dataframe "
    )

    # check that start is a non-negative integer
    check(isinstance(self.start, int),
          'type(start) = %s; must be of type int ' % type(self.start))
    check(self.start >= 0,
          "start = %d must be a positive integer " % self.start)

    # end is optional; only its type is checked when supplied
    if self.end is not None:
        check(isinstance(self.end, int),
              'type(end) = %s; must be of type int ' % type(self.end))

    # check that err is a boolean
    check(isinstance(self.err, bool),
          'type(err) = %s; must be of type bool ' % type(self.err))

    # check that coarse_graining_level is an integer
    check(
        isinstance(self.coarse_graining_level, int),
        'type(coarse_graining_level) = %s; must be of type int ' % type(self.coarse_graining_level))
def coeffs_to_field(coeffs, kernel):
    """
    For maxent algorithm. Expand kernel coefficients into a field.

    Args:
        coeffs: Sequence of kernel_dim real, finite coefficients.
        kernel (numpy.ndarray): G x kernel_dim array; column j is weighted
            by coeffs[j].

    Returns:
        numpy.ndarray: 1-D array of length G equal to kernel . coeffs.

    Raises:
        ControlledError: if coeffs has the wrong length, or contains
            non-real or non-finite entries.
    """
    # Get dimension of kernel
    kernel_dim = kernel.shape[1]

    # Make sure coeffs is valid
    if not (len(coeffs) == kernel_dim):
        raise ControlledError(
            '/coeffs_to_field/ coeffs must have length %d: len(coeffs) = %d'
            % (kernel_dim, len(coeffs)))
    if not all(np.isreal(coeffs)):
        raise ControlledError(
            '/coeffs_to_field/ coeffs is not real: coeffs = %s' % coeffs)
    if not all(np.isfinite(coeffs)):
        raise ControlledError(
            '/coeffs_to_field/ coeffs is not finite: coeffs = %s' % coeffs)

    # Plain ndarray product instead of the deprecated scipy-namespace
    # aliases (sp.mat / sp.array) and the numpy matrix class.
    kernel_arr = np.asarray(kernel)            # G x kernel_dim
    coeffs_arr = np.asarray(coeffs).ravel()    # kernel_dim
    field = np.dot(kernel_arr, coeffs_arr)     # G

    return np.asarray(field).ravel()  # Returns an array
def _input_check(self):
    """
    Validate the attributes this object was configured with.

    Checks performed, in order:
      * dataset_df: not None, a pandas DataFrame, and left unchanged by
        qc.validate_dataset.
      * err: a bool.
      * method: a str that is one of 'naive', 'tpm', 'nsb'.
      * pseudocount: a float (an int is rejected by the isinstance test).
      * start: an int that is >= 0.
      * end: an int, whenever it is not None.

    Raises:
        ControlledError: if dataset_df is None. Every other failure is
            reported through check(), which is defined elsewhere
            (presumably it raises ControlledError as well -- confirm).
    """
    # check that dataset_df is valid
    if self.dataset_df is None:
        raise ControlledError(
            " Profile info requires pandas dataframe as input dataframe. Entered df was 'None'."
        )
    check(
        isinstance(self.dataset_df, pd.DataFrame),
        'type(df) = %s; must be a pandas dataframe ' % type(self.dataset_df))
    # The dataframe passes QC only if validation returns it unchanged.
    # The message used to contain a line-continuation backslash inside the
    # literal, which leaked into the text shown to the user.
    check(
        pd.DataFrame.equals(self.dataset_df, qc.validate_dataset(self.dataset_df)),
        " Input dataframe failed quality control, please ensure input dataset has the correct format of an mpathic dataframe "
    )

    # check that attribute err is of type boolean
    check(isinstance(self.err, bool),
          'type(err) = %s; must be a boolean ' % type(self.err))

    # method must be a string ...
    check(isinstance(self.method, str),
          'type(method) = %s; must be a string ' % type(self.method))
    # ... and its value must be one of the supported choices
    valid_method_choices = ['naive', 'tpm', 'nsb']
    check(
        self.method in valid_method_choices,
        'method = %s; must be in %s' % (self.method, valid_method_choices))

    # check pseudocount is float
    check(
        isinstance(self.pseudocount, float),
        'type(pseudocount) = %s; must be a float ' % type(self.pseudocount))

    # check that start is a non-negative integer
    check(isinstance(self.start, int),
          'type(start) = %s; must be of type int ' % type(self.start))
    check(self.start >= 0,
          "start = %d must be a positive integer " % self.start)

    # end is optional; only its type is checked when supplied
    if self.end is not None:
        check(isinstance(self.end, int),
              'type(end) = %s; must be of type int ' % type(self.end))
def hessian_per_datum_from_coeffs(coeffs, R, kernel, phi0=False, regularized=False):
    """
    For optimizer. Computes hessian from coefficients.

    The result is kernel.T * diag(w) * kernel, where w is the quasi-
    probability returned by utils.field_to_quasiprob(phi + phi0) and, when
    regularized is True, is increased by (1/G) / PHI_STD_REG**2.

    Args:
        coeffs: Sequence of kernel_dim real, finite coefficients.
        R: Not used by this function.
        kernel (numpy.ndarray): G x kernel_dim array.
        phi0: numpy.ndarray added to the field; any non-ndarray value
            (including the default False) is replaced by zeros of length G.
        regularized (bool): Whether to add the regularization term.

    Returns:
        numpy.ndarray: kernel_dim x kernel_dim array.

    Raises:
        ControlledError: if coeffs, phi0 or regularized fail validation.
    """
    # Get number of gridpoints and dimension of kernel
    G = kernel.shape[0]
    kernel_dim = kernel.shape[1]

    # Make sure coeffs is valid
    if not (len(coeffs) == kernel_dim):
        raise ControlledError(
            '/hessian_per_datum_from_coeffs/ coeffs must have length %d: len(coeffs) = %d'
            % (kernel_dim, len(coeffs)))
    if not all(np.isreal(coeffs)):
        raise ControlledError(
            '/hessian_per_datum_from_coeffs/ coeffs is not real: coeffs = %s'
            % coeffs)
    if not all(np.isfinite(coeffs)):
        raise ControlledError(
            '/hessian_per_datum_from_coeffs/ coeffs is not finite: coeffs = %s'
            % coeffs)

    # Make sure phi0 is valid
    if not isinstance(phi0, np.ndarray):
        phi0 = np.zeros(G)
    else:
        if not all(np.isreal(phi0)):
            raise ControlledError(
                '/hessian_per_datum_from_coeffs/ phi0 is not real: phi0 = %s'
                % phi0)
        if not all(np.isfinite(phi0)):
            raise ControlledError(
                '/hessian_per_datum_from_coeffs/ phi0 is not finite: phi0 = %s'
                % phi0)

    # Make sure regularized is valid
    if not isinstance(regularized, bool):
        raise ControlledError(
            '/hessian_per_datum_from_coeffs/ regularized must be a boolean: regularized = %s'
            % type(regularized))

    phi = coeffs_to_field(coeffs, kernel)
    quasiQ = utils.field_to_quasiprob(phi + phi0)

    kernel_arr = np.asarray(kernel)          # G x kernel_dim
    # Diagonal of the G x G weight matrix.
    # NOTE(review): assumes quasiQ is 1-D of length G -- confirm against
    # utils.field_to_quasiprob.
    weights = np.asarray(quasiQ).ravel()     # G
    if regularized:
        weights = weights + (1. / G) / (PHI_STD_REG**2)

    # Scaling the rows of kernel by the weights equals diag(weights) * kernel
    # without materializing the dense G x G matrix.
    hessian = np.dot(kernel_arr.T, weights[:, np.newaxis] * kernel_arr)

    return np.asarray(hessian)  # Returns a kernel_dim x kernel_dim array
def _input_checks(self):
    """
    Validate the attributes this object was configured with.

    Checks performed, in order:
      * dataset_df: not None, a pandas DataFrame, and left unchanged by
        qc.validate_dataset.
      * bin: an int, whenever it is not None.
      * start: an int that is >= 0.
      * end: an int, whenever it is not None.
      * err: a bool.

    Raises:
        ControlledError: if dataset_df is None. Every other failure is
            reported through check(), which is defined elsewhere
            (presumably it raises ControlledError as well -- confirm).
    """
    # check that dataset_df is valid
    if self.dataset_df is None:
        raise ControlledError(
            " Profile info requires pandas dataframe as input dataframe. Entered df was 'None'."
        )
    check(
        isinstance(self.dataset_df, pd.DataFrame),
        'type(df) = %s; must be a pandas dataframe ' % type(self.dataset_df))
    # The dataframe passes QC only if validation returns it unchanged.
    # The message used to contain a line-continuation backslash inside the
    # literal, which leaked into the text shown to the user.
    check(
        pd.DataFrame.equals(self.dataset_df, qc.validate_dataset(self.dataset_df)),
        " Input dataframe failed quality control, please ensure input dataset has the correct format of an mpathic dataframe "
    )

    # bin is optional; only its type is checked when supplied
    if self.bin is not None:
        check(isinstance(self.bin, int),
              'type(bin) = %s; must be a int ' % type(self.bin))

    # check that start is a non-negative integer
    check(isinstance(self.start, int),
          'type(start) = %s; must be of type int ' % type(self.start))
    check(self.start >= 0,
          "start = %d must be a positive integer " % self.start)

    # end is optional; only its type is checked when supplied
    if self.end is not None:
        check(isinstance(self.end, int),
              'type(end) = %s; must be of type int ' % type(self.end))

    # check that attribute err is of type boolean
    check(isinstance(self.err, bool),
          'type(err) = %s; must be a boolean ' % type(self.err))
def _input_check(self):
    """
    check input parameters for correctness

    Checks performed, in order:
      * dataset_df: not None, a pandas DataFrame, and left unchanged by
        qc.validate_dataset.
      * bin: an int that is > 0, whenever it is not None.
      * start: an int that is >= 0.
      * end: an int, whenever it is not None.

    Raises:
        ControlledError: if dataset_df is None. Every other failure is
            reported through check(), which is defined elsewhere
            (presumably it raises ControlledError as well -- confirm).
    """
    if self.dataset_df is None:
        raise ControlledError(
            " Profile freq requires pandas dataframe as input dataframe. Entered df was 'None'."
        )
    check(
        isinstance(self.dataset_df, pd.DataFrame),
        'type(df) = %s; must be a pandas dataframe ' % type(self.dataset_df))
    # The dataframe passes QC only if validation returns it unchanged.
    # The message used to contain a line-continuation backslash inside the
    # literal, which leaked into the text shown to the user.
    check(
        pd.DataFrame.equals(self.dataset_df, qc.validate_dataset(self.dataset_df)),
        " Input dataframe failed quality control, please ensure input dataset has the correct format of an mpathic dataframe "
    )

    # bin is optional; when supplied it must be a strictly positive int
    if self.bin is not None:
        check(isinstance(self.bin, int),
              'type(bin) = %s; must be of type int ' % type(self.bin))
        check(self.bin > 0,
              'bin = %d must be a positive int ' % self.bin)

    # check that start is a non-negative integer
    check(isinstance(self.start, int),
          'type(start) = %s; must be of type int ' % type(self.start))
    check(self.start >= 0,
          "start = %d must be a positive integer " % self.start)

    # end is optional; only its type is checked when supplied
    if self.end is not None:
        check(isinstance(self.end, int),
              'type(end) = %s; must be of type int ' % type(self.end))
def action_per_datum_from_coeffs(coeffs, R, kernel, phi0=False, regularized=False):
    """
    For optimizer. Computes action from coefficients.

    The action is sum(R * phi) + sum(quasiQ), where phi is the field built
    from coeffs and quasiQ = utils.field_to_quasiprob(phi + phi0). When
    regularized is True, (0.5 / G) * sum(phi**2) / PHI_STD_REG**2 is added.

    Args:
        coeffs: Sequence of kernel_dim real, finite coefficients.
        R: Array multiplied elementwise with the field phi.
        kernel (numpy.ndarray): G x kernel_dim array.
        phi0: numpy.ndarray added to the field; any non-ndarray value
            (including the default False) is replaced by zeros of length G.
        regularized (bool): Whether to add the regularization term.

    Returns:
        The scalar action s.

    Raises:
        ControlledError: if coeffs, phi0 or regularized fail validation,
            or if the computed action is not real or not finite.
    """
    # Get number of gridpoints and dimension of kernel
    G = kernel.shape[0]
    kernel_dim = kernel.shape[1]

    # Make sure coeffs is valid
    if not (len(coeffs) == kernel_dim):
        raise ControlledError(
            '/action_per_datum_from_coeffs/ coeffs must have length %d: len(coeffs) = %d'
            % (kernel_dim, len(coeffs)))
    if not all(np.isreal(coeffs)):
        raise ControlledError(
            '/action_per_datum_from_coeffs/ coeffs is not real: coeffs = %s'
            % coeffs)
    if not all(np.isfinite(coeffs)):
        raise ControlledError(
            '/action_per_datum_from_coeffs/ coeffs is not finite: coeffs = %s'
            % coeffs)

    # Make sure phi0 is valid
    if not isinstance(phi0, np.ndarray):
        phi0 = np.zeros(G)
    else:
        if not all(np.isreal(phi0)):
            raise ControlledError(
                '/action_per_datum_from_coeffs/ phi0 is not real: phi0 = %s'
                % phi0)
        if not all(np.isfinite(phi0)):
            raise ControlledError(
                '/action_per_datum_from_coeffs/ phi0 is not finite: phi0 = %s'
                % phi0)

    # Make sure regularized is valid
    if not isinstance(regularized, bool):
        raise ControlledError(
            '/action_per_datum_from_coeffs/ regularized must be a boolean: regularized = %s'
            % type(regularized))

    phi = coeffs_to_field(coeffs, kernel)
    quasiQ = utils.field_to_quasiprob(phi + phi0)

    # np.sum replaces the deprecated scipy-namespace alias sp.sum and the
    # Python-level builtin sum over an array.
    current_term = np.sum(R * phi)
    nonlinear_term = np.sum(quasiQ)
    s = current_term + nonlinear_term

    if regularized:
        s += (.5 / G) * np.sum(phi**2) / (PHI_STD_REG**2)

    # Make sure s is valid
    if not np.isreal(s):
        raise ControlledError(
            '/action_per_datum_from_coeffs/ s is not real: s = %s' % s)
    if not np.isfinite(s):
        raise ControlledError(
            '/action_per_datum_from_coeffs/ s is not finite: s = %s' % s)

    return s
def compute_maxent_field(R,
                         kernel,
                         report_num_steps=False,
                         phi0=False,
                         geo_dist_tollerance=1E-3,
                         grad_tollerance=1E-5):
    """
    Computes the maxent field from a histogram and kernel

    Starting from all-zero coefficients, repeatedly solves
    hessian * da = -gradient for a step da, halves the step size until the
    action decreases sufficiently, and stops once successive distributions
    and the gradient norm are within the given tolerances.

    Args:
        R (numpy.ndarray):
            Normalized histogram of the raw data. Should have size G
        kernel (numpy.ndarray):
            Array of vectors spanning the smoothness operator kernel.
            Should have size G x kernel_dim
        report_num_steps (bool):
            Selects which tuple is returned (see Returns).
        phi0:
            numpy.ndarray added to the field; any non-ndarray value
            (including the default False) is replaced by zeros of
            length len(R).
        geo_dist_tollerance (float):
            Upper bound on utils.geo_dist(Q_new, Q) for convergence.
        grad_tollerance (float):
            Upper bound on the gradient norm for convergence.

    Returns:
        (phi, num_corrector_steps, num_backtracks) if report_num_steps is
        True, otherwise (phi, success), where phi is the MaxEnt field and
        success is a bool.

    Raises:
        ControlledError: if an argument fails validation, or if the
            backtracking step size drops below 1E-20.
    """
    # Make sure report_num_steps is valid
    if not isinstance(report_num_steps, bool):
        raise ControlledError(
            '/compute_maxent_field/ report_num_steps must be a boolean: report_num_steps = %s'
            % type(report_num_steps))

    # Make sure phi0 is valid
    if not isinstance(phi0, np.ndarray):
        phi0 = np.zeros(len(R))
    else:
        if not all(np.isreal(phi0)):
            raise ControlledError(
                '/compute_maxent_field/ phi0 is not real: phi0 = %s' % phi0)
        if not all(np.isfinite(phi0)):
            raise ControlledError(
                '/compute_maxent_field/ phi0 is not finite: phi0 = %s' % phi0)

    # Make sure geo_dist_tollerance is valid
    if not isinstance(geo_dist_tollerance, float):
        raise ControlledError(
            '/compute_maxent_field/ geo_dist_tollerance must be a float: geo_dist_tollerance = %s'
            % type(geo_dist_tollerance))

    # Make sure grad_tollerance is valid
    if not isinstance(grad_tollerance, float):
        raise ControlledError(
            '/compute_maxent_field/ grad_tollerance must be a float: grad_tollerance = %s'
            % type(grad_tollerance))

    # Get number of gridpoints and dimension of kernel
    G = kernel.shape[0]
    kernel_dim = kernel.shape[1]

    # Set coefficients to zero
    if kernel_dim > 1:
        coeffs = np.zeros(kernel_dim)
    else:
        coeffs = np.zeros(1)

    # Evaluate the probability distribution
    phi = coeffs_to_field(coeffs, kernel)
    phi = np.array(phi).ravel()
    phi0 = np.array(phi0).ravel()
    Q = utils.field_to_prob(phi + phi0)

    # Evaluate action
    s = action_per_datum_from_coeffs(coeffs, R, kernel, phi0)

    # Perform corrector steps until phi converges
    num_corrector_steps = 0
    num_backtracks = 0

    # Defined up front so every exit path from the loop leaves it bound;
    # previously an early break on the first pass left it unassigned and
    # the `if not success` test below raised UnboundLocalError.
    success = False
    while True:

        # With a one-dimensional kernel no corrector steps are taken
        if kernel_dim == 1:
            success = True
            break

        # Compute the gradient
        v = gradient_per_datum_from_coeffs(coeffs, R, kernel, phi0)

        # If gradient is not detectable, we're already done!
        if norm(v) < G * utils.TINY_FLOAT32:
            success = True
            break

        # Compute the hessian
        Lambda = hessian_per_datum_from_coeffs(coeffs, R, kernel, phi0)

        # Solve the linear system hessian * da = -gradient to get the
        # change in coefficients
        da = -np.real(solve(Lambda, v))

        # Compute corresponding change in action
        ds = np.sum(da * v)

        # ds <= 0 should always be satisfied; if not, give up
        if ds > 0:
            print('Warning: ds > 0. Quitting compute_maxent_field.')
            success = False
            break

        # Reduce step size until in linear regime
        beta = 1.0
        success = False
        while True:

            # Compute new coefficients and new action
            coeffs_new = coeffs + beta * da
            s_new = action_per_datum_from_coeffs(coeffs_new, R, kernel, phi0)

            # Check for linear regime
            if s_new <= s + 0.5 * beta * ds:
                break

            # Check to see if beta is too small and algorithm is failing
            elif beta < 1E-20:
                raise ControlledError(
                    '/compute_maxent_field/ phi is not converging: beta = %s'
                    % beta)

            # If not in linear regime backtrack value of beta
            else:
                num_backtracks += 1
                beta *= 0.5

        # Compute new distribution
        phi_new = coeffs_to_field(coeffs_new, kernel)
        Q_new = utils.field_to_prob(phi_new + phi0)

        # Break out of loop if Q_new is close enough to Q.
        # Note that phi is left at its previous value on this path.
        if (utils.geo_dist(Q_new, Q) < geo_dist_tollerance) and (
                np.linalg.norm(v) < grad_tollerance):
            success = True
            break

        # Break out of loop with warning if S_new > S. Should not happen,
        # but not fatal if it does. Just means less precision
        elif s_new - s > 0:
            print('Warning: action has increased. Terminating steps.')
            success = False
            break

        # Otherwise, continue with corrector step
        else:
            num_corrector_steps += 1

            # Set new coefficients.
            # New s, Q, and phi already computed
            coeffs = coeffs_new
            s = s_new
            Q = Q_new
            phi = phi_new

    # Actually, should judge success by whether moments match
    if not success:
        print('gradident norm == %f' % np.linalg.norm(v))
        print('gradient tollerance == %f' % grad_tollerance)
        print('Failure! Trying Maxent again!')

    # After corrector loop has finished, return field
    # Also return stepping stats if requested
    if report_num_steps:
        return phi, num_corrector_steps, num_backtracks
    else:
        return phi, success
def gradient_per_datum_from_coeffs(coeffs, R, kernel, phi0=False, regularized=False):
    """
    For optimizer. Computes gradient from coefficients.

    The gradient is R . kernel - quasiQ . kernel, where
    quasiQ = utils.field_to_quasiprob(phi + phi0) and phi is the field
    built from coeffs. When regularized is True,
    ((1/G) * phi / PHI_STD_REG**2) . kernel is added.

    Args:
        coeffs: Sequence of kernel_dim real, finite coefficients.
        R: Array of length G projected onto the kernel columns.
        kernel (numpy.ndarray): G x kernel_dim array.
        phi0: numpy.ndarray added to the field; any non-ndarray value
            (including the default False) is replaced by zeros of length G.
        regularized (bool): Whether to add the regularization term.

    Returns:
        numpy.ndarray: 1-D array of length kernel_dim.

    Raises:
        ControlledError: if coeffs, phi0 or regularized fail validation,
            or if the computed gradient has non-real or non-finite entries.
    """
    # Get number of gridpoints and dimension of kernel
    G = kernel.shape[0]
    kernel_dim = kernel.shape[1]

    # Make sure coeffs is valid
    if not (len(coeffs) == kernel_dim):
        raise ControlledError(
            '/gradient_per_datum_from_coeffs/ coeffs must have length %d: len(coeffs) = %d'
            % (kernel_dim, len(coeffs)))
    if not all(np.isreal(coeffs)):
        raise ControlledError(
            '/gradient_per_datum_from_coeffs/ coeffs is not real: coeffs = %s'
            % coeffs)
    if not all(np.isfinite(coeffs)):
        raise ControlledError(
            '/gradient_per_datum_from_coeffs/ coeffs is not finite: coeffs = %s'
            % coeffs)

    # Make sure phi0 is valid
    if not isinstance(phi0, np.ndarray):
        phi0 = np.zeros(G)
    else:
        if not all(np.isreal(phi0)):
            raise ControlledError(
                '/gradient_per_datum_from_coeffs/ phi0 is not real: phi0 = %s'
                % phi0)
        if not all(np.isfinite(phi0)):
            raise ControlledError(
                '/gradient_per_datum_from_coeffs/ phi0 is not finite: phi0 = %s'
                % phi0)

    # Make sure regularized is valid
    if not isinstance(regularized, bool):
        raise ControlledError(
            '/gradient_per_datum_from_coeffs/ regularized must be a boolean: regularized = %s'
            % type(regularized))

    phi = coeffs_to_field(coeffs, kernel)
    quasiQ = utils.field_to_quasiprob(phi + phi0)

    # Plain ndarray products instead of the deprecated scipy-namespace
    # aliases (sp.mat / sp.array) and the numpy matrix class.
    kernel_arr = np.asarray(kernel)                             # G x kernel_dim
    mu_R = np.dot(np.asarray(R).ravel(), kernel_arr)            # kernel_dim
    mu_quasiQ = np.dot(np.asarray(quasiQ).ravel(), kernel_arr)  # kernel_dim
    grad_array = mu_R - mu_quasiQ                               # kernel_dim

    if regularized:
        reg = (1. / G) * np.asarray(phi).ravel() / (PHI_STD_REG**2)  # G
        grad_array = grad_array + np.dot(reg, kernel_arr)            # kernel_dim

    # Make sure grad_array is valid
    grad_array = np.asarray(grad_array).ravel()
    if not all(np.isreal(grad_array)):
        raise ControlledError(
            '/gradient_per_datum_from_coeffs/ grad_array is not real: grad_array = %s'
            % grad_array)
    if not all(np.isfinite(grad_array)):
        raise ControlledError(
            '/gradient_per_datum_from_coeffs/ grad_array is not finite: grad_array = %s'
            % grad_array)

    return grad_array  # Returns an array
def _input_check(self):
    """
    private method that validates all parameters

    Checks performed, in order:
      * df: not None, a pandas DataFrame, and left unchanged by
        qc.validate_dataset.
      * mp: not None, a pandas DataFrame, and left unchanged by
        qc.validate_model.
      * noisetype: a str that is one of 'LogNormal', 'Normal', 'None',
        'Plasmid'.
      * npar: a list; of length 1 for 'Normal' and length 2 for
        'LogNormal'.
      * nbins: an int that is > 1.
      * sequence_library: a bool.
      * start: an int.
      * end, chunksize: an int, whenever they are not None.

    Raises:
        ControlledError: if df or mp is None.
        SortSeqError: if npar has the wrong length for the noise type.
        Every other failure is reported through check(), which is defined
        elsewhere (presumably it raises ControlledError -- confirm).
    """
    # check that input df is of type pandas dataframe
    if self.df is None:
        raise ControlledError(
            " Simulate Sort Requires pandas dataframe as input dataframe. Entered df was 'None'."
        )
    check(isinstance(self.df, pd.DataFrame),
          'type(df) = %s; must be a pandas dataframe ' % type(self.df))
    # The dataframe passes QC only if validation returns it unchanged.
    # The message used to contain a line-continuation backslash inside the
    # literal, which leaked into the text shown to the user.
    check(
        pd.DataFrame.equals(self.df, qc.validate_dataset(self.df)),
        " Input dataframe failed quality control, please ensure input dataset has the correct format of an mpathic dataframe "
    )

    # check model dataframe
    if self.mp is None:
        raise ControlledError(
            " Simulate Sort Requires pandas dataframe as model input. Entered model df was 'None'."
        )
    check(isinstance(self.mp, pd.DataFrame),
          'type(mp) = %s; must be a pandas dataframe ' % type(self.mp))
    # validate model (same stray-backslash fix as above)
    check(
        pd.DataFrame.equals(self.mp, qc.validate_model(self.mp)),
        " Model dataframe failed quality control, please ensure model has the correct format of an mpathic model dataframe "
    )

    # check noisetype is string
    check(isinstance(self.noisetype, str),
          'type(noisetype) = %s; must be a string ' % type(self.noisetype))

    # check noisetype is valid
    valid_noisetype_values = ['LogNormal', 'Normal', 'None', 'Plasmid']
    check(
        self.noisetype in valid_noisetype_values,
        'noisetype = %s; must be in %s' % (self.noisetype, valid_noisetype_values))

    # ensure that npar is type list
    check(isinstance(self.npar, list),
          'type(npar) = %s; must be a list ' % type(self.npar))

    # each noise type requires a specific number of noise parameters
    if self.noisetype == 'Normal':
        if len(self.npar) != 1:
            raise SortSeqError(
                'For a normal noise model, there must be one input parameter (width of normal distribution)'
            )

    if self.noisetype == 'LogNormal':
        if len(self.npar) != 2:
            raise SortSeqError('''For a LogNormal noise model there must be 2 input parameters''')

    # ensure nbins is valid
    check(isinstance(self.nbins, int),
          'type(nbins) = %s; must be of type int ' % type(self.nbins))
    check(
        self.nbins > 1,
        'number of bins must be greater than 1, entered bins = %d' % self.nbins)

    # sequence library should be boolean
    check(
        isinstance(self.sequence_library, bool),
        'type(sequence_library) = %s; must be of type bool ' % type(self.sequence_library))

    # make sure start is of type int
    check(isinstance(self.start, int),
          'type(start) = %s; must be of type int ' % type(self.start))

    # make sure end is of type int
    if self.end is not None:
        check(isinstance(self.end, int),
              'type(end) = %s; must be of type int ' % type(self.end))

    # make sure chunksize is of type int
    if self.chunksize is not None:
        check(
            isinstance(self.chunksize, int),
            'type(chunksize) = %s; must be of type int ' % type(self.chunksize))
def _input_checks(self):
    """
    check input parameters for correctness

    Checks performed, in order:
      * df: not None, a pandas DataFrame, and left unchanged by
        qc.validate_dataset.
      * lm: a str that is one of 'ER', 'LS', 'IM', 'PR'.
      * modeltype: a str that is one of 'MAT', 'NBR'.
      * LS_means_std: left unchanged by qc.validate_meanstd, whenever it
        is not None.
      * db: a str, whenever it is not None.
      * iteration, burnin, thin, runnum: ints.
      * initialize: a str that is one of 'rand', 'LS', 'PR'.
      * start: an int that is >= 0.
      * end: an int, whenever it is not None.
      * foreground, background: ints.
      * alpha: a float.
      * pseudocounts: an int.
      * verbose: a bool.
      * tm: an int, whenever it is not None.

    Raises:
        ControlledError: if df is None. Every other failure is reported
            through check(), which is defined elsewhere (presumably it
            raises ControlledError as well -- confirm).
    """
    # dataset
    if self.df is None:
        raise ControlledError(
            " The Learn Model class requires pandas dataframe as input dataframe. Entered df was 'None'."
        )
    check(isinstance(self.df, pd.DataFrame),
          'type(df) = %s; must be a pandas dataframe ' % type(self.df))
    # The dataframe passes QC only if validation returns it unchanged.
    # The message used to contain a line-continuation backslash inside the
    # literal, which leaked into the text shown to the user.
    check(
        pd.DataFrame.equals(self.df, qc.validate_dataset(self.df)),
        " Input dataframe failed quality control, please ensure input dataset has the correct format of an mpathic dataframe "
    )

    # check lm is of type string
    check(isinstance(self.lm, str),
          "type(lm) = %s must be a string " % type(self.lm))

    # check lm value is valid
    valid_lm_values = ['ER', 'LS', 'IM', 'PR']
    check(self.lm in valid_lm_values,
          'lm = %s; must be in %s' % (self.lm, valid_lm_values))

    # check that model type is of type string
    check(isinstance(self.modeltype, str),
          "type(modeltype) = %s must be a string " % type(self.modeltype))

    # check that modeltype value is valid
    valid_modeltype_values = ['MAT', 'NBR']
    check(
        self.modeltype in valid_modeltype_values,
        'modeltype = %s; must be in %s' % (self.modeltype, valid_modeltype_values))

    # validate LS_mean_std (same stray-backslash fix as above)
    LS_means_std_valid_col_order = ['bin', 'mean', 'std']
    if self.LS_means_std is not None:
        check(
            pd.DataFrame.equals(self.LS_means_std,
                                qc.validate_meanstd(self.LS_means_std)),
            " LS_means_std failed quality control, please ensure input dataset has the correct format for LS_means_std: %s"
            % LS_means_std_valid_col_order)

    # db is optional; only its type is checked when supplied
    if self.db is not None:
        check(isinstance(self.db, str),
              "type(db) = %s must be a string " % type(self.db))

    # check that iteration is an integer
    check(
        isinstance(self.iteration, int),
        'type(iteration) = %s; must be of type int ' % type(self.iteration))

    # check that burnin is an integer
    check(isinstance(self.burnin, int),
          'type(burnin) = %s; must be of type int ' % type(self.burnin))

    # check that thin is an integer
    check(isinstance(self.thin, int),
          'type(thin) = %s; must be of type int ' % type(self.thin))

    # check that runnum is an integer
    check(isinstance(self.runnum, int),
          'type(runnum) = %s; must be of type int ' % type(self.runnum))

    # check that initialize is a string and its value is valid
    check(
        isinstance(self.initialize, str),
        "type(initialize) = %s must be a string " % type(self.initialize))
    valid_initialize_values = ['rand', 'LS', 'PR']
    check(
        self.initialize in valid_initialize_values,
        'initialize = %s; must be in %s' % (self.initialize, valid_initialize_values))

    # check that start is a non-negative integer
    check(isinstance(self.start, int),
          'type(start) = %s; must be of type int ' % type(self.start))
    check(self.start >= 0,
          "start = %d must be a positive integer " % self.start)

    # end is optional; only its type is checked when supplied
    if self.end is not None:
        check(isinstance(self.end, int),
              'type(end) = %s; must be of type int ' % type(self.end))

    # check that foreground is an integer
    check(
        isinstance(self.foreground, int),
        'type(foreground) = %s; must be of type int ' % type(self.foreground))

    # check that background is an integer
    check(
        isinstance(self.background, int),
        'type(background) = %s; must be of type int ' % type(self.background))

    # check that alpha is a float; the message reports alpha's own type
    # (it previously reported type(self.background) by mistake)
    check(
        isinstance(self.alpha, float),
        'type(alpha) = %s; must be of type float ' % type(self.alpha))

    # check that pseudocounts is an integer
    check(
        isinstance(self.pseudocounts, int),
        'type(pseudocounts) = %s; must be of type int ' % type(self.pseudocounts))

    # check that verbose is a boolean
    check(isinstance(self.verbose, bool),
          'type(verbose) = %s; must be of type bool ' % type(self.verbose))

    # tm is optional; only its type is checked when supplied
    if self.tm is not None:
        check(isinstance(self.tm, int),
              'type(tm) = %s; must be of type int ' % type(self.tm))