def run(cfg, comm=None):
    '''
    Entry point for the parallel MCMC: dispatch to master or worker by rank
    and write that process's results to an HDF5 file.

    Parameters
    ----------
        - cfg : dictionary
            Configuration dictionary containing priors, settings, and paths
            for analysis. Its format is specified in detail in separate
            documentation. This function reads cfg['output'] for the result
            paths and the compression setting.
        - comm : mpi4py.MPI.COMM
            Initialized MPI communicator. If None, MPI.COMM_WORLD is used.

    '''
    # Fall back to the world communicator when the caller gave none
    comm = MPI.COMM_WORLD if comm is None else comm

    # Identify this process and load its data; every rank other than the
    # root counts as a worker
    rank = comm.Get_rank()
    n_workers = comm.Get_size() - 1
    data = load_data(cfg=cfg, n_workers=n_workers, rank=rank)

    if rank != MPIROOT:
        # Worker process: run the local sampler, then store its own results
        (draws, mapping_peptides,
         proteins_worker, peptides_worker) = worker(comm=comm, rank=rank,
                                                    data=data, cfg=cfg)

        # The worker output path is a pattern filled in with the rank
        lib.write_to_hdf5(
            fname=cfg['output']['pattern_results_worker'] % rank,
            compress=cfg['output']['compress'],
            draws=draws,
            mapping_peptides=mapping_peptides,
            proteins_worker=proteins_worker,
            peptides_worker=peptides_worker)
        return

    # Root process: coordinate the estimation, then store the master results
    draws, accept_stats, mapping_peptides = master(comm=comm, data=data,
                                                   cfg=cfg)
    lib.write_to_hdf5(
        fname=cfg['output']['path_results_master'],
        compress=cfg['output']['compress'],
        draws=draws,
        accept_stats=accept_stats,
        mapping_peptides=mapping_peptides)
def worker(comm, rank, data, cfg):
    '''
    Worker-node process for parallel MCMC sampler.
    Receives parameters and commands from master node.
    Runs local updates and distributed components of shared draws.

    The worker blocks on a receive from MPIROOT; the tag of each message
    selects the task to run (see TAGS) and the payload is the current
    iteration index. The loop ends when a STOP tag is received.

    Parameters
    ----------
        - comm : mpi4py.MPI.COMM
            Initialized MPI communicator.
        - rank : int
            Rank of this worker. Used to build the worker-specific output
            path for the SAVE task.
        - data : dictionary
            Data as output from load_data with rank > 0.
        - cfg : dictionary
            Configuration dictionary containing priors, settings, and paths
            for analysis. Its format is specified in detail in separate
            documentation.

    Returns
    -------
        - draws : dictionary
            1- and 2-dimensional ndarrays containing the posterior samples for
            each protein- and peptide-specific parameter. Keys are 'mu',
            'gamma', 'sigmasq', 'tausq', 'n_cen_states_per_peptide' and, when
            running supervised, 'concentration'. Shared parameters are
            handled by the master process.
        - mapping_peptides : integer ndarray
            Worker-specific peptide to protein mapping provided in data.
        - proteins_worker : array_like, 1 dimension, nonnegative ints
            A 1d integer array of length n_proteins containing the indices (in
            the original dataset) of the proteins assigned to the given worker.
        - peptides_worker : array_like, 1 dimension, nonnegative ints
            A 1d integer array of length n_peptides containing the indices (in
            the original dataset) of the peptides assigned to the given worker.

    '''
    # Determine whether algorithm is running with supervision
    try:
        supervised = cfg['priors']['supervised']
    except KeyError:
        sys.stderr.write('Defaulting to unsupervised algorithm\n')
        supervised = False

    # If supervised, determine whether to model distribution of concentrations
    # If this is False, prior on $\beta_1$ is scaled by $|\beta_1|^{n_{mis}}$.
    # Bound unconditionally so the name always exists, even when unsupervised.
    concentration_dist = False
    if supervised:
        try:
            concentration_dist = cfg['priors']['concentration_dist']
        except KeyError:
            sys.stderr.write('Defaulting to flat prior on concentrations\n')
            concentration_dist = False

    # Get information on peptide features if they're available
    have_peptide_features = 'path_peptide_features' in cfg['priors']
    if have_peptide_features:
        n_peptide_features = data['peptide_features_worker'].shape[1]
    else:
        n_peptide_features = 0

    # Extract proposal DFs
    try:
        prop_df_y_mis = cfg['settings']['prop_df_y_mis']
    except KeyError:
        prop_df_y_mis = 5.0

    # Create references to relevant data entries in local namespace
    mapping_peptides = data['mapping_peptides']
    intensities_obs = data['intensities_obs']
    mapping_states_obs = data['mapping_states_obs']
    # Data specific to the semi-supervised algorithm
    if supervised:
        known_concentrations = data['known_concentrations']
        mapping_known_concentrations = data['mapping_known_concentrations']

    # Extract dimensions from input

    # Number of iterations from cfg
    n_iterations = cfg['settings']['n_iterations']

    # Number of peptides and proteins from mapping_peptides
    n_peptides = np.size(mapping_peptides)
    n_proteins = 1 + np.max(mapping_peptides)

    # Compute tabulations that are invariant across iterations

    # Total number of observed states
    n_obs_states = np.size(intensities_obs)

    # Tabulate peptides per protein
    n_peptides_per_protein = np.bincount(mapping_peptides)
    peptides_obs = np.unique(mapping_states_obs)
    n_obs_peptides_per_protein = np.bincount(mapping_peptides[peptides_obs],
                                             minlength=n_proteins)

    # Tabulate number of observed states per peptide
    n_obs_states_per_peptide = np.bincount(mapping_states_obs,
                                           minlength=n_peptides)

    # Sum observed intensities per peptide
    total_intensity_obs_per_peptide = np.bincount(mapping_states_obs,
                                                  weights=intensities_obs,
                                                  minlength=n_peptides)

    # Allocate data structures for draws

    # Peptide- and protein-level means
    gamma_draws = np.empty((n_iterations, n_peptides))
    mu_draws = np.empty((n_iterations, n_proteins))

    # Concentrations, if supervised
    if supervised:
        concentration_draws = np.empty((n_iterations, n_proteins))

    # Number of censored states per peptide.
    # Builtin int is used as the dtype: np.integer is an abstract type and is
    # deprecated as a dtype specifier.
    n_cen_states_per_peptide_draws = np.zeros((n_iterations, n_peptides),
                                              dtype=int)

    # State- and peptide-level variances
    sigmasq_draws = np.empty((n_iterations, n_proteins))
    tausq_draws = np.empty((n_iterations, n_proteins))

    # Collect the draw arrays once. The arrays are only ever filled in place
    # below (never rebound), so this dictionary always refers to the current
    # state of the sampler and serves both the SAVE task and the return value.
    draws = {'mu': mu_draws,
             'gamma': gamma_draws,
             'sigmasq': sigmasq_draws,
             'tausq': tausq_draws,
             'n_cen_states_per_peptide': n_cen_states_per_peptide_draws,
             }
    if supervised:
        draws.update({'concentration': concentration_draws})

    # Instantiate GLM family for eta step
    try:
        glm_link_name = cfg["priors"]["glm_link"].title()
    except KeyError:
        sys.stderr.write("GLM link not specified; defaulting to logit\n")
        glm_link_name = "Logit"
    glm_link = getattr(glm.links, glm_link_name)
    glm_family = glm.families.Binomial(link=glm_link)

    # Setup data structure for shared parameters/hyperparameters sync
    # Layout:
    #   - 0:2 : shape_sigmasq, rate_sigmasq
    #   - 2:4 : shape_tausq, rate_tausq
    #   - 4:6 : r, lmbda
    #   - 6:8 : eta
    #   - 8   : p_rnd_cen
    # If supervised, 4 additional entries are used:
    #   - 9:11: beta
    #   - 11  : mean_concentration
    #   - 12  : prec_concentration
    params_shared = np.empty(9 + 4 * supervised, dtype=np.double)

    # Prepare to receive tasks
    working = True
    status = MPI.Status()
    # Receive buffer for the iteration index sent by the master.
    # NOTE(review): its size must match what the master sends; left unchanged.
    t = np.array(0)

    # Primary send-receive loop for MCMC iterations
    while working:
        # Receive iteration and task information
        comm.Recv([t, MPI.INT], source=MPIROOT, tag=MPI.ANY_TAG, status=status)
        task = status.Get_tag()

        if task == TAGS['STOP']:
            working = False
        elif task == TAGS['SYNC']:
            # Synchronize shared parameters/hyperparameters
            comm.Bcast(params_shared, root=MPIROOT)

            shape_sigmasq, rate_sigmasq = params_shared[0:2]
            shape_tausq, rate_tausq = params_shared[2:4]
            r, lmbda = params_shared[4:6]
            eta = params_shared[6:8]
            p_rnd_cen = params_shared[8]

            if supervised:
                beta = params_shared[9:11]
                mean_concentration = params_shared[11]
                prec_concentration = params_shared[12]
        elif task == TAGS['INIT']:
            # Compute initial values for MCMC iterations

            # Protein-level means using mean observed intensity; excluding
            # missing peptides. Proteins without any observed peptide are
            # assigned the smallest non-NaN protein mean.
            mu_draws[0] = (
                np.bincount(mapping_peptides,
                            total_intensity_obs_per_peptide /
                            np.maximum(1, n_obs_states_per_peptide)) /
                n_obs_peptides_per_protein)
            mu_draws[0, n_obs_peptides_per_protein < 1] = np.nanmin(
                mu_draws[0])

            # Peptide-level means using mean observed intensity; imputing
            # missing peptides as protein observed means
            gamma_draws[0] = mu_draws[0, mapping_peptides]
            gamma_draws[0, peptides_obs] = (
                total_intensity_obs_per_peptide[peptides_obs] /
                n_obs_states_per_peptide[peptides_obs]
            )

            # State- and peptide-level variances via inverse-gamma draws
            sigmasq_draws[0] = 1. / np.random.gamma(shape=shape_sigmasq,
                                                    scale=1. / rate_sigmasq,
                                                    size=n_proteins)
            tausq_draws[0] = 1. / np.random.gamma(shape=shape_tausq,
                                                  scale=1. / rate_tausq,
                                                  size=n_proteins)

            # Mapping from protein to peptide conditional variances for
            # convenience
            var_peptide_conditional = sigmasq_draws[0, mapping_peptides]

            # Number of states parameters from local MAP estimator based on
            # number of observed peptides; very crude, but not altogether
            # terrible. Note that this ignores the +1 location shift in the
            # actual n_states distribution.
            kwargs = {
                'x': n_obs_states_per_peptide[n_obs_states_per_peptide > 0] -
                1,
                'transform': True}
            kwargs.update(cfg['priors']['n_states_dist'])
            r, lmbda = lib.map_estimator_nbinom(**kwargs)
            lmbda = 1. - lmbda

            # Combine local estimates at master for initialization.
            # Values synchronize at first iteration during SYNC task.
            comm.Reduce([np.array([r, lmbda]), MPI.DOUBLE], None,
                        op=MPI.SUM, root=MPIROOT)

            if supervised:
                # Run Gibbs update on concentration-intensity coefficients
                # using noninformative prior.
                updates_parallel.rgibbs_worker_beta(
                    comm=comm, concentrations=known_concentrations,
                    gamma_bar=mu_draws[0, mapping_known_concentrations],
                    tausq=tausq_draws[0, mapping_known_concentrations],
                    n_peptides=n_peptides_per_protein[
                        mapping_known_concentrations],
                    MPIROOT=MPIROOT)
        elif task == TAGS['LOCAL']:
            # (1) Draw missing data (n_cen and censored state intensities)
            #   given all other parameters. Exact draw via rejection samplers.

            # (1a) Obtain p_int_cen per peptide and approximations of censored
            #   intensity posteriors.
            # eta[0] and eta[1] are scalars, so the += below builds new arrays
            # and does not modify the shared eta buffer.
            eta_0_effective = eta[0]
            eta_1_effective = eta[1]
            if n_peptide_features > 0:
                eta_0_effective += np.dot(data['peptide_features_worker'],
                                          eta[2:(2 + n_peptide_features)])
                eta_1_effective += np.dot(data['peptide_features_worker'],
                                          eta[(2 + n_peptide_features):])

            kwargs = {'eta_0': eta_0_effective,
                      'eta_1': eta_1_effective,
                      'mu': gamma_draws[t - 1],
                      'sigmasq': var_peptide_conditional,
                      'glm_link_name': glm_link_name}
            cen_dist = lib.characterize_censored_intensity_dist(**kwargs)

            # (1b) Draw number of censored states per peptide
            n_cen_states_per_peptide = lib.rncen(
                n_obs=n_obs_states_per_peptide,
                p_rnd_cen=p_rnd_cen,
                p_int_cen=cen_dist['p_int_cen'],
                lmbda=lmbda, r=r)
            n_cen_states_per_peptide_draws[t] = n_cen_states_per_peptide
            # Update state-level counts
            n_states_per_peptide = (n_obs_states_per_peptide +
                                    n_cen_states_per_peptide)
            n_states_per_protein = np.bincount(mapping_peptides,
                                               weights=n_states_per_peptide)
            n_states = np.sum(n_states_per_peptide)

            # (1c) Draw censored intensities
            kwargs['n_cen'] = n_cen_states_per_peptide
            kwargs['p_rnd_cen'] = p_rnd_cen
            kwargs['propDf'] = prop_df_y_mis
            kwargs.update(cen_dist)
            # NOTE(review): W appears to flag randomly censored states (it is
            # summed as n_rnd_cen and W < 1 selects intensity-censored
            # states); confirm against lib.rintensities_cen.
            intensities_cen, mapping_states_cen, W = lib.rintensities_cen(
                **kwargs)

            # Sum censored intensities per peptide
            total_intensity_cen_per_peptide = np.bincount(
                mapping_states_cen, weights=intensities_cen,
                minlength=n_peptides)

            # Compute mean intensities per peptide
            mean_intensity_per_peptide = ((total_intensity_obs_per_peptide +
                                           total_intensity_cen_per_peptide) /
                                          n_states_per_peptide)

            # (2) Update peptide-level mean parameters (gamma). Gibbs step.
            gamma_draws[t] = updates_serial.rgibbs_gamma(
                mu=mu_draws[t - 1, mapping_peptides],
                tausq=tausq_draws[t - 1, mapping_peptides],
                sigmasq=var_peptide_conditional,
                y_bar=mean_intensity_per_peptide,
                n_states=n_states_per_peptide)
            mean_gamma_by_protein = np.bincount(mapping_peptides,
                                                weights=gamma_draws[t])
            mean_gamma_by_protein /= n_peptides_per_protein

            if supervised:
                # (3) Update concentrations given coefficients. Gibbs step.
                concentration_draws[t] = updates_serial.rgibbs_concentration(
                    gamma_bar=mean_gamma_by_protein,
                    tausq=tausq_draws[t - 1],
                    n_peptides=n_peptides_per_protein, beta=beta,
                    mean_concentration=mean_concentration,
                    prec_concentration=prec_concentration)
                # Known concentrations are data, not parameters; overwrite
                # their draws with the supplied values.
                concentration_draws[t, mapping_known_concentrations] = \
                    known_concentrations

                mu_draws[t] = beta[0] + beta[1] * concentration_draws[t]
            else:
                # (3) Update protein-level mean parameters (mu). Gibbs step.
                mu_draws[t] = updates_serial.rgibbs_mu(
                    gamma_bar=mean_gamma_by_protein,
                    tausq=tausq_draws[t - 1],
                    n_peptides=n_peptides_per_protein,
                    **cfg['priors']['mu'])

            # (4) Update state-level variance parameters (sigmasq). Gibbs step.
            rss_by_state = ((intensities_obs -
                             gamma_draws[t, mapping_states_obs]) ** 2)
            rss_by_protein = np.bincount(mapping_peptides[mapping_states_obs],
                                         weights=rss_by_state,
                                         minlength=n_proteins)
            rss_by_state = ((intensities_cen -
                             gamma_draws[t, mapping_states_cen]) ** 2)
            rss_by_protein += np.bincount(mapping_peptides[mapping_states_cen],
                                          weights=rss_by_state,
                                          minlength=n_proteins)
            sigmasq_draws[t] = updates_serial.rgibbs_variances(
                rss=rss_by_protein, n=n_states_per_protein,
                prior_shape=shape_sigmasq, prior_rate=rate_sigmasq)

            # Mapping from protein to peptide conditional variances for
            # convenience
            var_peptide_conditional = sigmasq_draws[t, mapping_peptides]

            # (5) Update peptide-level variance parameters (tausq). Gibbs step.
            rss_by_peptide = (
                gamma_draws[t] - mu_draws[t, mapping_peptides]) ** 2
            rss_by_protein = np.bincount(mapping_peptides,
                                         weights=rss_by_peptide)
            tausq_draws[t] = updates_serial.rgibbs_variances(
                rss=rss_by_protein, n=n_peptides_per_protein,
                prior_shape=shape_tausq, prior_rate=rate_tausq)
        elif task == TAGS['SIGMA']:
            # Run distributed MH step for sigmasq hyperparameters
            updates_parallel.rmh_worker_variance_hyperparams(
                comm=comm, variances=sigmasq_draws[t], MPIROOT=MPIROOT)
        elif task == TAGS['TAU']:
            # Run distributed MH step for tausq hyperparameters
            updates_parallel.rmh_worker_variance_hyperparams(
                comm=comm, variances=tausq_draws[t], MPIROOT=MPIROOT)
        elif task == TAGS['NSTATES']:
            # Run distributed MH step for n_states hyperparameters
            updates_parallel.rmh_worker_nbinom_hyperparams(
                comm=comm, x=n_states_per_peptide - 1, r_prev=r,
                p_prev=1. - lmbda, MPIROOT=MPIROOT,
                **cfg['priors']['n_states_dist'])
        elif task == TAGS['ETA']:
            # Run distributed MH step for eta (coefficients in censoring model)

            # Build design matrix and response. Only using observed and
            # intensity-censored states. When peptide features are present,
            # 2 * n_peptide_features extra rows are appended as pseudo-
            # observations (identity block in X, response 0.5, weights taken
            # from cfg['priors']['eta_features'] split across workers).
            n_at_risk = n_obs_states + np.sum(W < 1)
            X = np.zeros((n_at_risk + n_peptide_features * 2,
                          2 + n_peptide_features * 2))
            X[:n_at_risk, 0] = 1.
            X[:n_at_risk, 1] = np.r_[intensities_obs, intensities_cen[W < 1]]
            if n_peptide_features > 0:
                peptide_features_by_state = data['peptide_features_worker'][
                    np.r_[mapping_states_obs, mapping_states_cen[W < 1]]
                ]
                X[:n_at_risk, 2:(2 + n_peptide_features)] = \
                    peptide_features_by_state
                X[:n_at_risk, (2 + n_peptide_features):] = \
                    (peptide_features_by_state.T * X[:n_at_risk, 1]).T
                X[n_at_risk:, 2:] = np.eye(n_peptide_features * 2)

            y = np.zeros(n_at_risk + n_peptide_features * 2)
            y[:n_obs_states] = 1.
            if n_peptide_features > 0:
                y[n_at_risk:] = 0.5

            w = np.ones_like(y)
            if n_peptide_features > 0:
                n_workers = comm.Get_size() - 1.
                w[n_at_risk:(n_at_risk + n_peptide_features)] = (
                    cfg['priors']['eta_features']['primary_pseudoobs'] /
                    n_workers)
                w[(n_at_risk + n_peptide_features):] = (
                    cfg['priors']['eta_features']['interaction_pseudoobs'] /
                    n_workers)

            # Estimate GLM parameters.
            fit_eta = glm.glm(y=y, X=X, w=w, family=glm_family, info=True,
                              cov=True)

            # Handle distributed computation draw
            updates_parallel.rmh_worker_glm_coef(
                comm=comm, b_prev=eta, family=glm_family, y=y, X=X, w=w,
                MPIROOT=MPIROOT, **fit_eta)
        elif task == TAGS['PRNDCEN']:
            # Run distributed Gibbs step for p_rnd_cen.
            # Builtin int replaces the removed np.int alias (same dtype).
            updates_parallel.rgibbs_worker_p_rnd_cen(
                comm=comm, n_rnd_cen=np.sum(W, dtype=int), n_states=n_states,
                MPIROOT=MPIROOT)
        elif task == TAGS['BETA']:
            # Run distributed Gibbs step for coefficients of
            # concentration-intensity relationship
            if concentration_dist:
                # Concentration distribution is modeled: use all proteins
                updates_parallel.rgibbs_worker_beta(
                    comm=comm, concentrations=concentration_draws[t],
                    gamma_bar=mean_gamma_by_protein,
                    tausq=tausq_draws[t],
                    n_peptides=n_peptides_per_protein, MPIROOT=MPIROOT)
            else:
                # Flat prior on concentrations: use only the proteins with
                # known concentrations
                updates_parallel.rgibbs_worker_beta(
                    comm=comm, concentrations=known_concentrations,
                    gamma_bar=mean_gamma_by_protein[
                        mapping_known_concentrations],
                    tausq=tausq_draws[t, mapping_known_concentrations],
                    n_peptides=n_peptides_per_protein[
                        mapping_known_concentrations],
                    MPIROOT=MPIROOT)
        elif task == TAGS['CONCENTRATION_DIST']:
            # Run distributed Gibbs step for hyperparameters of concentration
            # distribution
            updates_parallel.rgibbs_worker_concentration_dist(
                comm=comm, concentrations=concentration_draws[t],
                MPIROOT=MPIROOT)
        elif task == TAGS['SAVE']:
            # Write an intermediate copy of the worker-specific results.
            # Construct path for worker-specific results
            path_worker = cfg['output']['pattern_results_worker'] % rank

            # Use the same keyword (fname) and the same set of datasets as
            # the final worker output written by run(), so the intermediate
            # file has the same layout as the final one.
            lib.write_to_hdf5(
                fname=path_worker, compress=cfg['output']['compress'],
                draws=draws, mapping_peptides=data['mapping_peptides'],
                proteins_worker=data['proteins_worker'],
                peptides_worker=data['peptides_worker'])

    # Return the accumulated draws together with the worker's index mappings
    return (draws, data['mapping_peptides'],
            data['proteins_worker'], data['peptides_worker'])