def split_data_by_image(self, test_fraction=0.5):
    """
    Randomly split the data into train and test sets. This method will
    designate full images as belonging to the train or test sets.

    Parameters
    ----------
    test_fraction : float (optional)
        The fraction of images which will be reserved for testing.

    Returns
    -------
    train : tuple
    test : tuple
    """
    image_id = BaseModel.get_image_id(self.inputs)
    test_idx = np.random.random(image_id.max() + 1) <= test_fraction

    # Low image count edge case (mostly just for testing purposes)
    if True not in test_idx:
        test_idx[0] = True
    elif False not in test_idx:
        test_idx[0] = False

    test_idx = test_idx[image_id]
    if BaseModel.is_laue(self.inputs):
        train, test = self.split_laue_data_by_mask(test_idx)
    else:
        train, test = self.split_mono_data_by_mask(test_idx)

    #return self.get_tf_dataset(train), self.get_tf_dataset(test)
    return train, test
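# Illustrative sketch (not part of careless): how a per-image boolean mask is expanded
# to a per-reflection mask by fancy indexing with the image_id column, as in
# split_data_by_image above. All arrays here are made up for the example.
def _example_image_mask_broadcast():
    import numpy as np
    image_id = np.array([0, 0, 1, 1, 1, 2])          # one entry per reflection
    per_image_test = np.array([True, False, True])   # one entry per image
    per_refl_test = per_image_test[image_id]          # same length as image_id
    # Every reflection from a given image lands entirely in train or in test.
    assert (per_refl_test == [True, True, False, False, False, True]).all()
    return per_refl_test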
def test_laue_StudentTLikelihood(dof, laue_inputs):
    likelihood = StudentTLikelihood(dof)(laue_inputs)
    iobs = BaseModel.get_intensities(laue_inputs)
    sigiobs = BaseModel.get_uncertainties(laue_inputs)
    ipred = fake_ipred(laue_inputs)

    l_true = tfd.StudentT(dof, iobs, sigiobs)

    iconv = likelihood.convolve(ipred)

    nobs = BaseModel.get_harmonic_id(laue_inputs).max() + 1
    test = likelihood.log_prob(ipred).numpy()
    expected = l_true.log_prob(iobs).numpy().T

    # The zero padded entries at the end of the input will disagree
    # with the expected values. This is fine, because they will not
    # contribute to the gradient.
    test = test[:, :nobs]
    expected = expected[:, :nobs]

    assert np.array_equal(expected.shape, test.shape)
    assert np.allclose(expected, test)

    # Test batches larger than 1
    ipred = np.concatenate((ipred, ipred, ipred), axis=0)
    likelihood.convolve(ipred).numpy()
    test = likelihood.log_prob(ipred).numpy()
    test = test[:, :nobs]
    assert np.array_equiv(expected, test)
def split_data_by_refl(self, test_fraction=0.5):
    """
    Randomly split the data into train and test sets by individual
    reflection observation.

    Parameters
    ----------
    test_fraction : float (optional)
        The fraction of reflections which will be reserved for testing.

    Returns
    -------
    train : tuple
    test : tuple
    """
    if BaseModel.is_laue(self.inputs):
        harmonic_id = BaseModel.get_harmonic_id(self.inputs)
        test_idx = (np.random.random(harmonic_id.max() + 1) <= test_fraction)[harmonic_id]
        train, test = self.split_laue_data_by_mask(test_idx)
        #return self.get_tf_dataset(train), self.get_tf_dataset(test)
        return train, test

    test_idx = np.random.random(len(self.inputs[0])) <= test_fraction
    train, test = self.split_mono_data_by_mask(test_idx)
    #return self.get_tf_dataset(train), self.get_tf_dataset(test)
    return train, test
def test_laue(likelihood_model, prior_model, scaling_model, laue_inputs, mc_samples):
    nrefls = np.max(BaseModel.get_refl_id(laue_inputs)) + 1
    n_images = np.max(BaseModel.get_image_id(laue_inputs)) + 1

    # Degrees of freedom for the Student's t models
    dof = 4.

    if likelihood_model == StudentTLikelihood:
        likelihood = likelihood_model(dof)
    else:
        likelihood = likelihood_model()

    if prior_model == WilsonPrior:
        prior = prior_model(
            np.random.choice([True, False], nrefls),
            np.ones(nrefls).astype('float32'),
        )
    elif prior_model == StudentTReferencePrior:
        prior = prior_model(
            np.ones(nrefls).astype('float32'),
            np.ones(nrefls).astype('float32'),
            dof,
        )
    else:
        prior = prior_model(
            np.ones(nrefls).astype('float32'),
            np.ones(nrefls).astype('float32'),
        )

    mlp_scaler = MLPScaler(2, 3)
    if scaling_model == HybridImageScaler:
        image_scaler = ImageScaler(n_images)
        scaler = HybridImageScaler(mlp_scaler, image_scaler)
    elif scaling_model == MLPScaler:
        scaler = mlp_scaler

    surrogate_posterior = tfd.TruncatedNormal(
        tf.Variable(prior.mean()),
        tfp.util.TransformedVariable(
            prior.stddev() / 10.,
            tfb.Softplus(),
        ),
        low=1e-5,
        high=1e10,
    )

    merger = VariationalMergingModel(surrogate_posterior, prior, likelihood, scaler, mc_samples)
    ipred = merger(laue_inputs)

    isfinite = np.all(np.isfinite(ipred.numpy()))
    assert isfinite

    merger = VariationalMergingModel(surrogate_posterior, prior, likelihood, scaler)
    merger.compile('Adam')
def test_mono_LaplaceLikelihood(mono_inputs):
    likelihood = LaplaceLikelihood()(mono_inputs)
    iobs = BaseModel.get_intensities(mono_inputs)
    sigiobs = BaseModel.get_uncertainties(mono_inputs)

    l_true = tfd.Laplace(
        tf.squeeze(iobs),
        tf.squeeze(sigiobs) / np.sqrt(2.),
    )

    z = l_true.sample()
    assert np.allclose(likelihood.log_prob(z), l_true.log_prob(z))
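# Illustrative note (not part of careless): the reference distribution above divides
# sigiobs by sqrt(2), consistent with a Laplace distribution of scale b having standard
# deviation sqrt(2) * b, so scale = sigma / sqrt(2) reproduces the reported uncertainty.
# A quick self-contained check with a made-up sigma:
def _example_laplace_scale():
    import numpy as np
    import tensorflow_probability as tfp
    tfd = tfp.distributions
    sigma = np.float32(3.0)
    dist = tfd.Laplace(np.float32(0.0), sigma / np.float32(np.sqrt(2.0)))
    assert np.isclose(dist.stddev().numpy(), sigma)
    return dist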
def test_mono_StudentTLikelihood(dof, mono_inputs):
    likelihood = StudentTLikelihood(dof)(mono_inputs)
    iobs = BaseModel.get_intensities(mono_inputs)
    sigiobs = BaseModel.get_uncertainties(mono_inputs)

    l_true = tfd.StudentT(
        dof,
        tf.squeeze(iobs),
        tf.squeeze(sigiobs),
    )

    z = l_true.sample()
    assert np.allclose(likelihood.log_prob(z), l_true.log_prob(z))
def get_predictions(self, model, inputs=None):
    """
    Extract predictions from a trained merging model.

    Parameters
    ----------
    model : VariationalMergingModel
        A merging model from careless
    inputs : tuple (optional)
        Inputs for which to make the predictions. If None, self.inputs is used.

    Returns
    -------
    predictions : tuple
        A tuple of rs.DataSet objects containing the predictions for each
        ReciprocalASU contained in self.asu_collection
    """
    if inputs is None:
        inputs = self.inputs

    refl_id = BaseModel.get_refl_id(inputs)
    iobs = BaseModel.get_intensities(inputs).flatten()
    sig_iobs = BaseModel.get_uncertainties(inputs).flatten()
    asu_id, H = self.asu_collection.to_asu_id_and_miller_index(refl_id)
    #ipred = model(inputs)
    ipred, sigipred = model.prediction_mean_stddev(inputs)
    h, k, l = H.T

    results = ()
    for i, asu in enumerate(self.asu_collection):
        idx = asu_id == i
        idx = idx.flatten()
        output = rs.DataSet(
            {
                'H': h[idx],
                'K': k[idx],
                'L': l[idx],
                'Iobs': iobs[idx],
                'SigIobs': sig_iobs[idx],
                'Ipred': ipred[idx],
                'SigIpred': sigipred[idx],
            },
            cell=asu.cell,
            spacegroup=asu.spacegroup,
            merged=False,
        ).infer_mtz_dtypes().set_index(['H', 'K', 'L'])
        results += (output, )
    return results
def test_mono_formatter(
    intensity_key,
    sigma_key,
    image_id_key,
    separate_outputs,
    anomalous,
    dmin,
    isigi_cutoff,
    positional_encoding_keys,
    encoding_bit_depth,
    mono_data_set,
):
    ds = mono_data_set.copy()

    f = MonoFormatter(
        intensity_key,
        sigma_key,
        image_id_key,
        metadata_keys,
        separate_outputs,
        anomalous,
        dmin,
        isigi_cutoff,
        positional_encoding_keys,
        encoding_bit_depth,
    )

    inputs, rac = f([ds])

    length = None
    for v in inputs:
        assert v.ndim == 2
        assert v.dtype in ('float32', 'int64')
        if length is None:
            length = v.shape[0]
        assert v.shape[0] == length

    metadata = BaseModel.get_metadata(inputs)
def split(inputs, idx):
    harmonic_id = BaseModel.get_harmonic_id(inputs)
    result = ()
    # Re-label the surviving harmonic ids onto a dense 0..n-1 range
    uni, inv = np.unique(harmonic_id[idx], return_inverse=True)
    for i, v in enumerate(inputs):
        name = BaseModel.get_name_by_index(i)
        if name in ('intensities', 'uncertainties'):
            # Keep the rows indexed by the surviving harmonic ids and pad back
            # to the masked length so all arrays in the tuple stay the same size
            v = v[uni]
            v = np.pad(v, [[0, len(inv) - len(v)], [0, 0]], constant_values=1.)
        elif name == 'harmonic_id':
            # Replace with the re-labeled, dense harmonic ids
            v = inv[:, None]
        else:
            v = v[idx.flatten(), ...]
        result += (v, )
    return result
def get_tf_dataset(self, inputs=None):
    """
    Pack a dataset in the way that keras and careless expect.

    Parameters
    ----------
    inputs : tuple (optional)
        If None, self.inputs will be used.
    """
    if inputs is None:
        inputs = self.inputs

    inputs = tuple(inputs)
    iobs = BaseModel.get_intensities(inputs)
    sigiobs = BaseModel.get_uncertainties(inputs)
    packed = (inputs, iobs, sigiobs)
    tfds = tf.data.Dataset.from_tensor_slices(packed)
    return tfds.batch(len(iobs))
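# Illustrative sketch (not part of careless): the (inputs, iobs, sigiobs) packing used by
# get_tf_dataset, shown with tiny made-up arrays. tf.data.Dataset.from_tensor_slices slices
# along the leading axis of every array, so everything packed together must share the same
# length; batching by len(iobs) then yields the whole dataset as a single batch.
def _example_packed_dataset():
    import numpy as np
    import tensorflow as tf
    metadata = np.zeros((4, 3), dtype='float32')   # stand-in for one of the model inputs
    iobs = np.ones((4, 1), dtype='float32')
    sigiobs = np.ones((4, 1), dtype='float32')
    packed = ((metadata,), iobs, sigiobs)
    tfds = tf.data.Dataset.from_tensor_slices(packed).batch(len(iobs))
    return next(iter(tfds))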
def test_laue_NormalLikelihood(laue_inputs):
    likelihood = NormalLikelihood()(laue_inputs)
    iobs = BaseModel.get_intensities(laue_inputs)
    sigiobs = BaseModel.get_uncertainties(laue_inputs)
    ipred = fake_ipred(laue_inputs)

    l_true = tfd.Normal(iobs, sigiobs)

    iconv = likelihood.convolve(ipred)

    test = likelihood.log_prob(ipred).numpy()
    expected = l_true.log_prob(iobs).numpy()

    assert np.array_equal(expected.shape, test.T.shape)
    assert np.allclose(expected, test.T)

    # Test batches larger than 1
    ipred = np.concatenate((ipred, ipred, ipred), axis=0)
    likelihood.convolve(ipred).numpy()
    test = likelihood.log_prob(ipred).numpy()
    assert np.array_equiv(expected, test.T)
def pack_inputs(self, inputs_dict):
    """
    Pack a dictionary of arrays into the ordered tuple expected by careless models.

    Parameters
    ----------
    inputs_dict : dict
        {k: v} where k corresponds to one of
        careless.models.base.BaseModel.input_index.keys()
    """
    inputs = ()
    for i in range(len(BaseModel.input_index)):
        k = BaseModel.get_name_by_index(i)
        if k in inputs_dict:
            inputs += (inputs_dict[k], )
        else:
            break
    return inputs
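# Illustrative sketch (not part of careless): the contract behind pack_inputs. Packing stops
# at the first missing key, so optional trailing entries can simply be omitted. The index
# mapping below is hypothetical; the real ordering lives in
# careless.models.base.BaseModel.input_index.
def _example_pack_inputs():
    input_index = {'refl_id': 0, 'image_id': 1, 'metadata': 2}   # hypothetical ordering
    inputs_dict = {'refl_id': [0, 1], 'image_id': [0, 0]}        # 'metadata' omitted
    inputs = ()
    for name in sorted(input_index, key=input_index.get):
        if name not in inputs_dict:
            break
        inputs += (inputs_dict[name],)
    return inputs   # -> ([0, 1], [0, 0])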
def split_laue_data_by_mask(self, test_idx):
    """
    Method for splitting laue data given a boolean mask. This method will split
    up the data and alter the harmonic_id column to reflect the decrease in
    size of the arrays.

    Parameters
    ----------
    test_idx : array (boolean)
        Boolean array with the same length as the inputs.

    Returns
    -------
    train : tuple
    test : tuple
    """
    harmonic_id = BaseModel.get_harmonic_id(self.inputs)

    # Check that the boolean mask is valid for these data.
    # If it does not split up any harmonic observations, isect will be empty.
    isect = np.intersect1d(
        harmonic_id[test_idx].flatten(),
        harmonic_id[~test_idx].flatten(),
    )
    if len(isect) > 0:
        raise ValueError(
            f"test_idx splits harmonic observations with harmonic_id : {isect}"
        )

    def split(inputs, idx):
        harmonic_id = BaseModel.get_harmonic_id(inputs)
        result = ()
        # Re-label the surviving harmonic ids onto a dense 0..n-1 range
        uni, inv = np.unique(harmonic_id[idx], return_inverse=True)
        for i, v in enumerate(inputs):
            name = BaseModel.get_name_by_index(i)
            if name in ('intensities', 'uncertainties'):
                # Keep the rows indexed by the surviving harmonic ids and pad back
                # to the masked length so all arrays in the tuple stay the same size
                v = v[uni]
                v = np.pad(v, [[0, len(inv) - len(v)], [0, 0]], constant_values=1.)
            elif name == 'harmonic_id':
                # Replace with the re-labeled, dense harmonic ids
                v = inv[:, None]
            else:
                v = v[idx.flatten(), ...]
            result += (v, )
        return result

    return split(self.inputs, ~test_idx), split(self.inputs, test_idx)
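# Illustrative sketch (not part of careless): how the split helper above re-labels the
# harmonic_id column after masking. np.unique(..., return_inverse=True) maps the surviving
# ids onto a dense 0..n-1 range so they can index the shortened per-observation arrays.
# The ids below are made up.
def _example_harmonic_relabel():
    import numpy as np
    harmonic_id = np.array([0, 0, 2, 2, 3])   # after masking, id 1 was dropped
    uni, inv = np.unique(harmonic_id, return_inverse=True)
    assert (uni == [0, 2, 3]).all()            # rows to keep from the original arrays
    assert (inv == [0, 0, 1, 1, 2]).all()      # the new, dense harmonic_id column
    return uni, inv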
def fake_ipred(inputs):
    harmonic_id = BaseModel.get_harmonic_id(inputs).flatten()
    intensities = BaseModel.get_intensities(inputs).flatten()
    # Spread each intensity evenly over the rows that share its harmonic_id
    result = intensities[harmonic_id] / np.bincount(harmonic_id)[harmonic_id]
    return result[None, :].astype('float32')
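# Illustrative sketch (not part of careless): fake_ipred splits each intensity evenly across
# the rows sharing a harmonic_id, so summing the fake predictions within each group recovers
# the original value. Made-up numbers below.
def _example_even_harmonic_split():
    import numpy as np
    harmonic_id = np.array([0, 0, 1])    # two rows share group 0
    intensities = np.array([4.0, 10.0])  # one value per group
    counts = np.bincount(harmonic_id)    # -> [2, 1]
    ipred = intensities[harmonic_id] / counts[harmonic_id]   # -> [2., 2., 10.]
    assert np.allclose(np.bincount(harmonic_id, weights=ipred), intensities)
    return ipred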
def test_is_laue(laue_inputs, mono_inputs):
    assert BaseModel.is_laue(laue_inputs)
    assert not BaseModel.is_laue(mono_inputs)
def get_results(self, surrogate_posterior, inputs=None, output_parameters=True):
    """
    Extract results from a surrogate_posterior.

    Parameters
    ----------
    surrogate_posterior : tfd.Distribution
        A tensorflow_probability distribution or similar object with
        `mean` and `stddev` methods
    inputs : tuple (optional)
        Optionally use a different object from self.inputs to compute the
        redundancy of reflections.
    output_parameters : bool (optional)
        If True, output the parameters of the surrogate distribution in
        addition to the moments.

    Returns
    -------
    results : tuple
        A tuple of rs.DataSet objects containing the results corresponding to
        each ReciprocalASU contained in self.asu_collection
    """
    if inputs is None:
        inputs = self.inputs

    F = surrogate_posterior.mean().numpy()
    SigF = surrogate_posterior.stddev().numpy()

    params = None
    if output_parameters:
        params = {}
        for k in sorted(surrogate_posterior.parameter_properties()):
            v = surrogate_posterior.parameters[k]
            numpify = lambda x: tf.convert_to_tensor(x).numpy()
            params[k] = numpify(v).flatten() * np.ones(len(F), dtype='float32')

    asu_id, H = self.asu_collection.to_asu_id_and_miller_index(np.arange(len(F)))
    h, k, l = H.T

    refl_id = BaseModel.get_refl_id(inputs)
    N = np.bincount(refl_id.flatten(), minlength=len(F)).astype('float32')

    results = ()
    for i, asu in enumerate(self.asu_collection):
        idx = asu_id == i
        idx = idx.flatten()
        output = rs.DataSet(
            {
                'H': h[idx],
                'K': k[idx],
                'L': l[idx],
                'F': F[idx],
                'SigF': SigF[idx],
                'N': N[idx],
            },
            cell=asu.cell,
            spacegroup=asu.spacegroup,
            merged=True,
        ).infer_mtz_dtypes().set_index(['H', 'K', 'L'])

        if params is not None:
            for key in sorted(params.keys()):
                val = params[key]
                output[key] = rs.DataSeries(val[idx], index=output.index, dtype='R')

        # Remove unobserved refls
        output = output[output.N > 0]

        # Reformat anomalous data
        if asu.anomalous:
            output = output.unstack_anomalous()
            # PHENIX will expect the sf / error keys in a particular order.
            anom_keys = ['F(+)', 'SigF(+)', 'F(-)', 'SigF(-)', 'N(+)', 'N(-)']
            reorder = anom_keys + [key for key in output if key not in anom_keys]
            output = output[reorder]

        results += (output, )
    return results
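# Illustrative sketch (not part of careless): the per-reflection redundancy column N in
# get_results is a bincount of refl_id, padded with minlength so reflections that were never
# observed still get an entry (N == 0) and can be filtered out afterwards. Made-up ids below.
def _example_redundancy_count():
    import numpy as np
    refl_id = np.array([0, 0, 2, 2, 2])   # reflection 1 was never observed
    n_refls = 4                           # total number of reflections being merged
    N = np.bincount(refl_id, minlength=n_refls)
    assert (N == [2, 0, 3, 0]).all()
    return N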
def test_getters(laue_inputs, mono_inputs):
    for inputs in laue_inputs, mono_inputs:
        if BaseModel.is_laue(inputs):
            BaseModel.get_harmonic_id(inputs)
            BaseModel.get_wavelength(inputs)
        BaseModel.get_image_id(inputs)
        BaseModel.get_intensities(inputs)
        BaseModel.get_metadata(inputs)
        BaseModel.get_refl_id(inputs)
        BaseModel.get_uncertainties(inputs)
def build_model(self, parser=None, surrogate_posterior=None, prior=None,
                likelihood=None, scaling_model=None, mc_sample_size=None):
    """
    Build the model specified in parser, a careless.parser.parser.parse_args() result.
    Optionally override any of the parameters taken by the VariationalMergingModel
    constructor. The `parser` parameter is required if self.parser is not set.
    """
    from careless.models.merging.surrogate_posteriors import TruncatedNormal
    from careless.models.merging.variational import VariationalMergingModel
    from careless.models.scaling.image import HybridImageScaler, ImageScaler
    from careless.models.scaling.nn import MLPScaler

    if parser is None:
        parser = self.parser
    if parser is None:
        raise ValueError("No parser supplied, but self.parser is unset")

    if parser.type == 'poly':
        from careless.models.likelihoods.laue import NormalLikelihood, StudentTLikelihood
    elif parser.type == 'mono':
        from careless.models.likelihoods.mono import NormalLikelihood, StudentTLikelihood

    if prior is None:
        prior = self.get_wilson_prior(parser.wilson_prior_b)
    loc, scale = prior.mean(), prior.stddev() / 10.
    low = (1e-32 * self.asu_collection.centric).astype('float32')
    if surrogate_posterior is None:
        surrogate_posterior = TruncatedNormal.from_loc_and_scale(loc, scale, low)

    if likelihood is None:
        dof = parser.studentt_likelihood_dof
        if dof is None:
            likelihood = NormalLikelihood()
        else:
            likelihood = StudentTLikelihood(dof)

    if scaling_model is None:
        mlp_width = parser.mlp_width
        if mlp_width is None:
            mlp_width = BaseModel.get_metadata(self.inputs).shape[-1]

        mlp_scaler = MLPScaler(parser.mlp_layers, mlp_width)
        if parser.use_image_scales:
            n_images = np.max(BaseModel.get_image_id(self.inputs)) + 1
            image_scaler = ImageScaler(n_images)
            scaling_model = HybridImageScaler(mlp_scaler, image_scaler)
        else:
            scaling_model = mlp_scaler

    model = VariationalMergingModel(surrogate_posterior, prior, likelihood,
                                    scaling_model, parser.mc_samples)

    opt = tf.keras.optimizers.Adam(
        parser.learning_rate,
        parser.beta_1,
        parser.beta_2,
    )

    model.compile(opt)
    return model
def test_by_index():
    for k, v in BaseModel.input_index.items():
        assert BaseModel.get_index_by_name(k) == v