def test_value_counts_with_normalize(self, data):
    # GH 33172
    data = data[:10].unique()
    values = np.array(data[~data.isna()])

    result = (
        rs.DataSeries(data, dtype=data.dtype)
        .value_counts(normalize=True)
        .sort_index()
    )
    expected = rs.DataSeries([1 / len(values)] * len(values), index=result.index)
    self.assert_series_equal(result, expected)
def test_value_counts(self, all_data, dropna):
    all_data = all_data[:10]
    if dropna:
        other = all_data[~all_data.isna()]
    else:
        other = all_data

    result = rs.DataSeries(all_data).value_counts(dropna=dropna).sort_index()
    expected = rs.DataSeries(other).value_counts(dropna=dropna).sort_index()
    self.assert_series_equal(result, expected)
def test_combine_le(self, data_repeated):
    """
    pd.Series.combine() returns a Series with the original dtype when an
    ExtensionArray is used. This test needed to be updated to reflect that
    behavior.
    """
    orig_data1, orig_data2 = data_repeated(2)
    s1 = rs.DataSeries(orig_data1)
    s2 = rs.DataSeries(orig_data2)
    result = s1.combine(s2, lambda x1, x2: x1 <= x2)
    expected = rs.DataSeries(
        [a <= b for (a, b) in zip(list(orig_data1), list(orig_data2))],
        dtype=s1.dtype,
    )
    self.assert_series_equal(result, expected)
def assign_resolution_bins(self, bins=20, inplace=False, return_labels=True):
    """
    Assign reflections in DataSet to resolution bins.

    Parameters
    ----------
    bins : int
        Number of bins
    inplace : bool
        Whether to add the column in place or return a copy
    return_labels : bool
        Whether to return a list of labels corresponding to the edges
        of each resolution bin

    Returns
    -------
    (DataSet, list) or DataSet
    """
    # Operate on a copy unless modifying in place
    if not inplace:
        self = self.copy()

    dHKL = self.compute_dHKL()["dHKL"]
    assignments, labels = bin_by_percentile(dHKL, bins=bins, ascending=False)
    self["bin"] = rs.DataSeries(assignments, dtype="I", index=self.index)

    if return_labels:
        return self, labels
    else:
        return self
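# Illustrative usage sketch (not part of the original source): assuming `ds` is
# an rs.DataSet that already carries Miller indices and a unit cell, the call
# below would add a "bin" column and return one label per resolution bin.
#
#     ds, labels = ds.assign_resolution_bins(bins=10)
#     print(labels)           # resolution-range strings, lowest resolution first
#     print(ds["bin"].max())  # expected to be 9 for 10 bins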
def test_value_counts(self, all_data, dropna):
    """Rewrite original test to use rs.DataSeries instead of pd.Series"""
    all_data = all_data[:10]
    if dropna:
        other = all_data[~all_data.isna()]
    else:
        other = all_data

    result = rs.DataSeries(all_data).value_counts(dropna=dropna).sort_index()
    expected = rs.DataSeries(other).value_counts(dropna=dropna).sort_index()
    self.assert_series_equal(result, expected)
def compute_dHKL(self, inplace=False):
    """
    Compute the real space lattice plane spacing, d, associated with
    the HKL indices in the object.

    Parameters
    ----------
    inplace : bool
        Whether to add the column in place or return a copy
    """
    # Operate on a copy unless modifying in place
    if not inplace:
        self = self.copy()

    dHKL = compute_dHKL(self.get_hkls(), self.cell)
    self['dHKL'] = rs.DataSeries(dHKL, dtype='R', index=self.index)
    return self
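# Illustrative usage sketch (not part of the original source): for a DataSet
# `ds` with Miller indices and a unit cell attached,
#
#     ds.compute_dHKL(inplace=True)
#     ds["dHKL"].min()   # highest-resolution d-spacing, in Angstroms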
def from_structurefactor(sfs):
    """
    Convert complex structure factors into structure factor amplitudes
    and phases.

    Parameters
    ----------
    sfs : np.ndarray
        Array of complex structure factors to be converted

    Returns
    -------
    (sf, phase) : tuple of DataSeries
        Tuple of DataSeries for the structure factor amplitudes and
        phases corresponding to the provided complex structure factors
    """
    index = None
    if isinstance(sfs, rs.DataSeries):
        index = sfs.index
    sf = rs.DataSeries(np.abs(sfs), index=index, name="F").astype("SFAmplitude")
    phase = rs.DataSeries(np.angle(sfs, deg=True), index=index, name="Phi").astype("Phase")
    return sf, phase
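# Illustrative sketch (not part of the original module): converting a small
# complex array with the function above. Guarded so it only runs when the
# module is executed directly; the input values are arbitrary.
if __name__ == "__main__":
    demo = np.array([1 + 0j, 0 + 2j, -3 + 0j])
    amps, phis = from_structurefactor(demo)
    print(amps.to_numpy())  # [1. 2. 3.]
    print(phis.to_numpy())  # [  0.  90. 180.]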
def test_constructor(data, name, dtype):
    """Test constructor of DataSeries"""
    ds = rs.DataSeries(data, name=name, dtype=dtype)
    assert isinstance(ds, rs.DataSeries)
    assert ds.name == name

    if data is None:
        assert len(ds) == 0
        assert np.array_equal(ds.to_numpy(dtype=float), np.array([], dtype=float))
    else:
        assert len(ds) == len(data)
        assert np.array_equal(ds.to_numpy(dtype=float), np.array(data, dtype=float))
def test_from_friedel_dtype(dtype_all):
    """Test DataSeries.from_friedel_dtype()"""
    ds = rs.DataSeries(np.arange(0, 10), dtype=dtype_all[0]())
    result = ds.from_friedel_dtype()

    expected = dtype_all[0]
    if isinstance(ds.dtype, (rs.StandardDeviationFriedelSFDtype,
                             rs.StandardDeviationFriedelIDtype)):
        expected = rs.StandardDeviationDtype
    elif isinstance(ds.dtype, rs.FriedelIntensityDtype):
        expected = rs.IntensityDtype
    elif isinstance(ds.dtype, rs.FriedelStructureFactorAmplitudeDtype):
        expected = rs.StructureFactorAmplitudeDtype

    assert isinstance(result.dtype, expected)
    assert np.array_equal(result.to_numpy(), np.arange(0, 10))
def test_constructor_expanddim(data, series_name, frame_name, dtype):
    """Test DataSeries.to_frame()"""
    ds = rs.DataSeries(data, name=series_name, dtype=dtype)
    d = ds.to_frame(name=frame_name)

    assert isinstance(d, rs.DataSet)
    assert len(d.columns) == 1
    assert isinstance(d.dtypes[0], type(ds.dtype))

    # Test hierarchy for column naming
    if frame_name:
        assert d.columns[0] == frame_name
    elif series_name:
        assert d.columns[0] == series_name
    else:
        assert d.columns[0] == 0
def compute_multiplicity(self, inplace=False, include_centering=True):
    """
    Compute the multiplicity of reflections in DataSet. A new column of
    integers, "EPSILON", is added to the object.

    Parameters
    ----------
    inplace : bool
        Whether to add the column in place or to return a copy
    include_centering : bool
        Whether to include centering operations in the multiplicity
        calculation. The default is to include them.
    """
    # Operate on a copy unless modifying in place
    if not inplace:
        self = self.copy()

    epsilon = compute_structurefactor_multiplicity(self.get_hkls(),
                                                   self.spacegroup,
                                                   include_centering)
    self['EPSILON'] = rs.DataSeries(epsilon, dtype='I', index=self.index)
    return self
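# Illustrative usage sketch (not part of the original source): EPSILON can be
# used to normalize intensities by their expected multiplicity; the "I" column
# below is hypothetical.
#
#     ds.compute_multiplicity(inplace=True)
#     ds["I_over_eps"] = ds["I"] / ds["EPSILON"]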
def value_counts(self, dropna=True):
    """
    Returns a DataSeries containing counts of each category.

    Every category will have an entry, even those with a count of 0.

    Parameters
    ----------
    dropna : bool, default True
        Don't include counts of NaN.

    Returns
    -------
    counts : DataSeries
    """
    from pandas import Index

    import reciprocalspaceship as rs

    # Compute counts on the data with no NaNs
    mask = np.isnan(self.data)
    data = self.data[~mask]
    value_counts = Index(data).value_counts()
    array = value_counts.values

    # TODO(extension)
    # If we allow Index to hold an ExtensionArray, this is easier
    index = value_counts.index.astype(object)

    # If we want NaNs, count the mask
    if not dropna:
        # TODO(extension)
        # Appending to an Index *always* infers w/o passing the dtype
        array = np.append(array, [mask.sum()])
        index = Index(
            np.concatenate(
                [index.values, np.array([self.dtype.na_value], dtype=object)]
            ),
            dtype=object,
        )

    return rs.DataSeries(array, index=index)
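# Illustrative behavior sketch (not part of the original source): through
# rs.DataSeries.value_counts this is expected to behave roughly as
#
#     s = rs.DataSeries([1.0, 1.0, 2.0, np.nan], dtype="Intensity")
#     s.value_counts()              # counts for 1.0 and 2.0 only
#     s.value_counts(dropna=False)  # adds a count of 1 for the missing value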
def to_structurefactor(self, sf_key, phase_key):
    """
    Convert structure factor amplitudes and phases to complex structure
    factors.

    Parameters
    ----------
    sf_key : str
        Column label for structure factor amplitudes
    phase_key : str
        Column label for phases

    Returns
    -------
    rs.DataSeries
        Complex structure factors

    See Also
    --------
    DataSet.from_structurefactor : Convert complex structure factor to amplitude and phase
    """
    sfs = utils.to_structurefactor(self[sf_key], self[phase_key])
    return rs.DataSeries(sfs, index=self.index)
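# Illustrative usage sketch (not part of the original source): assuming `ds`
# carries amplitude and phase columns named "F" and "PHI" (hypothetical labels),
#
#     sfs = ds.to_structurefactor("F", "PHI")
#     sfs.dtype   # complex values, indexed like ds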
import pytest
import unittest
import numpy as np

import reciprocalspaceship as rs


@pytest.fixture(params=[
    50.0,
    -50.0,
    250.0,
    -250.0,
    np.array([50., -50., 250., -250.]),
    rs.DataSeries([50., -50., 250., -250.], dtype="Phase"),
])
def phase_deg(request):
    """Yields phases (in degrees) for testing"""
    return request.param


@pytest.mark.parametrize("deg", [True, False])
def test_canonicalize_phases(phase_deg, deg):
    # Test canonicalize_phases
    expected_phase = ((phase_deg + 180.) % 360.) - 180.

    if not deg:
        phase_deg = np.deg2rad(phase_deg)
        expected_phase = np.deg2rad(expected_phase)

    p = rs.utils.canonicalize_phases(phase_deg, deg)
    if isinstance(p, rs.DataSeries):
        p = p.to_numpy(np.float32)
        expected_phase = expected_phase.to_numpy(np.float32)

    assert np.allclose(p, expected_phase)
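# Supplementary sketch (not from the original test suite): a couple of spot
# checks of the wrap-to-[-180, 180) convention exercised above.
def test_canonicalize_phases_spot_checks():
    assert np.isclose(rs.utils.canonicalize_phases(270.0, True), -90.0)
    assert np.isclose(rs.utils.canonicalize_phases(-181.0, True), 179.0)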
def scale_merged_intensities(ds, intensity_key, sigma_key, output_columns=None,
                             dropna=True, inplace=False,
                             mean_intensity_method="isotropic", bins=100,
                             bw=2.0):
    """
    Scales merged intensities using Bayesian statistics in order to
    estimate structure factor amplitudes. This method is based on the
    approach by French and Wilson [1]_, and is useful for improving the
    estimates of negative and small intensities in order to ensure that
    structure factor moduli are strictly positive.

    The mean and standard deviation of acentric reflections are computed
    analytically from a truncated normal distribution. The mean and
    standard deviation for centric reflections are computed by numerical
    integration of the posterior intensity distribution under a Wilson
    prior, and then by interpolation with a kernel smoother.

    Notes
    -----
    This method follows the same approach as French and Wilson, with the
    following modifications:

    * Numerical integration under a Wilson prior is used to estimate the
      mean and standard deviation of centric reflections at runtime,
      rather than using precomputed results and a look-up table.
    * The same procedure is used for all centric reflections; the
      original work handled high-intensity centric reflections
      differently.

    Parameters
    ----------
    ds : DataSet
        Input DataSet containing columns with intensity_key and sigma_key
        labels
    intensity_key : str
        Column label for intensities to be scaled
    sigma_key : str
        Column label for error estimates of intensities being scaled
    output_columns : list or tuple of column names
        Column labels to be added to ds for recording scaled I, SigI, F,
        and SigF, respectively. output_columns must have len=4.
    dropna : bool
        Whether to drop reflections with NaNs in intensity_key or
        sigma_key columns
    inplace : bool
        Whether to modify the DataSet in place or create a copy
    mean_intensity_method : str ["isotropic" or "anisotropic"]
        If "isotropic", mean intensity is determined by resolution bin.
        If "anisotropic", mean intensity is determined by Miller index
        using provided bandwidth.
    bins : int or array
        Either an integer number of n bins, or an array of bin edges
        with shape==(n, 2). Only affects output if mean_intensity_method
        is "isotropic".
    bw : float
        Bandwidth to use for computing anisotropic mean intensity. This
        parameter controls the distance that each reflection impacts in
        reciprocal space. Only affects output if mean_intensity_method
        is "anisotropic".

    Returns
    -------
    DataSet
        DataSet with 4 additional columns corresponding to scaled I,
        SigI, F, and SigF.

    References
    ----------
    .. [1] French S. and Wilson K. "On the Treatment of Negative
       Intensity Observations," Acta Cryst. A34 (1978).
    """
    if not inplace:
        ds = ds.copy()

    # Sanitize input or check for invalid reflections
    if dropna:
        ds.dropna(subset=[intensity_key, sigma_key], inplace=True)
    else:
        if ds[[intensity_key, sigma_key]].isna().to_numpy().any():
            raise ValueError(
                f"Input {ds.__class__.__name__} contains NaNs "
                f"in columns '{intensity_key}' and/or '{sigma_key}'. "
                f"Please fix these input values, or run with dropna=True")

    # Accessory columns needed for algorithm
    if 'dHKL' not in ds:
        ds.compute_dHKL(inplace=True)
    if 'CENTRIC' not in ds:
        ds.label_centrics(inplace=True)

    if output_columns is None:
        output_columns = ["FW-I", "FW-SIGI", "FW-F", "FW-SIGF"]
    outputI, outputSigI, outputF, outputSigF = output_columns

    multiplicity = ds.compute_multiplicity().EPSILON.to_numpy()

    # Input data for posterior calculations
    I, Sig = ds[intensity_key].to_numpy(), ds[sigma_key].to_numpy()

    if mean_intensity_method == "isotropic":
        dHKL = ds['dHKL'].to_numpy(dtype=np.float64)
        Sigma = mean_intensity_by_resolution(I / multiplicity, dHKL, bins) * multiplicity
    elif mean_intensity_method == "anisotropic":
        Sigma = mean_intensity_by_miller_index(I / multiplicity, ds.get_hkls(), bw) * multiplicity

    # Initialize outputs
    ds[outputI] = 0.
    ds[outputSigI] = 0.

    mean_I, std_I, mean_F, std_F = _french_wilson_posterior_quad(
        ds[intensity_key].to_numpy(), ds[sigma_key].to_numpy(), Sigma,
        ds.CENTRIC.to_numpy())

    # Convert dtypes of columns to MTZDtypes
    ds[outputI] = rs.DataSeries(mean_I, index=ds.index, dtype="Intensity")
    ds[outputSigI] = rs.DataSeries(std_I, index=ds.index, dtype="Stddev")
    ds[outputF] = rs.DataSeries(mean_F, index=ds.index, dtype="SFAmplitude")
    ds[outputSigF] = rs.DataSeries(std_F, index=ds.index, dtype="Stddev")

    return ds
    assert data_fmodel.index.names != index_names
    for c in columns:
        if drop:
            assert c not in result.columns
            assert c not in data_fmodel.columns
        else:
            assert c in result.columns
            assert c not in data_fmodel.columns
    assert cache == list(result._index_dtypes.keys())
    assert cache != list(data_fmodel._index_dtypes.keys())


@pytest.mark.parametrize(
    "keys",
    [["H", "K", "L"],
     ["H"],
     "H",
     np.arange(168),
     [np.arange(168)],
     ["H", np.arange(168), "K"],
     rs.DataSeries(np.arange(168), name="temp"),
     [
         rs.DataSeries(np.arange(168), name="temp", dtype="I"),
         rs.DataSeries(np.arange(168), name="temp2", dtype="I")
     ]])
def test_set_index_cache(data_fmodel, keys):
    """
    Test DataSet.set_index() correctly sets DataSet._index_dtypes attribute

    Note
    ----
    There are 168 rows in data_fmodel
    """
    temp = data_fmodel.reset_index()
    result = temp.set_index(keys)
@pytest.fixture
def na_cmp():
    return lambda x, y: pd.isna(x) and pd.isna(y)


@pytest.fixture(params=[True, False])
def box_in_series(request):
    """Whether to box the data in a Series"""
    return request.param


@pytest.fixture(
    params=[
        lambda x: 1,
        lambda x: [1] * len(x),
        lambda x: rs.DataSeries([1] * len(x)),
        lambda x: x,
    ],
    ids=["scalar", "list", "series", "object"],
)
def groupby_apply_op(request):
    """
    Functions to test groupby.apply().
    """
    return request.param


@pytest.fixture(params=["ffill", "bfill"])
def fillna_method(request):
    """
    Parametrized fixture giving method parameters 'ffill' and 'bfill' for
    Series.fillna(method=<method>) testing.
    """
    return request.param
def run_careless(parser):
    # We defer all imports to make sure the parser has priority in modifying tf parameters
    import tensorflow as tf
    import numpy as np
    import reciprocalspaceship as rs

    from careless.io.manager import DataManager
    from careless.io.formatter import MonoFormatter, LaueFormatter
    from careless.models.base import BaseModel
    from careless.models.merging.surrogate_posteriors import TruncatedNormal
    from careless.models.merging.variational import VariationalMergingModel
    from careless.models.scaling.image import HybridImageScaler, ImageScaler
    from careless.models.scaling.nn import MLPScaler

    if parser.type == 'poly':
        df = LaueFormatter.from_parser(parser)
    elif parser.type == 'mono':
        df = MonoFormatter.from_parser(parser)

    inputs, rac = df.format_files(parser.reflection_files)
    dm = DataManager(inputs, rac, parser=parser)

    if parser.test_fraction is not None:
        train, test = dm.split_data_by_refl(parser.test_fraction)
    else:
        train, test = dm.inputs, None

    model = dm.build_model()
    history = model.train_model(
        tuple(map(tf.convert_to_tensor, train)),
        parser.iterations,
        message="Training",
    )

    for i, ds in enumerate(dm.get_results(model.surrogate_posterior, inputs=train)):
        filename = parser.output_base + f'_{i}.mtz'
        ds.write_mtz(filename)

    filename = parser.output_base + '_history.csv'
    history = rs.DataSet(history).to_csv(filename, index_label='step')

    model.save_weights(parser.output_base + '_weights')

    import pickle
    with open(parser.output_base + "_data_manager.pickle", "wb") as out:
        pickle.dump(dm, out)

    predictions_data = None
    if test is not None:
        for file_id, (ds_train, ds_test) in enumerate(zip(
                dm.get_predictions(model, train),
                dm.get_predictions(model, test),
                )):
            ds_train['test'] = rs.DataSeries(0, index=ds_train.index, dtype='I')
            ds_test['test'] = rs.DataSeries(1, index=ds_test.index, dtype='I')
            filename = parser.output_base + f'_predictions_{file_id}.mtz'
            ds_train.append(ds_test).write_mtz(filename)
    else:
        for file_id, ds_train in enumerate(dm.get_predictions(model, train)):
            ds_train['test'] = rs.DataSeries(0, index=ds_train.index, dtype='I')
            filename = parser.output_base + f'_predictions_{file_id}.mtz'
            ds_train.write_mtz(filename)

    if parser.merge_half_datasets:
        scaling_model = model.scaling_model
        scaling_model.trainable = False
        xval_data = [None] * len(dm.asu_collection)
        for repeat in range(parser.half_dataset_repeats):
            for half_id, half in enumerate(dm.split_data_by_image()):
                model = dm.build_model(scaling_model=scaling_model)
                history = model.train_model(
                    tuple(map(tf.convert_to_tensor, half)),
                    parser.iterations,
                    message=f"Merging repeat {repeat+1} half {half_id+1}",
                )
                for file_id, ds in enumerate(dm.get_results(model.surrogate_posterior, inputs=half)):
                    ds['repeat'] = rs.DataSeries(repeat, index=ds.index, dtype='I')
                    ds['half'] = rs.DataSeries(half_id, index=ds.index, dtype='I')
                    if xval_data[file_id] is None:
                        xval_data[file_id] = ds
                    else:
                        xval_data[file_id] = xval_data[file_id].append(ds)

        for file_id, ds in enumerate(xval_data):
            filename = parser.output_base + f'_xval_{file_id}.mtz'
            ds.write_mtz(filename)

    if parser.embed:
        from IPython import embed
        embed(colors='Linux')
def get_results(self, surrogate_posterior, inputs=None, output_parameters=True):
    """
    Extract results from a surrogate_posterior.

    Parameters
    ----------
    surrogate_posterior : tfd.Distribution
        A tensorflow_probability distribution or similar object with
        `mean` and `stddev` methods
    inputs : tuple (optional)
        Optionally use a different object from self.inputs to compute
        the redundancy of reflections.
    output_parameters : bool (optional)
        If True, output the parameters of the surrogate distribution in
        addition to the moments.

    Returns
    -------
    results : tuple
        A tuple of rs.DataSet objects containing the results
        corresponding to each ReciprocalASU contained in
        self.asu_collection
    """
    if inputs is None:
        inputs = self.inputs

    F = surrogate_posterior.mean().numpy()
    SigF = surrogate_posterior.stddev().numpy()

    params = None
    if output_parameters:
        params = {}
        for k in sorted(surrogate_posterior.parameter_properties()):
            v = surrogate_posterior.parameters[k]
            numpify = lambda x: tf.convert_to_tensor(x).numpy()
            params[k] = numpify(v).flatten() * np.ones(len(F), dtype='float32')

    asu_id, H = self.asu_collection.to_asu_id_and_miller_index(np.arange(len(F)))
    h, k, l = H.T

    refl_id = BaseModel.get_refl_id(inputs)
    N = np.bincount(refl_id.flatten(), minlength=len(F)).astype('float32')

    results = ()
    for i, asu in enumerate(self.asu_collection):
        idx = asu_id == i
        idx = idx.flatten()

        output = rs.DataSet(
            {
                'H': h[idx],
                'K': k[idx],
                'L': l[idx],
                'F': F[idx],
                'SigF': SigF[idx],
                'N': N[idx],
            },
            cell=asu.cell,
            spacegroup=asu.spacegroup,
            merged=True,
        ).infer_mtz_dtypes().set_index(['H', 'K', 'L'])

        if params is not None:
            for key in sorted(params.keys()):
                val = params[key]
                output[key] = rs.DataSeries(val[idx], index=output.index, dtype='R')

        # Remove unobserved refls
        output = output[output.N > 0]

        # Reformat anomalous data
        if asu.anomalous:
            output = output.unstack_anomalous()
            # PHENIX will expect the sf / error keys in a particular order.
            anom_keys = ['F(+)', 'SigF(+)', 'F(-)', 'SigF(-)', 'N(+)', 'N(-)']
            reorder = anom_keys + [key for key in output if key not in anom_keys]
            output = output[reorder]

        results += (output, )

    return results
import pytest
import numpy as np
import reciprocalspaceship as rs
import gemmi


@pytest.mark.parametrize(
    "sfs_phases",
    [(np.random.rand(10), np.random.rand(10)),
     (list(np.random.rand(10)), list(np.random.rand(10))),
     ([], []),
     (1.0, 90.),
     (rs.DataSeries(np.linspace(1, 20, 10), name="F", dtype="SFAmplitude"),
      rs.DataSeries(np.random.rand(10), name="Phi", dtype="Phase"))])
def test_to_structurefactor(sfs_phases):
    """
    Test rs.utils.to_structurefactor() returns complex structure factors
    when given amplitudes and phases.
    """
    sfamps = sfs_phases[0]
    phases = sfs_phases[1]
    sfs = rs.utils.to_structurefactor(sfamps, phases)

    # Handle DataSeries
    if isinstance(sfamps, rs.DataSeries):
        sfamps = sfamps.to_numpy()
    if isinstance(phases, rs.DataSeries):
        phases = phases.to_numpy()

    reference = sfamps * np.exp(1j * np.deg2rad(phases))

    assert np.iscomplexobj(sfs)
    assert np.isclose(sfs, reference).all()
import pytest
import reciprocalspaceship as rs
from pandas.testing import assert_series_equal


@pytest.mark.parametrize("dataseries", [
    (rs.DataSeries(range(10), dtype=rs.PhaseDtype()), "P"),
    (rs.DataSeries(range(10), dtype=rs.HKLIndexDtype()), "H"),
    (rs.DataSeries(range(10), name="Phi", dtype=rs.PhaseDtype()), "P"),
    (rs.DataSeries(range(10), name=None), "I"),
    (rs.DataSeries(range(10), name=None, dtype=float), "R"),
    (rs.DataSeries(range(10), name="blah", dtype=float), "R"),
    (rs.DataSeries(range(10), name=None), "I"),
    (rs.DataSeries(range(10), name=None, dtype=float), "R"),
    (rs.DataSeries(["h"] * 3, name=None, dtype=object), object),
    (rs.DataSeries(["h"] * 3, name="blah", dtype=object), object),
    (rs.DataSeries(range(10), name="H"), "H"),
    (rs.DataSeries(range(10), name="K"), "H"),
    (rs.DataSeries(range(10), name="L"), "H"),
    (rs.DataSeries(range(10), name="I"), "J"),
    (rs.DataSeries(range(10), name="IMEAN"), "J"),
    (rs.DataSeries(range(10), name="SIGIMEAN"), "Q"),
    (rs.DataSeries(range(10), name="SIGI"), "Q"),
    (rs.DataSeries(range(10), name="SigI"), "Q"),
    (rs.DataSeries(range(10), name="SigF"), "Q"),
    (rs.DataSeries(range(10), name="SIGF"), "Q"),
    (rs.DataSeries(range(10), name="F"), "F"),
    (rs.DataSeries(range(10), name="F-obs"), "F"),
    (rs.DataSeries(range(10), name="ANOM"), "F"),
    (rs.DataSeries(range(10), name="PHANOM"), "P"),
    (rs.DataSeries(range(10), name="PHI"), "P"),
def test_float_nan_conversion(data_int, dtype_floats):
    """Test that float dtypes can support conversion of data with NaNs"""
    x = rs.DataSeries(data_int)
    x.iloc[0] = pd.NaT
    x = x.astype(dtype_floats[0]())
    assert x.isna()[0]
def test_astype_singleletter(dtype_all):
    """Test DataSeries.astype() with single-letter mtztype"""
    expected = rs.DataSeries(np.arange(0, 100), dtype=dtype_all[0]())
    result = expected.astype(expected.dtype.mtztype)
    assert_series_equal(result, expected)
def test_astype_name(dtype_all):
    """Test DataSeries.astype() with name"""
    expected = rs.DataSeries(np.arange(0, 100), dtype=dtype_all[0]())
    result = expected.astype(expected.dtype.name)
    assert_series_equal(result, expected)
    assert expected.dtype.name == str(result.dtype)