def read_clinical_data(clinical_data_directory: PathLike) -> DataFrame:
    import pandas as pd

    dataframe = find_clinical_data(clinical_data_directory)
    if dataframe is None:
        raise FileNotFoundError("Clinical data not found")
    # Compute participant and session IDs.
    dataframe = dataframe.rename_axis(
        index={"loni_id": "participant_id", "visit_number": "session_id"}
    )
    dataframe.index = dataframe.index.map(
        lambda x: (
            f"sub-NIFD{x[0].replace('_', '')}",
            f"ses-M{(6 * (x[1] - 1)):02d}",
        )
    )
    # Keep relevant columns and rename them.
    dataframe = (
        dataframe[["dx", "site", "education", "race", "cdr_box_score", "mmse_tot"]]
        .rename(columns={"dx": "diagnosis", "cdr_box_score": "cdr", "mmse_tot": "mmse"})
        .astype(
            dtype={
                "diagnosis": pd.CategoricalDtype(
                    ["BV", "CON", "L_SD", "PATIENT (OTHER)", "PNFA", "SV"]
                ),
                "site": pd.CategoricalDtype(["UCSF", "MAYO", "MGH"]),
                "education": pd.Int64Dtype(),
                "race": pd.Int64Dtype(),
                "cdr": pd.Float64Dtype(),
                "mmse": pd.Float64Dtype(),
            }
        )
        .replace({"education": {99: pd.NA}, "race": {50: pd.NA, 99: pd.NA}})
    )
    # Keep positive MMSE values only.
    dataframe.mmse = dataframe.mmse.mask(dataframe.mmse < 0)
    return dataframe
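A minimal sketch of the index transformation above, with invented LONI IDs (not from the original source): rename_axis renames the MultiIndex levels, then index.map builds BIDS-style participant and session IDs, spacing visits six months apart.

import pandas as pd

toy = pd.DataFrame(
    {"dx": ["CON", "BV"]},
    index=pd.MultiIndex.from_tuples(
        [("1_S_0001", 1), ("1_S_0001", 2)], names=["loni_id", "visit_number"]
    ),
)
toy = toy.rename_axis(index={"loni_id": "participant_id", "visit_number": "session_id"})
toy.index = toy.index.map(
    lambda x: (f"sub-NIFD{x[0].replace('_', '')}", f"ses-M{(6 * (x[1] - 1)):02d}")
)
print(toy.index.tolist())  # [('sub-NIFD1S0001', 'ses-M00'), ('sub-NIFD1S0001', 'ses-M06')]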
def test_dwd_observation_data_result_missing_data():
    """Test for DataFrame having empty values for dates where the station should not have values"""
    Settings.tidy = True
    Settings.humanize = True
    Settings.si_units = True

    request = DwdObservationRequest(
        parameter=[DwdObservationDataset.CLIMATE_SUMMARY],
        resolution=DwdObservationResolution.DAILY,
        start_date="1933-12-27",  # few days before official start
        end_date="1934-01-04",  # few days after official start
    ).filter_by_station_id(station_id=[1048])

    # Leave only one column to potentially contain NaN, which is VALUE
    df = request.values.all().df.drop("quality", axis=1)
    df_1933 = df[df["date"].dt.year == 1933]
    df_1934 = df[df["date"].dt.year == 1934]

    assert not df_1933.empty and df_1933.dropna().empty
    assert not df_1934.empty and not df_1934.dropna().empty

    request = DwdObservationRequest(
        parameter=DwdObservationParameter.HOURLY.TEMPERATURE_AIR_MEAN_200,
        resolution=DwdObservationResolution.HOURLY,
        start_date="2020-06-09 12:00:00",  # no data at this time (reason unknown)
        end_date="2020-06-09 12:00:00",
    ).filter_by_station_id(station_id=["03348"])

    df = request.values.all().df

    assert_frame_equal(
        df,
        pd.DataFrame(
            {
                "station_id": pd.Categorical(["03348"]),
                "dataset": pd.Categorical(["temperature_air"]),
                "parameter": pd.Categorical(["temperature_air_mean_200"]),
                "date": [datetime(2020, 6, 9, 12, 0, 0, tzinfo=pytz.UTC)],
                "value": pd.Series([pd.NA], dtype=pd.Float64Dtype()).astype(float),
                "quality": pd.Series([pd.NA], dtype=pd.Float64Dtype()).astype(float),
            }
        ),
        check_categorical=False,
    )
def test_astype(self):
    pdf, psdf = self.pdf, self.psdf
    for col in self.numeric_df_cols:
        pser, psser = pdf[col], psdf[col]
        self.assert_eq(pser.astype(int), psser.astype(int))
        self.assert_eq(pser.astype(float), psser.astype(float))
        self.assert_eq(pser.astype(np.float32), psser.astype(np.float32))
        self.assert_eq(pser.astype(np.int32), psser.astype(np.int32))
        self.assert_eq(pser.astype(np.int16), psser.astype(np.int16))
        self.assert_eq(pser.astype(np.int8), psser.astype(np.int8))
        self.assert_eq(pser.astype(str), psser.astype(str))
        self.assert_eq(pser.astype(bool), psser.astype(bool))
        self.assert_eq(pser.astype("category"), psser.astype("category"))
        cat_type = CategoricalDtype(categories=[2, 1, 3])
        self.assert_eq(pser.astype(cat_type), psser.astype(cat_type))
    self.assertRaisesRegex(
        ValueError,
        "Cannot convert fractions with missing values to integer",
        lambda: self.float_withnan_psser.astype(int),
    )
    self.assertRaisesRegex(
        ValueError,
        "Cannot convert fractions with missing values to integer",
        lambda: self.float_withnan_psser.astype(np.int32),
    )
    # Compare against the plain pandas counterparts.
    self.assert_eq(self.float_withnan_pser.astype(str), self.float_withnan_psser.astype(str))
    self.assert_eq(self.float_withnan_pser.astype(bool), self.float_withnan_psser.astype(bool))
    self.assert_eq(
        self.float_withnan_pser.astype("category"), self.float_withnan_psser.astype("category")
    )
    if extension_object_dtypes_available and extension_float_dtypes_available:
        pser = pd.Series(pd.Categorical([1.0, 2.0, 3.0]), dtype=pd.Float64Dtype())
        psser = ps.from_pandas(pser)
        self.assert_eq(pser.astype(pd.BooleanDtype()), psser.astype(pd.BooleanDtype()))
def test_astype(self):
    pdf, psdf = self.pdf, self.psdf
    for col in self.numeric_df_cols:
        pser, psser = pdf[col], psdf[col]

        for int_type in [int, np.int32, np.int16, np.int8]:
            if not pser.hasnans:
                self.assert_eq(pser.astype(int_type), psser.astype(int_type))
            else:
                self.assertRaisesRegex(
                    ValueError,
                    "Cannot convert %s with missing "
                    "values to integer" % psser._dtype_op.pretty_name,
                    lambda: psser.astype(int_type),
                )

        # TODO(SPARK-37039): the np.nan series.astype(bool) should be True
        if not pser.hasnans:
            self.assert_eq(pser.astype(bool), psser.astype(bool))

        self.assert_eq(pser.astype(float), psser.astype(float))
        self.assert_eq(pser.astype(np.float32), psser.astype(np.float32))
        self.assert_eq(pser.astype(str), psser.astype(str))
        self.assert_eq(pser.astype("category"), psser.astype("category"))
        cat_type = CategoricalDtype(categories=[2, 1, 3])
        self.assert_eq(pser.astype(cat_type), psser.astype(cat_type))

    if extension_object_dtypes_available and extension_float_dtypes_available:
        pser = pd.Series(pd.Categorical([1.0, 2.0, 3.0]), dtype=pd.Float64Dtype())
        psser = ps.from_pandas(pser)
        self.assert_eq(pser.astype(pd.BooleanDtype()), psser.astype(pd.BooleanDtype()))
def _coerce_integers(series: pd.Series) -> pd.Series:
    """Method to parse integers for type coercion.

    Uses pandas.Int64Dtype() to allow missing values."""
    return (
        pd.to_numeric(series, errors="coerce")
        .astype(pd.Float64Dtype())
        .astype(pd.Int64Dtype())
    )
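A hedged usage sketch of the coercion chain in _coerce_integers (sample data invented): the intermediate Float64 cast turns coerced NaNs into pd.NA before the final nullable Int64 cast.

import pandas as pd

s = pd.Series(["1", "2", "oops", None])
coerced = (
    pd.to_numeric(s, errors="coerce")   # "oops"/None become NaN in a float64 series
    .astype(pd.Float64Dtype())          # NaN becomes pd.NA in the nullable float dtype
    .astype(pd.Int64Dtype())            # integral values survive, pd.NA propagates
)
print(coerced.tolist())  # [1, 2, <NA>, <NA>]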
def test_astype_to_floating_array():
    # astype to FloatingArray
    arr = pd.array([0.0, 1.0, None], dtype="Float64")

    result = arr.astype("Float64")
    tm.assert_extension_array_equal(result, arr)
    result = arr.astype(pd.Float64Dtype())
    tm.assert_extension_array_equal(result, arr)

    result = arr.astype("Float32")
    expected = pd.array([0.0, 1.0, None], dtype="Float32")
    tm.assert_extension_array_equal(result, expected)
def _coerce_meta_fields(self, df: pd.DataFrame) -> pd.DataFrame:
    """
    Method that coerces meta fields. Those fields are expected to be found in the
    DataFrame in a columnar shape. These are basically the station id and the date
    fields. Furthermore, if the data is tidied, the parameter and quality columns can
    be found as well. The station id, parameter and quality columns are additionally
    coerced to categories to reduce the memory consumption of the DataFrame.

    :param df: pandas.DataFrame with the "fresh" data
    :return: pandas.DataFrame with meta fields being coerced
    """
    df.loc[:, Columns.STATION_ID.value] = self._parse_station_id(df[Columns.STATION_ID.value]).astype("category")
    df.loc[:, Columns.DATASET.value] = self._coerce_strings(df[Columns.DATASET.value]).astype("category")

    if self.sr.stations.tidy:
        df.loc[:, Columns.PARAMETER.value] = self._coerce_strings(df[Columns.PARAMETER.value]).astype("category")
        df.loc[:, Columns.VALUE.value] = df[Columns.VALUE.value].astype(pd.Float64Dtype()).astype(float)
        df.loc[:, Columns.QUALITY.value] = df[Columns.QUALITY.value].astype(pd.Float64Dtype()).astype(float)

    return df
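The .astype(pd.Float64Dtype()).astype(float) chain above converts columns that may carry pd.NA into plain numpy float64 with NaN; a standalone sketch with invented values:

import numpy as np
import pandas as pd

values = pd.Series([1.5, pd.NA, 2.5], dtype=object)
as_float = values.astype(pd.Float64Dtype()).astype(float)
print(as_float.dtype)         # float64
print(np.isnan(as_float[1]))  # True -- pd.NA became np.nan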
def _fix_int_dtypes(df: pd.DataFrame) -> None:
    """Mutate DataFrame to set dtypes for int columns containing NaN values."""
    for col in df:
        if "float" in df[col].dtype.name and df[col].hasnans:
            # inspect values to determine if dtype of non-null values is int or float
            notna_series = df[col].dropna().values
            if np.equal(notna_series, notna_series.astype(int)).all():
                # set to dtype that retains integers and supports NaNs
                df[col] = np.where(df[col].isnull(), None, df[col])
                df[col] = df[col].astype(pd.Int64Dtype())
            elif np.isclose(notna_series, notna_series.astype(int)).all():
                # set to float dtype that retains floats and supports NaNs
                df[col] = np.where(df[col].isnull(), None, df[col])
                df[col] = df[col].astype(pd.Float64Dtype())
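A quick sketch of the effect of _fix_int_dtypes, assuming the function above is in scope (toy frame, invented values):

import numpy as np
import pandas as pd

df = pd.DataFrame({"ints": [1.0, 2.0, np.nan], "floats": [1.5, np.nan, 2.5]})
_fix_int_dtypes(df)
print(df.dtypes["ints"])    # Int64 -- integral values, the NaN is kept as <NA>
print(df["ints"].tolist())  # [1, 2, <NA>]
print(df.dtypes["floats"])  # float64 -- values are not near-integers, left unchanged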
def read(
    self,
    path,
    columns=None,
    use_nullable_dtypes=False,
    storage_options: StorageOptions = None,
    **kwargs,
) -> DataFrame:
    kwargs["use_pandas_metadata"] = True

    to_pandas_kwargs = {}
    if use_nullable_dtypes:
        import pandas as pd

        mapping = {
            self.api.int8(): pd.Int8Dtype(),
            self.api.int16(): pd.Int16Dtype(),
            self.api.int32(): pd.Int32Dtype(),
            self.api.int64(): pd.Int64Dtype(),
            self.api.uint8(): pd.UInt8Dtype(),
            self.api.uint16(): pd.UInt16Dtype(),
            self.api.uint32(): pd.UInt32Dtype(),
            self.api.uint64(): pd.UInt64Dtype(),
            self.api.bool_(): pd.BooleanDtype(),
            self.api.string(): pd.StringDtype(),
            self.api.float32(): pd.Float32Dtype(),
            self.api.float64(): pd.Float64Dtype(),
        }
        to_pandas_kwargs["types_mapper"] = mapping.get

    manager = get_option("mode.data_manager")
    if manager == "array":
        to_pandas_kwargs["split_blocks"] = True  # type: ignore[assignment]

    path_or_handle, handles, kwargs["filesystem"] = _get_path_or_handle(
        path,
        kwargs.pop("filesystem", None),
        storage_options=storage_options,
        mode="rb",
    )
    try:
        result = self.api.parquet.read_table(
            path_or_handle, columns=columns, **kwargs
        ).to_pandas(**to_pandas_kwargs)
        if manager == "array":
            result = result._as_manager("array", copy=False)
        return result
    finally:
        if handles is not None:
            handles.close()
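This reader backs the public pd.read_parquet entry point; use_nullable_dtypes was exposed there from pandas 1.2 onward (later superseded by dtype_backend in pandas 2.x). A hedged round-trip sketch:

import pandas as pd

df = pd.DataFrame({"n": pd.array([1, None], dtype="Int64")})
df.to_parquet("/tmp/nullable.parquet")

# Round-trips the nullable dtype instead of falling back to float64 + NaN
back = pd.read_parquet("/tmp/nullable.parquet", use_nullable_dtypes=True)
print(back["n"].dtype)  # Int64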
def test_numeric_nullable_dtypes(self):
    dtypes = [
        pd.StringDtype(),
        pd.BooleanDtype(),
        pd.Float64Dtype(),
        pd.Float32Dtype(),
        pd.Int64Dtype(),
        pd.UInt64Dtype(),
        pd.Int32Dtype(),
        pd.UInt32Dtype(),
        pd.Int16Dtype(),
        pd.UInt16Dtype(),
        pd.Int8Dtype(),
        pd.UInt8Dtype(),
        pd.StringDtype(),
    ]
    # TODO: Re-add (".xml", "xml"),
    # TODO: See https://github.com/dmyersturnbull/typed-dfs/issues/46
    for suffix, fn in [
        (".snappy", "parquet"),
        (".feather", "feather"),
        (".csv", "csv"),
        (".tsv", "tsv"),
        (".json", "json"),
        (".xlsx", "xlsx"),
        (".xls", "xls"),
        (".xlsb", "xlsb"),
        (".ods", "ods"),
        (".pickle", "pickle"),
    ]:  # TODO: include xml
        for dtype in dtypes:
            with tmpfile(suffix) as path:
                try:
                    df = Ind2Col2.convert(
                        Ind2Col2(sample_data_ind2_col2_pd_na())
                    ).astype(dtype)
                    assert list(df.index.names) == ["qqq", "rrr"]
                    assert list(df.columns) == ["abc", "xyz"]
                    getattr(df, "to_" + fn)(path)
                    df2 = getattr(Ind2Col2, "read_" + fn)(path)
                    assert list(df2.index.names) == ["qqq", "rrr"]
                    assert list(df2.columns) == ["abc", "xyz"]
                except Exception:
                    logger.error(f"Failed on path {path}, dtype {dtype}")
                    raise
def test_from_pandas():
    dd_dict = {
        'boolean': [True, True, False, None, True],
        'text': ['This', 'is', 'some', 'text', 'so...'],
        'text_missing': pd.Series(['Some', 'parts', None, 'missing', None], dtype='string'),
        'float': [1, 30, -2, 1.5, 0.000],
        'float_missing': [1, None, -2, 1.5, 0.000],
        'float_missing_masked': pd.Series([1, None, -2, 1.5, 0.000], dtype=pd.Float64Dtype()),
        'int_missing': pd.Series([1, None, 5, 1, 10], dtype='Int64'),
        'datetime_1': [pd.NaT,
                       datetime.datetime(2019, 1, 1, 1, 1, 1),
                       datetime.datetime(2019, 1, 1, 1, 1, 1),
                       datetime.datetime(2019, 1, 1, 1, 1, 1),
                       datetime.datetime(2019, 1, 1, 1, 1, 1)],
        'datetime_2': [pd.NaT, None, pd.NaT, pd.NaT, pd.NaT],
        'datetime_3': [pd.Timedelta('1M'), pd.Timedelta('1D'), pd.Timedelta('100M'),
                       pd.Timedelta('2D'), pd.Timedelta('1H')],
        'datetime_4': [pd.Timestamp('2001-1-1 2:2:11'), pd.Timestamp('2001-12'),
                       pd.Timestamp('2001-10-1'), pd.Timestamp('2001-03-1 2:2:11'),
                       pd.Timestamp('2001-1-1 2:2:11')],
        'datetime_5': [datetime.date(2010, 1, 1), datetime.date(2010, 1, 1),
                       datetime.date(2010, 1, 1), datetime.date(2010, 1, 1),
                       datetime.date(2010, 1, 1)],
        'datetime_6': [datetime.time(21, 1, 1), datetime.time(21, 1, 1),
                       datetime.time(21, 1, 1), datetime.time(21, 1, 1),
                       datetime.time(21, 1, 1)],
    }

    # Get pandas dataframe
    pandas_df = pd.DataFrame(dd_dict)
    pandas_df['datetime_7'] = pd.to_timedelta(pandas_df['datetime_2'] - pandas_df['datetime_1'])

    vaex_df = vaex.from_pandas(pandas_df)
    repr_value = repr(vaex_df)
    str_value = str(vaex_df)
    assert 'NaT' in repr_value
    assert 'NaT' in str_value
    assert '--' in repr_value
    assert '--' in str_value
    # string columns are now arrow arrays
    # assert vaex_df.text_missing.is_masked == True
    assert vaex_df.int_missing.is_masked == True
    assert vaex_df.float_missing.is_masked == False
    assert vaex_df.float_missing_masked.is_masked == True
    assert vaex_df.int_missing.tolist() == [1, None, 5, 1, 10]
    assert vaex_df.text_missing.tolist() == ['Some', 'parts', None, 'missing', None]
    assert vaex_df.float_missing.values[[0, 2, 3, 4]].tolist() == [1.0, -2.0, 1.5, 0.0]
    assert np.isnan(vaex_df.float_missing.values[1])
    assert vaex_df.float_missing_masked.tolist() == [1.0, None, -2.0, 1.5, 0.0]
def test_cython_agg_EA_known_dtypes(data, op_name, action, with_na):
    if with_na:
        data[3] = pd.NA

    df = DataFrame({"key": ["a", "a", "b", "b"], "col": data})
    grouped = df.groupby("key")

    if action == "always_int":
        # always Int64
        expected_dtype = pd.Int64Dtype()
    elif action == "large_int":
        # for any int/bool use Int64, for float preserve dtype
        if is_float_dtype(data.dtype):
            expected_dtype = data.dtype
        elif is_integer_dtype(data.dtype):
            # match the numpy dtype we'd get with the non-nullable analogue
            expected_dtype = data.dtype
        else:
            expected_dtype = pd.Int64Dtype()
    elif action == "always_float":
        # for any int/bool use Float64, for float preserve dtype
        if is_float_dtype(data.dtype):
            expected_dtype = data.dtype
        else:
            expected_dtype = pd.Float64Dtype()
    elif action == "preserve":
        expected_dtype = data.dtype

    result = getattr(grouped, op_name)()
    assert result["col"].dtype == expected_dtype

    result = grouped.aggregate(op_name)
    assert result["col"].dtype == expected_dtype

    result = getattr(grouped["col"], op_name)()
    assert result.dtype == expected_dtype

    result = grouped["col"].aggregate(op_name)
    assert result.dtype == expected_dtype
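The dtype expectations encoded in this test can be observed directly on a small masked-array groupby; a sketch of the behavior in recent pandas:

import pandas as pd

df = pd.DataFrame({
    "key": ["a", "a", "b", "b"],
    "col": pd.array([0.1, 0.2, None, 0.4], dtype="Float64"),
})
result = df.groupby("key")["col"].sum()
print(result.dtype)  # Float64 -- the nullable float dtype is preserved through the cython agg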
def _fix_dtypes(df: pd.DataFrame) -> None:
    """
    Mutate DataFrame to set dtypes for float columns containing NaN values.
    Set dtype of object to str to allow for downstream transformations.
    """
    for col in df:
        if df[col].dtype.name == 'object':
            # if the type wasn't identified or converted, change it to a string so it
            # can still be processed.
            df[col] = df[col].astype(str)

        if "float" in df[col].dtype.name and df[col].hasnans:
            # inspect values to determine if dtype of non-null values is int or float
            notna_series = df[col].dropna().values
            if np.equal(notna_series, notna_series.astype(int)).all():
                # set to dtype that retains integers and supports NaNs
                df[col] = np.where(df[col].isnull(), None, df[col])
                df[col] = df[col].astype(pd.Int64Dtype())
            elif np.isclose(notna_series, notna_series.astype(int)).all():
                # set to float dtype that retains floats and supports NaNs
                df[col] = np.where(df[col].isnull(), None, df[col])
                df[col] = df[col].astype(pd.Float64Dtype())
def test_replace_nullable_numeric(self):
    # GH#40732, GH#44940
    floats = pd.Series([1.0, 2.0, 3.999, 4.4], dtype=pd.Float64Dtype())
    assert floats.replace({1.0: 9}).dtype == floats.dtype
    assert floats.replace(1.0, 9).dtype == floats.dtype
    assert floats.replace({1.0: 9.0}).dtype == floats.dtype
    assert floats.replace(1.0, 9.0).dtype == floats.dtype

    res = floats.replace(to_replace=[1.0, 2.0], value=[9.0, 10.0])
    assert res.dtype == floats.dtype

    ints = pd.Series([1, 2, 3, 4], dtype=pd.Int64Dtype())
    assert ints.replace({1: 9}).dtype == ints.dtype
    assert ints.replace(1, 9).dtype == ints.dtype
    assert ints.replace({1: 9.0}).dtype == ints.dtype
    assert ints.replace(1, 9.0).dtype == ints.dtype

    # nullable (for now) raises instead of casting
    with pytest.raises(TypeError, match="Invalid value"):
        ints.replace({1: 9.5})
    with pytest.raises(TypeError, match="Invalid value"):
        ints.replace(1, 9.5)
def test_to_table_nullable(self):
    boolean_array = pd.array([True, False, None], dtype=pd.BooleanDtype())
    int8_array = pd.array([1, 2, None], dtype=pd.Int8Dtype())
    int16_array = pd.array([1, 2, None], dtype=pd.Int16Dtype())
    int32_array = pd.array([1, 2, None], dtype=pd.Int32Dtype())
    int64_array = pd.array([1, 2, None], dtype=pd.Int64Dtype())
    float_array = pd.array([1.1, 2.2, None], dtype=pd.Float32Dtype())
    double_array = pd.array([1.1, 2.2, None], dtype=pd.Float64Dtype())
    string_array = pd.array(["s11", "s22", None], dtype=pd.StringDtype())
    object_array = pd.array([pd.NA, "s22", None], dtype=object)

    df = pd.DataFrame({
        "NullableBoolean": boolean_array,
        "NullableInt8": int8_array,
        "NullableInt16": int16_array,
        "NullableInt32": int32_array,
        "NullableInt64": int64_array,
        "NullableFloat": float_array,
        "NullableDouble": double_array,
        "NullableString": string_array,
        "NullableObject": object_array,
    })

    table = to_table(df)
    self.assertIs(table.columns[0].data_type, dtypes.bool_)
    self.assertIs(table.columns[1].data_type, dtypes.int8)
    self.assertIs(table.columns[2].data_type, dtypes.int16)
    self.assertIs(table.columns[3].data_type, dtypes.int32)
    self.assertIs(table.columns[4].data_type, dtypes.int64)
    self.assertIs(table.columns[5].data_type, dtypes.float32)
    self.assertIs(table.columns[6].data_type, dtypes.double)
    self.assertIs(table.columns[7].data_type, dtypes.string)
    self.assertIs(table.columns[8].data_type, dtypes.PyObject)
    self.assertEqual(table.size, 3)
    table_string = table.to_string()
    self.assertEqual(9, table_string.count("null"))
def run_one(agent1, agent2, game, testset, seed=None):
    sender = agent1
    receiver = agent2
    role_setting = 0

    metrics = "episode role_setting images symbol guess success".split(" ")
    dtypes = [
        pd.Int32Dtype(), bool, object, pd.Int32Dtype(), pd.Int32Dtype(),
        pd.Float64Dtype()
    ]
    test_log = pd.DataFrame(columns=metrics)
    for column, dtype in zip(metrics, dtypes):
        test_log[column] = test_log[column].astype(dtype)

    if seed is not None:
        set_seed(seed)

    episode = 0
    exit_status = "full"
    error = False
    batch_log = {metric: [] for metric in metrics}
    for test in testset:
        episode += 1
        game.reset()
        try:
            # Sender turn
            sender_ids = test["sender_ids"]
            sender_state = game.get_sender_state_from_ids(ids=sender_ids, expand=True)
            sender_probs = np.squeeze(sender.predict(state=sender_state))
            sender_action = sender.choose_action(sender_probs)

            # Receiver turn
            receiver_ids = test["receiver_ids"]
            receiver_pos = test["receiver_pos"]
            receiver_state = game.get_receiver_state_from_ids(
                receiver_ids, receiver_pos, sender_action, expand=True)
            receiver_probs = np.squeeze(receiver.predict(state=receiver_state))
            receiver_action = receiver.choose_action(receiver_probs)
        except Exception as e:
            print("\n", "ERROR", e)
            error = True
            break

        # Evaluate turn and remember
        sender_reward, receiver_reward, success = game.evaluate_guess(receiver_action)
        batch_log["episode"].append(episode)
        batch_log["role_setting"].append(role_setting)
        batch_log["images"].append(sender_ids)
        batch_log["symbol"].append(sender_action)
        batch_log["guess"].append(receiver_action)
        batch_log["success"].append(success)
        if not episode % 200:
            print(f"\r{episode} games played", end="")

    # DataFrame.append was removed in pandas 2.x; concat is the stable equivalent
    test_log = pd.concat([test_log, pd.DataFrame(batch_log)])
    if error:
        return test_log, "error"
    print()
    return test_log, exit_status
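The column-by-column astype over an empty frame is the pattern this function uses to pre-type its log; a compact standalone sketch (invented column set):

import pandas as pd

metrics = ["episode", "success", "reward"]
dtypes = [pd.Int32Dtype(), bool, pd.Float64Dtype()]
log = pd.DataFrame(columns=metrics)
for column, dtype in zip(metrics, dtypes):
    log[column] = log[column].astype(dtype)
print(log.dtypes.tolist())  # [Int32Dtype(), dtype('bool'), Float64Dtype()]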
def add_freq_notes_df(sid, ssfs_df):
    real_freqs = {
        freq: freq * sid.freq_scaler
        for freq in ssfs_df['freq1'].unique() if pd.notna(freq)
    }
    closest_notes = {
        real_freq: closest_midi(real_freq)[1]
        for real_freq in real_freqs.values()
    }
    freq_map = [
        (freq, real_freq, closest_notes[real_freq])
        for freq, real_freq in real_freqs.items()
    ]
    freq_map.extend([(pd.NA, pd.NA, pd.NA)])
    freq_notes_df = pd.DataFrame.from_records(
        freq_map, columns=['freq1', 'real_freq', 'closest_note']
    ).astype(pd.Float64Dtype())
    freq_notes_df['freq1'] = freq_notes_df['freq1'].astype(pd.UInt16Dtype())
    freq_notes_df['closest_note'] = freq_notes_df['closest_note'].astype(pd.UInt8Dtype())
    return set_sid_dtype(ssfs_df).merge(freq_notes_df, how='left', on='freq1')
def run_one(*, out_dir, dataset, number_of_images, embedding_size,
            vocabulary_size, sender_type, temperature, number_of_episodes,
            batch_size, analysis_window, optimizer, memory_sampling_mode,
            algorithm, max_memory, exploration_start, exploration_decay,
            exploration_floor, early_stopping_patience, early_stopping_minimum,
            role_mode, shared_embedding, shared_experience, seed, **kwargs):
    CHECKPOINT_EVERY = 1000
    ERROR_PATIENCE = 5

    # TODO: refactor into settings parser

    # LOAD DATASET
    loaded = False
    try:
        from utils.dataprep import load_emb_pickled
        metadata, embeddings = load_emb_pickled(dataset)
        filenames = metadata.get("fnames")
        categories = metadata.get("categories")
        loaded = True
    except FileNotFoundError:
        loaded = False
    if not loaded:
        from utils.dataprep import load_emb_gz, make_categories
        _, filenames, embeddings = load_emb_gz(dataset)
        categories = make_categories(filenames, sep="\\")
    image_shape = [len(embeddings[0])]

    # CREATE GAME
    game_settings = {
        "images": embeddings,
        "categories": categories,
        "images_filenames": filenames
    }
    from game import Game
    game = Game(**game_settings)

    # SET UP AGENTS
    learning_rate = 0.1
    optimizers = {
        "adam": (optim.Adam, {
            # "amsgrad": True,
            "clipnorm": 1.0
        }),
        "sgd": (optim.SGD, {"clipnorm": 1.0}),
        "adadelta": (optim.Adadelta, {"clipnorm": 1.0}),
        "rmsprop": (optim.RMSprop, {"clipnorm": 1.0})
    }
    agent_settings = {
        "n_images": number_of_images,
        "input_image_shape": image_shape,
        "embedding_size": embedding_size,
        "vocabulary_size": vocabulary_size,
        "temperature": temperature,
        "optimizer": optimizers[optimizer][0](lr=learning_rate, **optimizers[optimizer][1]),
        "sender_type": sender_type,
        # "sender_type": "informed",
        # "n_informed_filters": 20,
        "max_memory": max_memory,
        "exploration_start": exploration_start,
        "exploration_decay": exploration_decay,
        "exploration_floor": exploration_floor
    }
    if role_mode != "switch":
        shared_experience = False

    tensorflow.keras.backend.clear_session()
    if algorithm == "reinforce":
        from agent.reinforce import Sender, Receiver, MultiAgent
    elif algorithm == "qlearning":
        from agent.qlearning import Sender, Receiver, MultiAgent
    else:
        raise ValueError(
            f"Expected 'reinforce' or 'qlearning' algorithm, got '{algorithm}'"
        )

    if role_mode == "switch":
        agent1 = MultiAgent(active_role="sender",
                            shared_embedding=shared_embedding,
                            **agent_settings)
        agent2 = MultiAgent(active_role="receiver",
                            shared_embedding=shared_embedding,
                            **agent_settings)
    elif role_mode == "static":
        agent1 = Sender(**agent_settings)
        agent2 = Receiver(**agent_settings)
    else:
        raise ValueError(
            f"Role mode must be either 'static' or 'switch', not '{role_mode}'"
        )

    metrics = ("episode role_setting images symbol guess success "
               "sender_loss receiver_loss").split(" ")
    if shared_experience:
        metrics.extend(["sender_loss_2", "receiver_loss_2"])
    # note: only the first eight columns get explicit dtypes; any *_loss_2
    # columns keep the default object dtype
    dtypes = [
        pd.Int32Dtype(), bool, object, pd.Int32Dtype(), pd.Int32Dtype(),
        pd.Float64Dtype(), pd.Float64Dtype(), pd.Float64Dtype()
    ]
    training_log = pd.DataFrame(columns=metrics)
    for column, dtype in zip(metrics, dtypes):
        training_log[column] = training_log[column].astype(dtype)

    episode = 0
    early_stopping = EarlyStopping(patience=early_stopping_patience,
                                   min_episodes=early_stopping_minimum)
    set_seed(seed)

    sender = agent1
    receiver = agent2
    role_setting = 0

    next_checkpoint_episode = CHECKPOINT_EVERY
    error_encountered = False
    remaining_errors = ERROR_PATIENCE
    exit_status = "full"
    while episode < number_of_episodes:
        batch_log = {metric: [] for metric in metrics}
        while True:
            episode += 1

            if error_encountered:
                error_encountered = False
                try:
                    print("Loading checkpoint")
                    agent1.load(os.path.join(out_dir, "agent1"))
                    agent2.load(os.path.join(out_dir, "agent2"))
                except:
                    pass

            game.reset()
            try:
                # Sender turn
                sender_state, img_ids = game.get_sender_state(
                    n_images=number_of_images,
                    unique_categories=True,
                    expand=True,
                    return_ids=True)
                sender_probs = np.squeeze(sender.predict(state=sender_state))
                sender_action = sender.choose_action(sender_probs)

                # Receiver turn
                receiver_state = game.get_receiver_state(sender_action, expand=True)
                receiver_probs = np.squeeze(receiver.predict(state=receiver_state))
                receiver_action = receiver.choose_action(receiver_probs)
            except Exception as e:
                print("\n", e)
                error_encountered = True
                remaining_errors -= 1
                if remaining_errors < 0:
                    exit_status = "error"
                    break
                continue

            # Evaluate turn and remember
            sender_reward, receiver_reward, success = game.evaluate_guess(receiver_action)
            sender.remember(state=sender_state,
                            action=np.asarray([sender_action]),
                            action_probs=sender_probs,
                            reward=np.asarray([sender_reward]))
            receiver.remember(state=receiver_state,
                              action=np.asarray([receiver_action]),
                              action_probs=receiver_probs,
                              reward=np.asarray([receiver_reward]))
            if shared_experience:
                receiver.components["sender"].remember(
                    state=sender_state,
                    action=np.asarray([sender_action]),
                    action_probs=sender_probs,
                    reward=np.asarray([sender_reward]))
                sender.components["receiver"].remember(
                    state=receiver_state,
                    action=np.asarray([receiver_action]),
                    action_probs=receiver_probs,
                    reward=np.asarray([receiver_reward]))

            batch_log["episode"].append(episode)
            batch_log["role_setting"].append(role_setting)
            batch_log["images"].append(img_ids)
            batch_log["symbol"].append(sender_action)
            batch_log["guess"].append(receiver_action)
            batch_log["success"].append(success)

            if not episode % 500:
                stats = compute_live_stats(training_log=training_log,
                                           analysis_window=500,
                                           overwrite_line=False)
                if early_stopping.check(episode, stats["mean_success"]):
                    exit_status = "early"
                    break
            if episode % batch_size == 0:
                break
        if exit_status == "error":
            break
        if exit_status == "early":
            break

        # Train on batch
        try:
            # Save before updating
            if episode > next_checkpoint_episode:
                agent1.save(os.path.join(out_dir, "agent1"))
                agent2.save(os.path.join(out_dir, "agent2"))
                next_checkpoint_episode += CHECKPOINT_EVERY

            # Update
            batch_log["sender_loss"] = sender.update_on_batch(
                batch_size, memory_sampling_mode=memory_sampling_mode)
            batch_log["receiver_loss"] = receiver.update_on_batch(
                batch_size, memory_sampling_mode=memory_sampling_mode)
            if shared_experience:
                batch_log["sender_loss_2"] = receiver.components["sender"].update_on_batch(
                    batch_size, memory_sampling_mode=memory_sampling_mode)
                batch_log["receiver_loss_2"] = sender.components["receiver"].update_on_batch(
                    batch_size, memory_sampling_mode=memory_sampling_mode)

            # DataFrame.append was removed in pandas 2.x; concat is the stable equivalent
            training_log = pd.concat([training_log, pd.DataFrame(batch_log)])
        except Exception as e:
            print("\n", e)
            return training_log, "error"

        stats = compute_live_stats(training_log=training_log,
                                   analysis_window=analysis_window)

        if role_mode == "switch":
            sender.switch_role()
            receiver.switch_role()
            sender, receiver = receiver, sender
            role_setting ^= 1
    print()
    if exit_status != "error":
        agent1.save(os.path.join(out_dir, "agent1"))
        agent2.save(os.path.join(out_dir, "agent2"))
    return training_log, exit_status
def test_uses_pandas_na():
    a = pd.array([1, None], dtype=pd.Float64Dtype())
    assert a[1] is pd.NA
pandas_dtypes_to_cudf_dtypes = {
    pd.UInt8Dtype(): np.dtype("uint8"),
    pd.UInt16Dtype(): np.dtype("uint16"),
    pd.UInt32Dtype(): np.dtype("uint32"),
    pd.UInt64Dtype(): np.dtype("uint64"),
    pd.Int8Dtype(): np.dtype("int8"),
    pd.Int16Dtype(): np.dtype("int16"),
    pd.Int32Dtype(): np.dtype("int32"),
    pd.Int64Dtype(): np.dtype("int64"),
    pd.BooleanDtype(): np.dtype("bool_"),
    pd.StringDtype(): np.dtype("object"),
}

if PANDAS_GE_120:
    cudf_dtypes_to_pandas_dtypes[np.dtype("float32")] = pd.Float32Dtype()
    cudf_dtypes_to_pandas_dtypes[np.dtype("float64")] = pd.Float64Dtype()
    pandas_dtypes_to_cudf_dtypes[pd.Float32Dtype()] = np.dtype("float32")
    pandas_dtypes_to_cudf_dtypes[pd.Float64Dtype()] = np.dtype("float64")

SIGNED_INTEGER_TYPES = {"int8", "int16", "int32", "int64"}
UNSIGNED_TYPES = {"uint8", "uint16", "uint32", "uint64"}
INTEGER_TYPES = SIGNED_INTEGER_TYPES | UNSIGNED_TYPES
FLOAT_TYPES = {"float32", "float64"}
SIGNED_TYPES = SIGNED_INTEGER_TYPES | FLOAT_TYPES
NUMERIC_TYPES = SIGNED_TYPES | UNSIGNED_TYPES
DATETIME_TYPES = {
    "datetime64[s]",
    "datetime64[ms]",
    "datetime64[us]",
    "datetime64[ns]",
}
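A small sketch (not from the original module) of how these lookup tables are consumed; pandas extension dtype instances hash by value, so a Series dtype can key the dict directly:

import pandas as pd

# Nullable pandas dtype -> concrete numpy storage dtype
# (the Float64 entry is only present when PANDAS_GE_120 added it above)
storage_dtype = pandas_dtypes_to_cudf_dtypes.get(pd.Float64Dtype())
print(storage_dtype)  # float64

s = pd.Series([1, None], dtype=pd.Int64Dtype())
print(pandas_dtypes_to_cudf_dtypes[s.dtype])  # int64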
def test_digitize():
    # np.nan and pd.NA get digitized to -1, suffix should be added
    df = pd.DataFrame(
        [["chr1", 0, 10, np.nan]],
        columns=["chrom", "start", "end", "value"],
    )
    digitized = saddle.digitize(df, 10, vrange=(-1, 1), digitized_suffix=".test")[0]
    assert -1 == digitized["value.test"].values

    df = pd.DataFrame(
        [["chr1", 0, 10, pd.NA]],
        columns=["chrom", "start", "end", "value"],
    ).astype({"value": pd.Float64Dtype()})
    digitized = saddle.digitize(df, 10, vrange=(-1, 1), digitized_suffix=".test")[0]
    assert -1 == digitized["value.test"].values

    n_bins = 10
    digitized = saddle.digitize(df, n_bins, vrange=(-1, 1))[0]
    # the dtype of the returned column should be a categorical
    assert type(digitized["value.d"].dtype) is pd.core.dtypes.dtypes.CategoricalDtype
    # the number of categories should be equal to the number of bins + 3
    assert (n_bins + 3) == digitized["value.d"].dtype.categories.shape[0]

    df = pd.DataFrame(
        [
            ["chr1", 0, 10, -0.5],
            ["chr1", 10, 20, 0.5],
        ],
        columns=["chrom", "start", "end", "value"],
    )
    # values out of the range should be in the 0 and n+1 bins
    digitized = saddle.digitize(df, n_bins, vrange=(-0.1, 0.1))[0]
    assert 0 == digitized["value.d"].values[0]
    assert (n_bins + 1) == digitized["value.d"].values[1]

    # for an input dataframe of ten elements between -1 and 1,
    # and 5 bins, each bin should have 2 digitized values
    # this test will need an update after input checking
    df_linspace = pd.DataFrame(
        (np.linspace(-1, 1, 10) * np.ones((4,))[:, None]).T,
        columns=["chrom", "start", "end", "value"],
    )
    p = (np.arange(0, 100, 10) * np.ones((2,))[:, None]).T
    p[:, 1] += 10
    df_linspace.iloc[:, 1:3] = p
    df_linspace["chrom"] = "chrX"
    df_linspace = df_linspace.astype({"chrom": "str", "start": int, "end": int})

    x = saddle.digitize(df_linspace, 5, vrange=(-1, 1.001))[0]["value.d"]
    assert (2 == np.histogram(x, np.arange(1, 7))[0]).all()

    # if the bottom and top quantiles are 25 and 75 with 3 bins, then
    # the low outlier and high outlier bins should each have 3 values
    x = saddle.digitize(df_linspace, 1, qrange=(0.25, 0.75))[0]["value.d"]
    assert 3 == np.sum(x == 0)
    assert 3 == np.sum(x == 2)

    # bins[-1] max value should remain in bin N,
    # not get pushed to outlier bin.

    # raises error if not provided with a track
    # (i.e. bedframe with a numeric fourth column)
    df_not_track = pd.DataFrame(
        [["chr1", 20, 40, "non-numeric"]],
        columns=["chrom", "start", "end", "value"],
    )
    with pytest.raises(ValueError):
        saddle.digitize(df_not_track, n_bins, vrange=(0, 2))
    df_not_track = pd.DataFrame(
        [[0, 20, 40, 0]],
        columns=["chrom", "start", "end", "value"],
    )
    with pytest.raises(ValueError):
        saddle.digitize(df_not_track, n_bins, vrange=(0, 2))

    # raises error if both or none of vrange, qrange provided
    with pytest.raises(ValueError):
        saddle.digitize(df, n_bins, vrange=(0, 2), qrange=(0.1, 0.9))
    with pytest.raises(ValueError):
        saddle.digitize(df, n_bins, vrange=None, qrange=None)

    # raises error if vrange lo > hi, qrange lo > hi, or qrange out of (0, 1)
    with pytest.raises(ValueError):
        saddle.digitize(df, n_bins, vrange=(2, 1))
    with pytest.raises(ValueError):
        saddle.digitize(df, n_bins, qrange=(0, 2.1))
    with pytest.raises(ValueError):
        saddle.digitize(df, n_bins, qrange=(0.5, 0.25))
    pd.UInt32Dtype(): SqlTypeName.INTEGER,
    np.uint16: SqlTypeName.SMALLINT,
    pd.UInt16Dtype(): SqlTypeName.SMALLINT,
    np.uint8: SqlTypeName.TINYINT,
    pd.UInt8Dtype(): SqlTypeName.TINYINT,
    np.bool8: SqlTypeName.BOOLEAN,
    pd.BooleanDtype(): SqlTypeName.BOOLEAN,
    np.object_: SqlTypeName.VARCHAR,
    pd.StringDtype(): SqlTypeName.VARCHAR,
    np.datetime64: SqlTypeName.TIMESTAMP,
}

if FLOAT_NAN_IMPLEMENTED:  # pragma: no cover
    _PYTHON_TO_SQL.update({
        pd.Float32Dtype(): SqlTypeName.FLOAT,
        pd.Float64Dtype(): SqlTypeName.FLOAT
    })

# Default mapping between SQL types and python types
# for values
_SQL_TO_PYTHON_SCALARS = {
    "DOUBLE": np.float64,
    "FLOAT": np.float32,
    "DECIMAL": np.float32,
    "BIGINT": np.int64,
    "INTEGER": np.int32,
    "SMALLINT": np.int16,
    "TINYINT": np.int8,
    "BOOLEAN": np.bool8,
    "VARCHAR": str,
    "CHAR": str,
    bit_width: int = 8


###############################################################################
# float
###############################################################################

_register_numpy_numbers(
    builtin_name="float",
    pandera_name="Float",
    sizes=[128, 64, 32, 16] if FLOAT_128_AVAILABLE else [64, 32, 16],
)

if PANDAS_1_2_0_PLUS:

    @Engine.register_dtype(equivalents=[pd.Float64Dtype, pd.Float64Dtype()])
    @immutable
    class FLOAT64(DataType, dtypes.Float):
        """Semantic representation of a :class:`pandas.Float64Dtype`."""

        type = pd.Float64Dtype()
        bit_width: int = 64

    @Engine.register_dtype(equivalents=[pd.Float32Dtype, pd.Float32Dtype()])
    @immutable
    class FLOAT32(FLOAT64):
        """Semantic representation of a :class:`pandas.Float32Dtype`."""

        type = pd.Float32Dtype()
        bit_width: int = 32
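Assuming pandera's pandas engine module path, the register_dtype equivalents above are what let the engine resolve both the dtype class and an instance to the same semantic type; a hedged sketch:

import pandas as pd
from pandera.engines.pandas_engine import Engine

# Both the dtype class and an instance resolve to the registered FLOAT64 type
print(Engine.dtype(pd.Float64Dtype))    # Float64
print(Engine.dtype(pd.Float64Dtype()))  # Float64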
class DataMapping:
    """
    Map primary data between different supported data frameworks, preserving equivalent data types.

    DataMapping is for primary data, to map metadata types and values use
    :py:class:`TypeMapping <tracdap.rt.impl.type_system.TypeMapping>` and
    :py:class:`MetadataCodec <tracdap.rt.impl.type_system.MetadataCodec>`.
    """

    __log = _util.logger_for_namespace(_DataInternal.__module__ + ".DataMapping")

    # Matches TRAC_ARROW_TYPE_MAPPING in ArrowSchema, tracdap-lib-data
    __TRAC_DECIMAL_PRECISION = 38
    __TRAC_DECIMAL_SCALE = 12
    __TRAC_TIMESTAMP_UNIT = "ms"
    __TRAC_TIMESTAMP_ZONE = None

    __TRAC_TO_ARROW_BASIC_TYPE_MAPPING = {
        _meta.BasicType.BOOLEAN: pa.bool_(),
        _meta.BasicType.INTEGER: pa.int64(),
        _meta.BasicType.FLOAT: pa.float64(),
        _meta.BasicType.DECIMAL: pa.decimal128(__TRAC_DECIMAL_PRECISION, __TRAC_DECIMAL_SCALE),
        _meta.BasicType.STRING: pa.utf8(),
        _meta.BasicType.DATE: pa.date32(),
        _meta.BasicType.DATETIME: pa.timestamp(__TRAC_TIMESTAMP_UNIT, __TRAC_TIMESTAMP_ZONE)
    }

    # Check the Pandas dtypes for handling floats are available before setting up the type mapping
    __PANDAS_FLOAT_DTYPE_CHECK = _DataInternal.float_dtype_check()
    __PANDAS_DATETIME_TYPE = pd.to_datetime([]).dtype

    # Only partial mapping is possible, decimal and temporal dtypes cannot be mapped this way
    __ARROW_TO_PANDAS_TYPE_MAPPING = {
        pa.bool_(): pd.BooleanDtype(),
        pa.int8(): pd.Int8Dtype(),
        pa.int16(): pd.Int16Dtype(),
        pa.int32(): pd.Int32Dtype(),
        pa.int64(): pd.Int64Dtype(),
        pa.uint8(): pd.UInt8Dtype(),
        pa.uint16(): pd.UInt16Dtype(),
        pa.uint32(): pd.UInt32Dtype(),
        pa.uint64(): pd.UInt64Dtype(),
        pa.float16(): pd.Float32Dtype(),
        pa.float32(): pd.Float32Dtype(),
        pa.float64(): pd.Float64Dtype(),
        pa.utf8(): pd.StringDtype()
    }

    @staticmethod
    def arrow_to_python_type(arrow_type: pa.DataType) -> type:

        if pa.types.is_boolean(arrow_type):
            return bool

        if pa.types.is_integer(arrow_type):
            return int

        if pa.types.is_floating(arrow_type):
            return float

        if pa.types.is_decimal(arrow_type):
            return decimal.Decimal

        if pa.types.is_string(arrow_type):
            return str

        if pa.types.is_date(arrow_type):
            return dt.date

        if pa.types.is_timestamp(arrow_type):
            return dt.datetime

        raise _ex.ETracInternal(f"No Python type mapping available for Arrow type [{arrow_type}]")

    @classmethod
    def python_to_arrow_type(cls, python_type: type) -> pa.DataType:

        if python_type == bool:
            return pa.bool_()

        if python_type == int:
            return pa.int64()

        if python_type == float:
            return pa.float64()

        if python_type == decimal.Decimal:
            return pa.decimal128(cls.__TRAC_DECIMAL_PRECISION, cls.__TRAC_DECIMAL_SCALE)

        if python_type == str:
            return pa.utf8()

        if python_type == dt.date:
            return pa.date32()

        if python_type == dt.datetime:
            return pa.timestamp(cls.__TRAC_TIMESTAMP_UNIT, cls.__TRAC_TIMESTAMP_ZONE)

        raise _ex.ETracInternal(f"No Arrow type mapping available for Python type [{python_type}]")

    @classmethod
    def trac_to_arrow_type(cls, trac_type: _meta.TypeDescriptor) -> pa.DataType:
        return cls.trac_to_arrow_basic_type(trac_type.basicType)

    @classmethod
    def trac_to_arrow_basic_type(cls, trac_basic_type: _meta.BasicType) -> pa.DataType:

        arrow_type = cls.__TRAC_TO_ARROW_BASIC_TYPE_MAPPING.get(trac_basic_type)

        if arrow_type is None:
            raise _ex.ETracInternal(f"No Arrow type mapping available for TRAC type [{trac_basic_type}]")

        return arrow_type

    @classmethod
    def trac_to_arrow_schema(cls, trac_schema: _meta.SchemaDefinition) -> pa.Schema:

        if trac_schema.schemaType != _meta.SchemaType.TABLE:
            raise _ex.ETracInternal(f"Schema type [{trac_schema.schemaType}] cannot be converted for Apache Arrow")

        arrow_fields = [
            (f.fieldName, cls.trac_to_arrow_basic_type(f.fieldType))
            for f in trac_schema.table.fields
        ]

        return pa.schema(arrow_fields, metadata={})

    @classmethod
    def trac_arrow_decimal_type(cls) -> pa.Decimal128Type:
        return pa.decimal128(cls.__TRAC_DECIMAL_PRECISION, cls.__TRAC_DECIMAL_SCALE)

    @classmethod
    def pandas_datetime_type(cls):
        return cls.__PANDAS_DATETIME_TYPE

    @classmethod
    def view_to_pandas(cls, view: DataView, part: DataPartKey) -> pd.DataFrame:

        deltas = view.parts.get(part)

        # Sanity checks

        if not view.arrow_schema:
            raise _ex.ETracInternal(f"Data view schema not set")

        if not deltas:
            raise _ex.ETracInternal(f"Data view for part [{part.opaque_key}] does not contain any items")

        if len(deltas) == 1:
            return cls.item_to_pandas(deltas[0])

        batches = {
            batch
            for delta in deltas
            for batch in (delta.batches if delta.batches else delta.table.to_batches())
        }

        table = pa.Table.from_batches(batches)  # noqa
        return table.to_pandas()

    @classmethod
    def item_to_pandas(cls, item: DataItem) -> pd.DataFrame:

        if item.pandas is not None:
            return item.pandas.copy()

        if item.table is not None:
            return cls.arrow_to_pandas(item.table)

        if item.batches is not None:
            table = pa.Table.from_batches(item.batches, item.schema)  # noqa
            return cls.arrow_to_pandas(table)

        raise _ex.ETracInternal(f"Data item does not contain any usable data")

    @classmethod
    def arrow_to_pandas(cls, table: pa.Table) -> pd.DataFrame:

        return table.to_pandas(
            ignore_metadata=True,  # noqa
            date_as_object=False,  # noqa
            timestamp_as_object=False,  # noqa
            types_mapper=cls.__ARROW_TO_PANDAS_TYPE_MAPPING.get)

    @classmethod
    def pandas_to_view(cls, df: pd.DataFrame, prior_view: DataView, part: DataPartKey):

        item = cls.pandas_to_item(df, prior_view.arrow_schema)
        return cls.add_item_to_view(prior_view, part, item)

    @classmethod
    def pandas_to_item(cls, df: pd.DataFrame, schema: tp.Optional[pa.Schema]) -> DataItem:

        table = cls.pandas_to_arrow(df, schema)
        return DataItem(table.schema, table)

    @classmethod
    def pandas_to_arrow(cls, df: pd.DataFrame, schema: tp.Optional[pa.Schema] = None) -> pa.Table:

        # Here we convert the whole Pandas df and then pass it to conformance
        # An optimization would be to filter columns before applying conformance
        # To do this, we'd need the case-insensitive field matching logic, including output of warnings

        # Also, note that schema is not applied in from_pandas
        # This is because the conformance logic allows for a wider range of conversions
        # Applying the schema directly would fail for some types where casting is possible

        if len(df) == 0:
            df_schema = pa.Schema.from_pandas(df, preserve_index=False)  # noqa
            table = pa.Table.from_batches(list(), df_schema)  # noqa
        else:
            table = pa.Table.from_pandas(df, preserve_index=False)  # noqa

        # If there is no explicit schema, give back the table exactly as it was received from Pandas
        # There could be an option here to coerce types to the appropriate TRAC standard types
        # E.g. unsigned int 32 -> signed int 64, TRAC standard integer type

        if schema is None:
            return table
        else:
            return DataConformance.conform_to_schema(table, schema, df.dtypes)

    @classmethod
    def add_item_to_view(cls, view: DataView, part: DataPartKey, item: DataItem) -> DataView:

        prior_deltas = view.parts.get(part) or list()
        deltas = [*prior_deltas, item]
        parts = {**view.parts, part: deltas}

        return DataView(view.trac_schema, view.arrow_schema, parts)
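The types_mapper hook used in arrow_to_pandas above is standard pyarrow; a self-contained sketch of the same idea with a single Float64 column:

import pandas as pd
import pyarrow as pa

table = pa.table({"x": pa.array([1.0, None], type=pa.float64())})

# Without a mapper: float64 + NaN; with the mapper: nullable Float64 + <NA>
plain = table.to_pandas()
mapped = table.to_pandas(types_mapper={pa.float64(): pd.Float64Dtype()}.get)
print(plain["x"].dtype, mapped["x"].dtype)  # float64 Float64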