def pyarrow2pandas_extension(
    dtype: pa.DataType,
) -> Optional[pd.api.extensions.ExtensionDtype]:
    """Map a pyarrow data type to the matching pandas extension dtype.

    Args:
        dtype: The pyarrow type to translate.

    Returns:
        A newly constructed pandas extension dtype when ``dtype`` is a signed
        8/16/32/64-bit integer, a boolean or a string type; ``None`` for any
        other pyarrow type.
    """
    # (pyarrow predicate, pandas dtype class) pairs, probed in this order.
    converters = (
        (pa.types.is_int8, pd.Int8Dtype),
        (pa.types.is_int16, pd.Int16Dtype),
        (pa.types.is_int32, pd.Int32Dtype),
        (pa.types.is_int64, pd.Int64Dtype),
        (pa.types.is_boolean, pd.BooleanDtype),
        (pa.types.is_string, pd.StringDtype),
    )
    for matches, extension_cls in converters:
        if matches(dtype):
            return extension_cls()
    return None
def test_pandas_extension_types():
    """Check that ``SeriesSchema`` validates series of pandas extension dtypes.

    Every case is a ``(dtype, series, schema kwargs or None)`` triple. A
    schema is built from the dtype (plus the kwargs, when given) and its
    ``validate`` call must hand back a ``pd.Series``.
    """
    # pylint: disable=no-member
    cases = [
        (
            pd.CategoricalDtype(),
            pd.Series(["a", "a", "b", "b", "c", "c"], dtype="category"),
            None,
        ),
        (
            pd.DatetimeTZDtype(tz="UTC"),
            pd.Series(
                pd.date_range(start="20200101", end="20200301"),
                dtype="datetime64[ns, utc]",
            ),
            None,
        ),
        (pd.Int64Dtype(), pd.Series(range(10), dtype="Int64"), None),
        (
            pd.StringDtype(),
            pd.Series(["foo", "bar", "baz"], dtype="string"),
            None,
        ),
        (
            pd.PeriodDtype(freq="D"),
            pd.Series(pd.period_range("1/1/2019", "1/1/2020", freq="D")),
            None,
        ),
        (
            pd.SparseDtype("float"),
            pd.Series(range(100))
            .where(lambda s: s < 5, other=np.nan)
            .astype("Sparse[float]"),
            # The sparse series contains NaN, so the schema is made nullable.
            {"nullable": True},
        ),
        (pd.BooleanDtype(), pd.Series([1, 0, 0, 1, 1], dtype="boolean"), None),
        (
            pd.IntervalDtype(subtype="int64"),
            pd.Series(pd.IntervalIndex.from_breaks([0, 1, 2, 3, 4])),
            None,
        ),
    ]
    for pandas_dtype, series, schema_kwargs in cases:
        extra = schema_kwargs if schema_kwargs is not None else {}
        schema = SeriesSchema(pandas_dtype=pandas_dtype, **extra)
        validated = schema.validate(series)
        assert isinstance(validated, pd.Series)
def read(
    self,
    path,
    columns=None,
    use_nullable_dtypes=False,
    storage_options: StorageOptions = None,
    **kwargs,
):
    """Read a parquet source through ``self.api`` and convert it to pandas.

    Args:
        path: Location handed to ``_get_path_or_handle`` to be opened for
            binary reading.
        columns: Forwarded to ``self.api.parquet.read_table``.
        use_nullable_dtypes: When true, integer, unsigned integer, boolean
            and string columns are converted with a ``types_mapper`` that
            selects the pandas nullable extension dtypes.
        storage_options: Forwarded to ``_get_path_or_handle``.
        **kwargs: Extra keyword arguments for ``read_table``. A
            ``filesystem`` entry is popped, passed to
            ``_get_path_or_handle`` and replaced by the value it returns;
            ``use_pandas_metadata`` is always forced to ``True``.

    Returns:
        The result of ``read_table(...).to_pandas(...)``.

    Raises:
        ValueError: If ``use_nullable_dtypes`` is requested but
            ``self.api.__version__`` is older than 0.16.
    """
    kwargs["use_pandas_metadata"] = True

    to_pandas_kwargs = {}
    if use_nullable_dtypes:
        if LooseVersion(self.api.__version__) < "0.16":
            # The closing parenthesis was missing from this message.
            raise ValueError(
                "'use_nullable_dtypes=True' is only supported for pyarrow >= 0.16 "
                f"({self.api.__version__} is installed)")

        import pandas as pd

        mapping = {
            self.api.int8(): pd.Int8Dtype(),
            self.api.int16(): pd.Int16Dtype(),
            self.api.int32(): pd.Int32Dtype(),
            self.api.int64(): pd.Int64Dtype(),
            self.api.uint8(): pd.UInt8Dtype(),
            self.api.uint16(): pd.UInt16Dtype(),
            self.api.uint32(): pd.UInt32Dtype(),
            self.api.uint64(): pd.UInt64Dtype(),
            self.api.bool_(): pd.BooleanDtype(),
            self.api.string(): pd.StringDtype(),
        }
        # ``mapping.get`` returns None for unmapped types.
        to_pandas_kwargs["types_mapper"] = mapping.get

    path_or_handle, handles, kwargs["filesystem"] = _get_path_or_handle(
        path,
        kwargs.pop("filesystem", None),
        storage_options=storage_options,
        mode="rb",
    )
    try:
        table = self.api.parquet.read_table(
            path_or_handle, columns=columns, **kwargs)
        return table.to_pandas(**to_pandas_kwargs)
    finally:
        # Close whatever ``_get_path_or_handle`` opened, even on failure.
        if handles is not None:
            handles.close()
def ownership(eia860_dfs, eia860_transformed_dfs):
    """
    Clean the EIA 860 ownership page and store it in the transformed dict.

    Args:
        eia860_dfs (dict): Raw DataFrames keyed by page name; only the
            ``'ownership'`` entry is read (a copy is transformed).
        eia860_transformed_dfs (dict): Dictionary of transformed DataFrames
            that receives the result.

    Returns:
        dict: ``eia860_transformed_dfs`` with the cleaned table stored under
        the ``'ownership_eia860'`` key.

    Raises:
        ValueError: If the earliest report year in the data precedes the
            earliest year listed in ``pc.working_years["eia860"]``.
    """
    o_df = eia860_dfs['ownership'].copy()
    o_df = pudl.helpers.fix_eia_na(o_df)
    o_df = pudl.helpers.convert_to_date(o_df)

    # The percentage fix below has only been checked for the working years,
    # so refuse older data until the cleaning has been revisited for it.
    earliest_reported = min(o_df.report_date.dt.year)
    earliest_supported = min(pc.working_years["eia860"])
    if earliest_reported < earliest_supported:
        raise ValueError(
            f"EIA 860 transform step is only known to work for "
            f"year {earliest_supported} and later, but found data "
            f"from year {earliest_reported}.")

    # Before 2012 ownership was reported as a percentage instead of a
    # proportion, so those rows are scaled down by 100.
    pre_2012 = o_df.report_date.dt.year < 2012
    o_df.loc[pre_2012, 'fraction_owned'] = \
        o_df.loc[pre_2012, 'fraction_owned'] / 100

    column_dtypes = {
        "owner_utility_id_eia": pd.Int64Dtype(),
        "utility_id_eia": pd.Int64Dtype(),
        "plant_id_eia": pd.Int64Dtype(),
        "owner_state": pd.StringDtype()
    }
    o_df = o_df.astype(column_dtypes)

    eia860_transformed_dfs['ownership_eia860'] = o_df
    return eia860_transformed_dfs
def load_score(loc):
    """
    Read the CLCCR score CSV into a DataFrame.

    Parameters
    ----------
    loc : str
        location of the CSV data dump

    Returns
    -------
    DataFrame with 'Email' read as pandas string dtype, 'CLCCR Score' as
    float and 'Lead DateTime' parsed as dates
    """
    scores = pd.read_csv(
        loc,
        parse_dates=['Lead DateTime'],
        dtype={'Email': pd.StringDtype(), 'CLCCR Score': float},
    )
    print('Loading CLCCR Scores Complete')
    return scores
def test_coerce_field_types_with_nans():
    """Compare ``coerce_field_types`` output for columns with missing values
    against a frame built with nullable integer and string dtypes."""
    raw_columns = {
        "QN": [pd.NA, np.nan, "1"],
        "RS_IND_01": [pd.NA, np.nan, "1"],
        "V_VV_I": [pd.NA, np.nan, "P"],
    }
    df = pd.DataFrame(raw_columns)

    nullable_int = pd.Int64Dtype()
    expected_df = pd.DataFrame({
        "QN": pd.Series([pd.NA, np.nan, 1], dtype=nullable_int),
        "RS_IND_01": pd.Series([pd.NA, np.nan, 1], dtype=nullable_int),
        "V_VV_I": pd.Series([pd.NA, np.nan, "P"], dtype=pd.StringDtype()),
    })

    df = coerce_field_types(df, DWDObservationResolution.HOURLY)

    assert_frame_equal(df, expected_df)
def enumerate_ask(X, e, result):
    """Build the normalized score distribution over the values of ``X``.

    For each distinct value found in ``PsychDrug[X]`` the evidence ``e`` is
    deep-copied and extended with that value. The variable list returned by
    ``topological_order(result, X)`` is then consumed from the front, and
    the ``enumerate_all`` results for each remaining suffix are multiplied
    together. The products are finally divided by their sum.

    Args:
        X: Name of the target column in ``PsychDrug``.
        e: Evidence mapping; the caller's object is never modified because
            a deep copy is taken before it is extended.
        result: Passed through unchanged to ``topological_order`` and
            ``enumerate_all`` (presumably the network/model description;
            verify against those helpers).

    Returns:
        pd.Series indexed by ``str(value)`` holding the normalized scores.
    """
    # Starts empty; entries are added per value of X below.
    distribution = pd.Series([], dtype=pd.StringDtype())

    for value in PsychDrug[X].unique():
        # Work on a private copy of the evidence extended with X = value.
        e = copy.deepcopy(e)
        e[X] = value

        remaining = topological_order(result, X)

        weight = 1
        while len(remaining) > 0:
            weight = enumerate_all(result, remaining, e) * weight
            remaining.pop(0)

        distribution[str(value)] = weight

    # Normalize so that the entries sum to one.
    return distribution / sum(distribution)
def test_colonne_col_tipo(self):
    """Build a small fruit DataFrame with one explicitly string-typed column
    and print the output of ``DataFrame.info()``."""
    colore = pd.Series(
        [
            'rosso', 'rosa', 'rosso', 'arancione', 'rosso', 'arancione',
            'arancione', 'nero'
        ],
        dtype=pd.StringDtype(),
    )
    columns = {
        # Plain list: pandas infers the dtype of this column itself.
        'frutti': [
            'Anguria', 'Pompelmo', 'Fragole', 'Nespole', 'Lamponi', 'Pesca',
            'Melone', 'More'
        ],
        'calorie': pd.Series([16, 26, 27, 28, 34, 27, 33, 36]),
        'colore': colore,
    }
    df = pd.DataFrame(data=columns)
    # info() writes its report itself and returns None, which is printed too.
    print(df.info())
def test_coerce_field_types():
    """Run the request's date, meta and parameter coercion steps on a
    hand-made one-row frame and compare against the expected dtypes."""
    # An hourly request is needed so that the hourly timestamp format,
    # which pandas cannot parse on its own, is handled by the coercion.
    request = DwdObservationRequest(
        parameter=DwdObservationDataset.SOLAR,  # RS_IND_01,
        resolution=DwdObservationResolution.HOURLY,
        period=DwdObservationPeriod.RECENT,
        humanize=False,
        tidy=False,
    ).all()

    # Querying real data takes too long, so a predefined frame is coerced.
    raw = {
        "station_id": ["00001"],
        "date": ["1970010100"],
        "qn": ["1"],
        "rs_ind_01": [1],
        "end_of_interval": ["1970010100:00"],
        "v_vv_i": ["p"],
    }
    df = pd.DataFrame(raw)

    values = request.values
    df = values._coerce_date_fields(df)
    df = values._coerce_meta_fields(df)
    df = values._coerce_parameter_types(df)

    epoch = pd.Timestamp("1970-01-01")
    expected_df = pd.DataFrame(
        {
            "station_id": pd.Categorical(["00001"]),
            "date": [epoch.tz_localize("utc")],
            "qn": pd.Series([1], dtype=pd.Int64Dtype()),
            "rs_ind_01": pd.Series([1], dtype=pd.Int64Dtype()),
            "end_of_interval": [epoch],
            "v_vv_i": pd.Series(["p"], dtype=pd.StringDtype()),
        }
    )

    assert_frame_equal(df, expected_df)
def test_build_series_from_nullable_pandas_dtype(pd_dtype, expect_dtype):
    """Convert a nullable pandas Series to cudf and check dtype and mask.

    Sample data with two ``pd.NA`` entries is chosen to suit ``pd_dtype``
    (strings, booleans, or integers otherwise). The cudf series must report
    ``expect_dtype`` and its validity mask must be false exactly where the
    pandas data holds ``pd.NA``.
    """
    if pd_dtype == pd.StringDtype():
        values = ["a", pd.NA, "c", pd.NA, "e"]
    elif pd_dtype == pd.BooleanDtype():
        values = [True, pd.NA, False, pd.NA, True]
    else:
        values = [1, pd.NA, 3, pd.NA, 5]

    pd_data = pd.Series(values, dtype=pd_dtype)
    gd_data = cudf.Series.from_pandas(pd_data)

    assert gd_data.dtype == expect_dtype

    # check mask
    expect_mask = [item is not pd.NA for item in pd_data]
    mask_bools = mask_to_bools(gd_data._column.base_mask, 0, len(gd_data))
    got_mask = mask_bools.to_array()

    np.testing.assert_array_equal(expect_mask, got_mask)
def test_serialize_sliced_string():
    """Serializing a full-length slice of a string series must give the same
    first three frames as the series itself, and must round-trip."""
    # https://github.com/rapidsai/cudf/issues/7735
    data = ["hi", "hello", None]
    pd_series = pd.Series(data, dtype=pd.StringDtype())
    gd_series = cudf.Series(data, dtype="str")
    sliced = gd_series[0:3]

    frames_full = gd_series.serialize()[1]
    frames_sliced = sliced.serialize()[1]

    # The slice covers every row, so the frames should be identical.
    for idx in (0, 1, 2):
        assert_eq(
            frames_full[idx].to_host_array(),
            frames_sliced[idx].to_host_array(),
        )

    header_and_frames = sliced.serialize()
    recreated = cudf.Series.deserialize(*header_and_frames)
    assert_eq(recreated.to_pandas(nullable=True), pd_series)
def test_build_df_from_nullable_pandas_dtype(pd_dtype, expect_dtype):
    """Convert a nullable pandas DataFrame to cudf and check dtype and mask.

    A single column ``"a"`` with two ``pd.NA`` entries is built to suit
    ``pd_dtype`` (strings, booleans, or integers otherwise). The cudf column
    must report ``expect_dtype`` and its validity mask must be false exactly
    where the pandas column holds ``pd.NA``.
    """
    if pd_dtype == pd.StringDtype():
        values = ["a", pd.NA, "c", pd.NA, "e"]
    elif pd_dtype == pd.BooleanDtype():
        values = [True, pd.NA, False, pd.NA, True]
    else:
        values = [1, pd.NA, 3, pd.NA, 5]

    pd_data = pd.DataFrame.from_dict({"a": values}, dtype=pd_dtype)
    gd_data = cudf.DataFrame.from_pandas(pd_data)

    assert gd_data["a"].dtype == expect_dtype

    # check mask
    expect_mask = [item is not pd.NA for item in pd_data["a"]]
    mask_bools = mask_to_bools(
        gd_data["a"]._column.base_mask, 0, len(gd_data)
    )
    got_mask = mask_bools.values_host

    np.testing.assert_array_equal(expect_mask, got_mask)
def covid_data(granularity):
    """Read a coronavirus data file.

    Note: the result depends on the contents of the file on disk, so
    repeated calls may return different data.

    Args:
        granularity (<enum "Granularity">): Selects the data file; it is
            resolved to a path by ``_covid_filepath``.

    Returns:
        pd.DataFrame: The file contents with the "date" column parsed as
        dates and the "fips" column read as pandas string dtype.
    """
    filepath = _covid_filepath(granularity)
    return pd.read_csv(
        filepath,
        parse_dates=["date"],
        dtype={"fips": pd.StringDtype()},
    )
def electricity_planning_areas(pudl_settings):
    """Electric Planning Area geometries from HIFLD.

    Reads ``local/hifld/electric_planning_areas.gdb`` below
    ``pudl_settings["data_dir"]``, parses the date and numeric columns,
    casts the attribute columns to explicit dtypes and remaps four ID
    values.

    Args:
        pudl_settings (dict): Settings mapping; only ``"data_dir"`` is read.

    Returns:
        The frame produced by ``geopandas.read_file`` after the conversions.
    """
    gdb_path = pathlib.Path(
        pudl_settings["data_dir"], "local/hifld/electric_planning_areas.gdb"
    )

    def caster(column, target):
        # Bind column/target now so each callable casts its own column.
        return lambda frame: getattr(frame, column).astype(target)

    raw = geopandas.read_file(gdb_path)

    parsed = raw.assign(
        SOURCEDATE=lambda frame: pd.to_datetime(frame.SOURCEDATE),
        VAL_DATE=lambda frame: pd.to_datetime(frame.VAL_DATE),
        ID=lambda frame: pd.to_numeric(frame.ID),
        NAICS_CODE=lambda frame: pd.to_numeric(frame.NAICS_CODE),
        YEAR=lambda frame: pd.to_numeric(frame.YEAR),
    )

    # Hack to work around geopanda issue fixed as of v0.8.0
    # https://github.com/geopandas/geopandas/issues/1366
    target_dtypes = {
        "ID": pd.Int64Dtype(),
        "NAME": pd.StringDtype(),
        "COUNTRY": pd.StringDtype(),
        "NAICS_CODE": pd.Int64Dtype(),
        "NAICS_DESC": pd.StringDtype(),
        "SOURCE": pd.StringDtype(),
        "VAL_METHOD": pd.StringDtype(),
        "WEBSITE": pd.StringDtype(),
        "ABBRV": pd.StringDtype(),
        "YEAR": pd.Int64Dtype(),
        "PEAK_LOAD": float,
        "PEAK_RANGE": float,
        "SHAPE_Length": float,
        "SHAPE_Area": float,
    }
    gdf = parsed.assign(
        **{column: caster(column, target)
           for column, target in target_dtypes.items()}
    )

    # HIFLD geometries use EIA Balancing Authority IDs while FERC 714 may be
    # using EIA Utility IDs; which set is used where is not fully resolved,
    # so these IDs are remapped by hand.
    id_overrides = (
        (2775, 229),  # CAISO
        (59504, 17690),  # Southwest Power Pool
        (14379, 14354),  # PacifiCorp East + West
        (13670, 39347),  # Northeast TX Electric Co-op
    )
    for hifld_id, replacement_id in id_overrides:
        gdf.loc[gdf.ID == hifld_id, "ID"] = replacement_id

    return gdf
def sql_to_python_type(sql_type: str) -> type:
    """Translate an SQL type name into the dtype used for dataframe columns.

    ``CHAR(..)``/``VARCHAR(..)``/``VARCHAR`` map to the pandas string dtype,
    ``INTERVAL..`` to ``timedelta64[ns]``, ``TIMESTAMP(..)``/``TIME(..)``/
    ``DATE`` to ``datetime64[ns]`` and ``DECIMAL(..)`` to ``np.float64``.
    Anything else is looked up in ``_SQL_TO_PYTHON_FRAMES``.

    Raises:
        NotImplementedError: If the type is not covered by the rules above
            and is missing from ``_SQL_TO_PYTHON_FRAMES``.
    """
    if sql_type == "VARCHAR" or sql_type.startswith(("CHAR(", "VARCHAR(")):
        return pd.StringDtype()

    if sql_type.startswith("INTERVAL"):
        return np.dtype("<m8[ns]")

    if sql_type == "DATE" or sql_type.startswith(("TIMESTAMP(", "TIME(")):
        return np.dtype("<M8[ns]")

    if sql_type.startswith("DECIMAL("):
        # np.float64 is always used, even where a smaller type might do.
        return np.float64

    try:
        return _SQL_TO_PYTHON_FRAMES[sql_type]
    except KeyError:  # pragma: no cover
        raise NotImplementedError(
            f"The SQL type {sql_type} is not implemented (yet)")
def write_results(n_tiles=4, custom_name=None):
    """Score every prediction in ``res_model_folder`` and write a CSV report.

    For each entry of ``res_model_folder`` the ground-truth mask
    ``{mask_folder}/{name}.png`` and the cleaned prediction are compared:
    the four values returned by ``missed_wrong_cysts`` are stored, a recall
    is derived from the detected/missed counts, and ``jaccard_score``,
    ``precision_score`` and ``recall_score`` are added. One row per
    prediction is written to ``results/<folder>.csv``.

    Args:
        n_tiles (int): Used in the progress message and in the default
            report name ``report_<n_tiles>``.
        custom_name (str, optional): Replaces the default report name when
            truthy.

    Returns:
        None
    """
    print(f"Writing for {n_tiles}x{n_tiles}")
    folder = custom_name if custom_name else f"report_{n_tiles}"
    datafile = f"results/{folder}.csv"

    rows = []
    for pred in os.listdir(res_model_folder):
        name = os.path.splitext(pred)[0]
        print('name', name)

        # One labelled Series per prediction; metrics are added by key.
        stats = pd.Series([], dtype=pd.StringDtype(), name=name)

        gt = open_mask(f"{mask_folder}/{name}.png")
        pred_img = clean_mask(open_mask(os.path.join(res_model_folder, pred)))

        counts = missed_wrong_cysts(gt, pred_img)
        (stats["# cysts"], stats["# detected"], stats["# missed"],
         stats["# wrong"]) = counts

        # The small constant avoids dividing by zero when nothing was found.
        found_or_missed = stats["# detected"] + stats["# missed"] + 0.0001
        stats["# recall"] = stats["# detected"] / found_or_missed

        stats["iou"] = jaccard_score(gt, pred_img, average=avg)
        stats["precision"] = precision_score(gt, pred_img, average=avg,
                                             zero_division=1)
        stats["recall"] = recall_score(gt, pred_img, average=avg,
                                       zero_division=1)
        rows.append(stats)

    print(rows)
    pd.DataFrame(rows).to_csv(datafile)
    return
def coerce_field_types(df: pd.DataFrame,
                       time_resolution: TimeResolution) -> pd.DataFrame:
    """
    Cast every column of the frame to the dtype expected for its name.

    The station id column becomes ``int``; regular date columns are parsed
    with the format registered for ``time_resolution``; irregular date
    columns use ``DatetimeFormat.YMDH_COLUMN_M``; quality and integer
    columns become nullable ``Int64`` (unparseable entries turn into
    missing values); string columns become pandas string dtype; every
    other column is cast to ``float``.

    Args:
        df: the station_data gathered in a pandas.DataFrame; its columns are
            replaced in place
        time_resolution: time resolution of the data as enumeration

    Return:
        the same DataFrame object with converted dtypes
    """
    def convert(name, values):
        # Station ids are expected to be free of NaNs, so a plain int works.
        if name == DWDMetaColumns.STATION_ID.value:
            return values.astype(int)
        if name in DATE_FIELDS_REGULAR:
            return pd.to_datetime(
                values,
                format=TIME_RESOLUTION_TO_DATETIME_FORMAT_MAPPING[
                    time_resolution],
            )
        if name in DATE_FIELDS_IRREGULAR:
            return pd.to_datetime(
                values, format=DatetimeFormat.YMDH_COLUMN_M.value)
        if name in QUALITY_FIELDS or name in INTEGER_FIELDS:
            numeric = pd.to_numeric(values, errors="coerce")
            return numeric.astype(pd.Int64Dtype())
        if name in STRING_FIELDS:
            return values.astype(pd.StringDtype())
        return values.astype(float)

    for column in df.columns:
        df[column] = convert(column, df[column])

    return df
def sql_to_python_type(sql_type: str) -> type:
    """Translate an SQL type name into the dtype used for dataframe columns.

    ``CHAR(..)``/``VARCHAR(..)`` map to the pandas string dtype,
    ``INTERVAL..`` to ``timedelta64[ns]``, ``TIMESTAMP(..)``/``TIME(..)`` to
    ``datetime64[ns]``, ``TIMESTAMP_WITH_LOCAL_TIME_ZONE(..)`` to a
    UTC-aware datetime dtype and ``DECIMAL(..)`` to ``np.float64``.
    Anything else is looked up in ``_SQL_TO_PYTHON_FRAMES``.

    Raises:
        NotImplementedError: If the type is not covered by the rules above
            and is missing from ``_SQL_TO_PYTHON_FRAMES``.
    """
    if sql_type.startswith(("CHAR(", "VARCHAR(")):
        return pd.StringDtype()

    if sql_type.startswith("INTERVAL"):
        return np.dtype("<m8[ns]")

    if sql_type.startswith(("TIMESTAMP(", "TIME(")):
        return np.dtype("<M8[ns]")

    if sql_type.startswith("TIMESTAMP_WITH_LOCAL_TIME_ZONE("):
        # Everything is converted to UTC; so far this has not caused issues.
        return pd.DatetimeTZDtype(unit="ns", tz="UTC")

    if sql_type.startswith("DECIMAL("):
        # np.float64 is always used, even where a smaller type might do.
        return np.float64

    try:
        return _SQL_TO_PYTHON_FRAMES[sql_type]
    except KeyError:  # pragma: no cover
        raise NotImplementedError(
            f"The SQL type {sql_type} is not implemented (yet)")
def load_payments(loc, start_date, end_date):
    """
    Load the payments of one period from plm_payments.csv

    Parameters
    ----------
    loc : str
        directory of the data dump containing plm_payments.csv
    start_date :
        lower bound on 'Date' (exclusive)
    end_date :
        upper bound on 'Date' (inclusive)

    Returns
    -------
    DataFrame with the selected columns for payments inside the period,
    without rows whose 'Store' is 'Test Store', with an added 'Date Only'
    column and sorted by 'Loan ID' then 'Date'
    """
    date_columns = ['Date', 'New Due Date', 'Cleared Date']
    column_dtypes = {
        'Loan ID': 'Int64',
        'Store': pd.StringDtype(),
        'Type': pd.StringDtype(),
        'Amount': float,
        'Reversed': bool,
        'ACH Return Code': pd.StringDtype(),
        'Days Late': 'Int64',
        'Processed By Admin': pd.StringDtype(),
        'Reversed By Admin': pd.StringDtype(),
        'Comments': pd.StringDtype(),
        'Origin': pd.StringDtype()
    }
    payments = pd.read_csv(
        loc + '/plm_payments.csv',
        dtype=column_dtypes,
        usecols=[*column_dtypes, *date_columns],
        parse_dates=date_columns,
    )

    # Keep only the payments dated inside (start_date, end_date].
    after_start = payments['Date'] > start_date
    up_to_end = payments['Date'] <= end_date
    payments = payments[after_start & up_to_end]

    # Remove the rows that belong to the test store, by index label.
    test_store_rows = payments[payments['Store'] == 'Test Store'].index
    payments = payments.drop(test_store_rows)

    payments['Date Only'] = payments['Date'].dt.date
    print('Loading Payments CSV Complete')
    return payments.sort_values(['Loan ID', 'Date'])
def get_dummy_deltas(employees_path):
    """Build three overlapping daily snapshots from an employees CSV.

    Only the first ten rows of the file are read. A ``record_deleted`` flag
    (all False) is added, id columns are cast to nullable ``Int64`` and name
    columns to pandas string dtype. Subsets of employees are then selected
    for each day and some department/manager values are altered.

    Args:
        employees_path: Path of the CSV file. It must provide the columns
            employee_id, department_id, manager_id, sex, forename, surname.

    Returns:
        dict: DataFrames under the keys ``"day1"``, ``"day2"``, ``"day3"``.
    """
    employees = pd.read_csv(employees_path, nrows=10)

    # Deleted flag starts out False for everybody.
    employees["record_deleted"] = False

    column_dtypes = {"record_deleted": pd.BooleanDtype()}
    column_dtypes.update(
        dict.fromkeys(["employee_id", "department_id", "manager_id"],
                      pd.Int64Dtype()))
    column_dtypes.update(
        dict.fromkeys(["sex", "forename", "surname"], pd.StringDtype()))
    employees = employees.astype(column_dtypes)

    def snapshot(employee_ids):
        # Rows of the chosen employees, renumbered from zero.
        chosen = employees.employee_id.isin(employee_ids)
        return employees[chosen].reset_index(drop=True)

    day1 = snapshot([1, 2, 3, 4, 5])

    # Day 2: the first row changes department and manager.
    day2 = snapshot([5, 6, 7])
    day2.loc[0, "department_id"] = 2
    day2.loc[0, "manager_id"] = 18

    # Day 3: everybody is moved, then the first row is reset and flagged as
    # deleted.
    day3 = snapshot([1, 7, 9, 10, 11])
    day3["department_id"] = 2
    day3["manager_id"] = 5
    day3.loc[0, "record_deleted"] = True
    day3.loc[0, "department_id"] = 1
    day3.loc[0, "manager_id"] = 17

    return {"day1": day1, "day2": day2, "day3": day3}
def test_concatenazione(self):
    """Print the results of concat, merge and join on two small frames.

    A customers frame (``cliente``, ``indirizzo``, ``eta``) and a reasons
    frame (``nome``, ``data``, ``usuario``) are concatenated column-wise and
    row-wise, merged on ``cliente == usuario`` and joined on their index.
    Nothing is asserted; the results are only printed.
    """
    import datetime

    clienti = pd.Series(['paco', 'vittoria', 'francesco'],
                        dtype=pd.StringDtype())
    indirizzi = [np.nan, 'via 1', 'piaza 2']
    eta = [16, 56, 98]
    clienti_df = pd.DataFrame({
        'cliente': clienti,
        'indirizzo': indirizzi,
        'eta': eta
    })

    nome = ['uno', 'altro', 'non so']
    data = [
        datetime.datetime.now(),
        datetime.datetime.now(),
        datetime.datetime(year=2020, month=3, day=23)
    ]
    motivi_df = pd.DataFrame({
        'nome': nome,
        'data': data,
        'usuario': clienti
    })

    df_col = pd.concat([motivi_df, clienti_df], axis=1)
    print("Axis1: \n", df_col)

    df_row = pd.concat([motivi_df, clienti_df], axis=0)
    print("Axis0: \n", df_row)

    # Only the key columns are given: combining ``left_on`` with
    # ``left_index=True`` is rejected by pandas with a MergeError.
    print(
        "merge per i clienti/usuari:\n",
        pd.merge(clienti_df,
                 motivi_df,
                 left_on='cliente',
                 right_on='usuario'))

    print("join:\n", clienti_df.join(motivi_df))
def prepareSchema(df_covid, df_popul, country_selected):
    """
    Rename and drop columns, fix dtypes and aggregate the covid data.

    Both input frames are modified in place by the rename/drop steps
    (``df_covid`` additionally loses rows with missing counts).

    Args:
        df_covid (pd.DataFrame): Covid counts with the columns
            "Country/Region", "Date", "Confirmed", "Recovered", "Deaths" and
            optionally "Province/State", "Lat", "Long".
        df_popul (pd.DataFrame): Population table with the columns
            "Country Name", "Year", "Value" and optionally "Country Code".
        country_selected (list): Countries to keep; when empty every
            country present in ``df_covid`` is kept.

    Returns:
        tuple: ``(df_covid, df_popul)``. ``df_covid`` is a new frame with
        the counts summed per ("Date", "Country"); ``df_popul`` is the input
        object with renamed columns, a datetime "Date" and integer "Value".
    """
    # Rename columns
    df_covid.rename(columns={"Country/Region": "Country"}, inplace=True)
    df_covid.rename(columns={"Province/State": "State"}, inplace=True)
    df_popul.rename(columns={"Country Name": "Country"}, inplace=True)
    df_popul.rename(columns={"Year": "Date"}, inplace=True)

    # Drop columns (missing ones are ignored)
    df_covid.drop(columns=['State', "Lat", "Long"],
                  inplace=True,
                  errors="ignore")
    df_popul.drop(columns=['Country Code'], inplace=True, errors="ignore")

    # Drop rows where any of the counts is missing
    df_covid.dropna(subset=['Confirmed', "Recovered", "Deaths"], inplace=True)

    # Drop all countries not selected; an empty selection keeps everything
    if len(country_selected) == 0:
        country_selected = list(df_covid["Country"].unique())
    df_covid = df_covid[df_covid["Country"].isin(country_selected)].copy()

    # Convert columns to proper datatypes. The builtin ``int`` replaces the
    # ``np.int`` alias, which was removed from NumPy and was only ever an
    # alias for the builtin.
    df_covid["Date"] = df_covid["Date"].apply(pd.to_datetime)
    df_covid["Country"] = df_covid["Country"].astype(pd.StringDtype())
    for count_column in ("Confirmed", "Recovered", "Deaths"):
        df_covid[count_column] = df_covid[count_column].astype(int)

    df_popul['Date'] = pd.to_datetime(df_popul['Date'], format="%Y")
    df_popul["Value"] = df_popul["Value"].astype(int)

    # Sum up the regions of a country for each day
    df_covid = df_covid.groupby(["Date", "Country"], as_index=False).sum()

    return df_covid, df_popul
def test_coerce_field_types():
    """Compare ``coerce_field_types`` output for a one-row frame of strings
    against a frame with nullable integers, timestamps and a string column."""
    raw_columns = {
        "QN": ["1"],
        "RS_IND_01": ["1"],
        "DATE": ["1970010100"],
        "END_OF_INTERVAL": ["1970010100:00"],
        "V_VV_I": ["P"],
    }
    df = pd.DataFrame(raw_columns)

    epoch = pd.Timestamp("1970-01-01")
    expected_df = pd.DataFrame({
        "QN": pd.Series([1], dtype=pd.Int64Dtype()),
        "RS_IND_01": pd.Series([1], dtype=pd.Int64Dtype()),
        "DATE": [epoch],
        "END_OF_INTERVAL": [epoch],
        "V_VV_I": pd.Series(["P"], dtype=pd.StringDtype()),
    })

    df = coerce_field_types(df, DWDObservationResolution.HOURLY)

    assert_frame_equal(df, expected_df)
def pd_to_json(
    df: pd.DataFrame,
    output_file: Union[IO, str],
    orient: str = "records",
    lines: bool = True,
    **kwargs,
):
    """Export a dataframe to a json newlines file this package can open identically.

    Period columns are written as datetime strings with precision to the
    second. Datetime columns, and columns whose first non-null value is a
    ``datetime.datetime``/``datetime.date``, are written as strings with
    missing values left as nulls. The input frame is not modified.

    Args:
        df (pd.DataFrame): a pandas dataframe
        output_file (IO or str): the path or handle you want to export to
        orient (str): standard pandas .to_json orient argument, defaulting to 'records'
        lines (bool): standard pandas .to_json lines argument, defaulting to True
        **kwargs: any other keyword arguments to pass to pandas .to_json
    """
    new = df.copy()

    # Convert date-related columns to strings Arrow can read consistently
    for col in new.columns:
        column = new[col]

        if isinstance(column.dtype, pd.PeriodDtype):
            new[col] = column.dt.strftime("%Y-%m-%d %H:%M:%S")
            continue

        if pd.api.types.is_datetime64_any_dtype(column):
            is_date_like = True
        else:
            # Inspect the first non-null value only if there is one; an
            # all-null (or empty) column is not a date column.
            present = column.dropna()
            is_date_like = (not present.empty) and isinstance(
                present.iloc[0], (datetime.datetime, datetime.date))

        if is_date_like:
            as_text = column.astype(pd.StringDtype())
            # Turn a literal 'NaT' string into a missing value so PyArrow can
            # read it. Assign the result back: an in-place replace on
            # ``new[col]`` may act on a temporary and leave ``new`` unchanged.
            new[col] = as_text.replace("NaT", np.nan, regex=False)

    new.to_json(output_file, orient=orient, lines=lines, **kwargs)
def test_df_pdv1_types():
    """Return a three-row DataFrame with one column per pandas 1.x nullable
    dtype (Int8/16/32/64, string, boolean)."""
    # (column name, values, target extension dtype)
    column_specs = [
        ('int8col', [1, 2, 3], pd.Int8Dtype()),
        ('int16col', [1, 2, 3], pd.Int16Dtype()),
        ('int32col', [1, 2, 3], pd.Int32Dtype()),
        ('int64col', [1, 2, 3], pd.Int64Dtype()),
        ('stringcol', ['one', 'two', 'three'], pd.StringDtype()),
        ('boolcol', [True, False, True], pd.BooleanDtype()),
    ]

    # Build with inferred dtypes first, then cast every column explicitly.
    values_by_column = {name: vals for name, vals, _ in column_specs}
    dtype_by_column = {name: pd_type for name, _, pd_type in column_specs}

    pdv1_df = pd.DataFrame(values_by_column)
    return pdv1_df.astype(dtype_by_column)
def zero_pad_zips(zip_series, n_digits):
    """
    Left-pad zipcodes with zeros to a fixed width.

    Values are converted to strings, a trailing decimal part (a dot and the
    digits after it) is stripped, and the result is padded with leading
    zeros. Missing values, the string 'nan' and codes that end up as all
    zeros are returned as NA.

    Args:
        zip_series (pd.Series) : series containing the zipcode values.
        n_digits(int) : zipcode length (likely 4 or 5 digits).

    Returns:
        pandas.Series: a series containing zipcodes with their prefix zeros
        intact and invalid zipcodes rendered as na.

    """
    all_zeros = n_digits * "0"

    as_text = zip_series.astype(pd.StringDtype())
    # Missing entries (including the literal 'nan') become "0" so that they
    # pad out to the all-zero code that is discarded at the end.
    filled = as_text.replace('nan', np.nan).fillna("0")
    without_decimals = filled.str.replace(r'[\.]+\d*', '', regex=True)
    padded = without_decimals.str.zfill(n_digits)

    # All-zero Zip codes aren't valid.
    return padded.replace({all_zeros: pd.NA})
def test_coerce_field_types_with_nans():
    """Compare ``DWDObservationData._coerce_parameter_types`` output for
    columns with missing values against a frame of nullable dtypes.

    The class constructor is patched with a stub that only sets
    ``tidy_data`` so that an instance can be created without arguments.
    """
    raw_columns = {
        "QN": [pd.NA, np.nan, "1"],
        "RS_IND_01": [pd.NA, np.nan, "1"],
        "V_VV_I": [pd.NA, np.nan, "P"],
    }
    df = pd.DataFrame(raw_columns)

    nullable_int = pd.Int64Dtype()
    expected_df = pd.DataFrame({
        "QN": pd.Series([pd.NA, np.nan, 1], dtype=nullable_int),
        "RS_IND_01": pd.Series([pd.NA, np.nan, 1], dtype=nullable_int),
        "V_VV_I": pd.Series([pd.NA, np.nan, "P"], dtype=pd.StringDtype()),
    })

    def stub_init(self):
        # Minimal replacement for the real constructor.
        self.tidy_data = False

    with mock.patch.object(DWDObservationData, "__init__", new=stub_init):
        observations = DWDObservationData()
        df = observations._coerce_parameter_types(df)

    assert_frame_equal(df, expected_df)
def load_payments(loc):
    """
    Load the payments table from plm_payments.csv

    Parameters
    ----------
    loc : str
        directory of the data dump containing plm_payments.csv

    Returns
    -------
    DataFrame of payments csv with selected columns, sorted by 'Loan ID'
    """
    date_columns = ['Date', 'New Due Date', 'Cleared Date']
    fee_columns = ['Other Fee Paid', 'Late Fee Paid', 'Nsf Fee Paid',
                   'CSO/Setup Fee Paid', 'Back Interest Paid',
                   'Interest Paid', 'Principal Paid']

    column_dtypes = {
        'Loan ID': 'Int64',
        'Type': pd.StringDtype(),
        'Amount': float,
        'Reversed': bool,
    }
    # Every paid-amount column is read as float.
    column_dtypes.update(dict.fromkeys(fee_columns, float))
    column_dtypes['Days Late'] = 'Int64'
    column_dtypes['Comments'] = pd.StringDtype()

    payments = pd.read_csv(
        loc + '/plm_payments.csv',
        dtype=column_dtypes,
        usecols=[*column_dtypes, *date_columns],
        parse_dates=date_columns,
    )
    print('Loading Payments CSV Complete')
    return payments.sort_values('Loan ID')
def test_to_table_nullable(self):
    """``to_table`` must map pandas nullable columns to the matching table
    column types and render every missing value as ``null``.

    Nine three-row columns are converted, each containing one missing value
    (the object column contains two, but ``9`` nulls are expected overall;
    verify against ``to_string`` output if that count changes).
    """
    nullable_columns = {
        "NullableBoolean": pd.array([True, False, None], dtype=pd.BooleanDtype()),
        "NullableInt8": pd.array([1, 2, None], dtype=pd.Int8Dtype()),
        "NullableInt16": pd.array([1, 2, None], dtype=pd.Int16Dtype()),
        "NullableInt32": pd.array([1, 2, None], dtype=pd.Int32Dtype()),
        "NullableInt64": pd.array([1, 2, None], dtype=pd.Int64Dtype()),
        "NullableFloat": pd.array([1.1, 2.2, None], dtype=pd.Float32Dtype()),
        "NullableDouble": pd.array([1.1, 2.2, None], dtype=pd.Float64Dtype()),
        "NullableString": pd.array(["s11", "s22", None], dtype=pd.StringDtype()),
        "NullableObject": pd.array([pd.NA, "s22", None], dtype=object),
    }
    df = pd.DataFrame(nullable_columns)

    table = to_table(df)

    # Expected column types, in the same order as the columns above.
    expected_types = (
        dtypes.bool_,
        dtypes.int8,
        dtypes.int16,
        dtypes.int32,
        dtypes.int64,
        dtypes.float32,
        dtypes.double,
        dtypes.string,
        dtypes.PyObject,
    )
    for position, expected in enumerate(expected_types):
        self.assertIs(table.columns[position].data_type, expected)

    self.assertEqual(table.size, 3)

    table_string = table.to_string()
    self.assertEqual(9, table_string.count("null"))
def getDataset():
    """Load 'default-cards.csv' from the working directory with typed columns.

    The file is read with ';' as separator and a header row. Text columns
    become pandas string dtype, ``cmc``/``power``/``toughness`` nullable
    integers, prices ``float64`` and ``rarity`` an ordered categorical
    (common < uncommon < rare < mythic).

    Returns:
        pd.DataFrame: the typed dataset.
    """
    # Load the file
    dataset = pd.read_csv('default-cards.csv', header=0, sep=';')

    rarity_dtype = pd.CategoricalDtype(
        categories=['common', 'uncommon', 'rare', 'mythic'], ordered=True)

    # Give every column its proper type
    dataset = dataset.astype({
        'name': pd.StringDtype(),
        'color': pd.StringDtype(),
        'cmc': pd.Int64Dtype(),
        'set': pd.StringDtype(),
        'rarity': rarity_dtype,
        'text': pd.StringDtype(),
        'price_usd': 'float64',
        'price_eur': 'float64',
        'type': pd.StringDtype(),
        'subtype': pd.StringDtype(),
    })

    # These two go through float first before becoming nullable integers.
    for column in ('power', 'toughness'):
        as_float = dataset[column].astype('float')
        dataset[column] = as_float.astype(pd.Int64Dtype())

    return dataset