def pyarrow2pandas_extension(  # pylint: disable=too-many-branches,too-many-return-statements
    dtype: pa.DataType
) -> Optional[pd.api.extensions.ExtensionDtype]:
    """Pyarrow to Pandas data types conversion."""
    if pa.types.is_int8(dtype):  # pragma: no cover
        return pd.Int8Dtype()
    if pa.types.is_int16(dtype):  # pragma: no cover
        return pd.Int16Dtype()
    if pa.types.is_int32(dtype):
        return pd.Int32Dtype()
    if pa.types.is_int64(dtype):
        return pd.Int64Dtype()
    if pa.types.is_boolean(dtype):
        return pd.BooleanDtype()
    if pa.types.is_string(dtype):
        return pd.StringDtype()
    return None
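A minimal usage sketch (assuming pyarrow and pandas are imported as pa and pd, and the helper above is in scope): walk an Arrow schema and collect the matching nullable pandas dtypes.

import pyarrow as pa

schema = pa.schema([("id", pa.int64()), ("name", pa.string()), ("active", pa.bool_())])
extension_dtypes = {field.name: pyarrow2pandas_extension(field.type) for field in schema}
# -> {'id': Int64Dtype(), 'name': string dtype, 'active': BooleanDtype()}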
Example #2
def test_pandas_extension_types():
    """Test pandas extension data type happy path."""
    # pylint: disable=no-member
    test_params = [
        (
            pd.CategoricalDtype(),
            pd.Series(["a", "a", "b", "b", "c", "c"], dtype="category"),
            None,
        ),
        (
            pd.DatetimeTZDtype(tz="UTC"),
            pd.Series(
                pd.date_range(start="20200101", end="20200301"),
                dtype="datetime64[ns, utc]",
            ),
            None,
        ),
        (pd.Int64Dtype(), pd.Series(range(10), dtype="Int64"), None),
        (
            pd.StringDtype(),
            pd.Series(["foo", "bar", "baz"], dtype="string"),
            None,
        ),
        (
            pd.PeriodDtype(freq="D"),
            pd.Series(pd.period_range("1/1/2019", "1/1/2020", freq="D")),
            None,
        ),
        (
            pd.SparseDtype("float"),
            pd.Series(range(100))
            .where(lambda s: s < 5, other=np.nan)
            .astype("Sparse[float]"),
            {"nullable": True},
        ),
        (pd.BooleanDtype(), pd.Series([1, 0, 0, 1, 1], dtype="boolean"), None),
        (
            pd.IntervalDtype(subtype="int64"),
            pd.Series(pd.IntervalIndex.from_breaks([0, 1, 2, 3, 4])),
            None,
        ),
    ]
    for dtype, data, series_kwargs in test_params:
        series_kwargs = {} if series_kwargs is None else series_kwargs
        series_schema = SeriesSchema(pandas_dtype=dtype, **series_kwargs)
        assert isinstance(series_schema.validate(data), pd.Series)
Example #3
    def read(
        self,
        path,
        columns=None,
        use_nullable_dtypes=False,
        storage_options: StorageOptions = None,
        **kwargs,
    ):
        kwargs["use_pandas_metadata"] = True

        to_pandas_kwargs = {}
        if use_nullable_dtypes:
            if LooseVersion(self.api.__version__) >= "0.16":
                import pandas as pd

                mapping = {
                    self.api.int8(): pd.Int8Dtype(),
                    self.api.int16(): pd.Int16Dtype(),
                    self.api.int32(): pd.Int32Dtype(),
                    self.api.int64(): pd.Int64Dtype(),
                    self.api.uint8(): pd.UInt8Dtype(),
                    self.api.uint16(): pd.UInt16Dtype(),
                    self.api.uint32(): pd.UInt32Dtype(),
                    self.api.uint64(): pd.UInt64Dtype(),
                    self.api.bool_(): pd.BooleanDtype(),
                    self.api.string(): pd.StringDtype(),
                }
                to_pandas_kwargs["types_mapper"] = mapping.get
            else:
                raise ValueError(
                    "'use_nullable_dtypes=True' is only supported for pyarrow >= 0.16 "
                    f"({self.api.__version__} is installed)"
                )

        path_or_handle, handles, kwargs["filesystem"] = _get_path_or_handle(
            path,
            kwargs.pop("filesystem", None),
            storage_options=storage_options,
            mode="rb",
        )
        try:
            return self.api.parquet.read_table(
                path_or_handle, columns=columns,
                **kwargs).to_pandas(**to_pandas_kwargs)
        finally:
            if handles is not None:
                handles.close()
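For context, a standalone sketch of what the types_mapper hook passed to pyarrow's Table.to_pandas does (the one-entry mapping here is illustrative, not the full mapping built above):

import pandas as pd
import pyarrow as pa

table = pa.table({"a": [1, None, 3]})
df = table.to_pandas(types_mapper={pa.int64(): pd.Int64Dtype()}.get)
# df["a"] is nullable Int64, so the missing value stays <NA> instead of becoming a float NaN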
Example #4
def ownership(eia860_dfs, eia860_transformed_dfs):
    """
    Pulls and transforms the ownership table.

    Args:
        eia860_dfs (dict): Each entry in this dictionary of DataFrame objects
            corresponds to a page from the EIA860 form, as reported in the
            Excel spreadsheets they distribute
        eia860_transformed_dfs (dict): A dictionary of DataFrame objects in
            which pages from EIA860 form (keys) correspond to normalized
            DataFrames of values from that page (values)

    Returns:
        dict: eia860_transformed_dfs, a dictionary of DataFrame objects in
        which pages from EIA860 form (keys) correspond to normalized
        DataFrames of values from that page (values)

    """
    o_df = (eia860_dfs['ownership'].copy().pipe(pudl.helpers.fix_eia_na).pipe(
        pudl.helpers.convert_to_date))

    # The fix we're making here is only known to be valid for 2011 -- if we
    # get older data... then we need to revisit the cleaning function and
    # make sure it also applies to those earlier years.
    if min(o_df.report_date.dt.year) < min(pc.working_years["eia860"]):
        raise ValueError(
            f"EIA 860 transform step is only known to work for "
            f"year {min(pc.working_years['eia860'])} and later, but found data "
            f"from year {min(o_df.report_date.dt.year)}.")

    # Prior to 2012, ownership was reported as a percentage, rather than
    # as a proportion, so we need to divide those values by 100.
    o_df.loc[o_df.report_date.dt.year < 2012, 'fraction_owned'] = \
        o_df.loc[o_df.report_date.dt.year < 2012, 'fraction_owned'] / 100

    o_df = (o_df.astype({
        "owner_utility_id_eia": pd.Int64Dtype(),
        "utility_id_eia": pd.Int64Dtype(),
        "plant_id_eia": pd.Int64Dtype(),
        "owner_state": pd.StringDtype()
    }))

    eia860_transformed_dfs['ownership_eia860'] = o_df

    return eia860_transformed_dfs
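The nullable casts at the end matter because a plain integer cast fails on missing IDs; a small standalone illustration (not from the source):

s = pd.Series([17690.0, None, 229.0])
s.astype(pd.Int64Dtype())  # -> 17690, <NA>, 229: missing values survive the cast
# s.astype(int) would raise, since NaN has no representation as a plain int64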
Example #5
def load_score(loc):
    """ Load data from CLCCR scores provided by Ted Leonard

        Parameters
        ----------
        loc : str
            location of data dump

        Returns
        -------
        DataFrame of CLCCR csv
    """
    ccr_dtype_dict = {'Email': pd.StringDtype(), 'CLCCR Score': float}
    ccr_csv = pd.read_csv(loc,
                          dtype=ccr_dtype_dict,
                          parse_dates=['Lead DateTime'])
    print('Loading CLCCR Scores Complete')
    return ccr_csv
Example #6
def test_coerce_field_types_with_nans():
    df = pd.DataFrame({
        "QN": [pd.NA, np.nan, "1"],
        "RS_IND_01": [pd.NA, np.nan, "1"],
        "V_VV_I": [pd.NA, np.nan, "P"],
    })

    expected_df = pd.DataFrame({
        "QN":
        pd.Series([pd.NA, np.nan, 1], dtype=pd.Int64Dtype()),
        "RS_IND_01":
        pd.Series([pd.NA, np.nan, 1], dtype=pd.Int64Dtype()),
        "V_VV_I":
        pd.Series([pd.NA, np.nan, "P"], dtype=pd.StringDtype()),
    })

    df = coerce_field_types(df, DWDObservationResolution.HOURLY)
    assert_frame_equal(df, expected_df)
Example #7
def enumerate_ask(X, e, result):
    # Initialization of distribution of target variable X
    Q = pd.Series([], dtype=pd.StringDtype())
    # For each possible value x that X can have
    for x in PsychDrug[X].unique():
        # extend evidence e with value of X
        e = copy.deepcopy(e)
        e[X] = x
        # sort variables in topological order
        variables = topological_order(result, X)
        # Calculate enumeration
        prob = 1
        while len(variables) > 0:
            prob = enumerate_all(result, variables, e) * prob
            variables.pop(0)
        Q[str(x)] = prob
    # Normalization
    Q = Q / sum(Q)
    return Q
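The final normalization step in isolation, with made-up numbers, to make the intent explicit:

Q = pd.Series({"yes": 0.002, "no": 0.006})
Q = Q / sum(Q)  # -> yes 0.25, no 0.75: the entries now sum to 1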
Example #8
    def test_colonne_col_tipo(self):
        frutti = [
            'Anguria', 'Pompelmo', 'Fragole', 'Nespole', 'Lamponi', 'Pesca',
            'Melone', 'More'
        ]
        calorie = pd.Series([16, 26, 27, 28, 34, 27, 33, 36])
        colore = pd.Series([
            'rosso', 'rosa', 'rosso', 'arancione', 'rosso', 'arancione',
            'arancione', 'nero'
        ],
                           dtype=pd.StringDtype())

        df = pd.DataFrame(data={
            'frutti': frutti,
            'calorie': calorie,
            'colore': colore,
        })

        print(df.info())
Example #9
def test_coerce_field_types():
    """ Test coercion of fields """
    # Special cases
    # We require a stations object with hourly resolution in order to accurately parse
    # the hourly timestamp (pandas would fail parsing it because it has a strange
    # format)
    request = DwdObservationRequest(
        parameter=DwdObservationDataset.SOLAR,  # RS_IND_01,
        resolution=DwdObservationResolution.HOURLY,
        period=DwdObservationPeriod.RECENT,
        humanize=False,
        tidy=False,
    ).all()

    # Here we don't query the actual data because it takes too long;
    # instead we use a predefined DataFrame to check the coercion
    df = pd.DataFrame(
        {
            "station_id": ["00001"],
            "date": ["1970010100"],
            "qn": ["1"],
            "rs_ind_01": [1],
            "end_of_interval": ["1970010100:00"],
            "v_vv_i": ["p"],
        }
    )

    df = request.values._coerce_date_fields(df)
    df = request.values._coerce_meta_fields(df)
    df = request.values._coerce_parameter_types(df)

    expected_df = pd.DataFrame(
        {
            "station_id": pd.Categorical(["00001"]),
            "date": [pd.Timestamp("1970-01-01").tz_localize("utc")],
            "qn": pd.Series([1], dtype=pd.Int64Dtype()),
            "rs_ind_01": pd.Series([1], dtype=pd.Int64Dtype()),
            "end_of_interval": [pd.Timestamp("1970-01-01")],
            "v_vv_i": pd.Series(["p"], dtype=pd.StringDtype()),
        }
    )

    assert_frame_equal(df, expected_df)
Example #10
def test_build_series_from_nullable_pandas_dtype(pd_dtype, expect_dtype):
    if pd_dtype == pd.StringDtype():
        data = ["a", pd.NA, "c", pd.NA, "e"]
    elif pd_dtype == pd.BooleanDtype():
        data = [True, pd.NA, False, pd.NA, True]
    else:
        data = [1, pd.NA, 3, pd.NA, 5]

    pd_data = pd.Series(data, dtype=pd_dtype)
    gd_data = cudf.Series.from_pandas(pd_data)

    assert gd_data.dtype == expect_dtype

    # check mask
    expect_mask = [True if x is not pd.NA else False for x in pd_data]
    got_mask = mask_to_bools(gd_data._column.base_mask, 0,
                             len(gd_data)).to_array()

    np.testing.assert_array_equal(expect_mask, got_mask)
Example #11
def test_serialize_sliced_string():
    # https://github.com/rapidsai/cudf/issues/7735
    data = ["hi", "hello", None]
    pd_series = pd.Series(data, dtype=pd.StringDtype())
    gd_series = cudf.Series(data, dtype="str")
    sliced = gd_series[0:3]
    serialized_gd_series = gd_series.serialize()
    serialized_sliced = sliced.serialize()

    # validate that the corresponding frames are identical, since the slice
    # covers the full series
    for i in range(3):
        assert_eq(
            serialized_gd_series[1][i].to_host_array(),
            serialized_sliced[1][i].to_host_array(),
        )

    recreated = cudf.Series.deserialize(*sliced.serialize())
    assert_eq(recreated.to_pandas(nullable=True), pd_series)
Example #12
def test_build_df_from_nullable_pandas_dtype(pd_dtype, expect_dtype):
    if pd_dtype == pd.StringDtype():
        data = ["a", pd.NA, "c", pd.NA, "e"]
    elif pd_dtype == pd.BooleanDtype():
        data = [True, pd.NA, False, pd.NA, True]
    else:
        data = [1, pd.NA, 3, pd.NA, 5]

    pd_data = pd.DataFrame.from_dict({"a": data}, dtype=pd_dtype)
    gd_data = cudf.DataFrame.from_pandas(pd_data)

    assert gd_data["a"].dtype == expect_dtype

    # check mask
    expect_mask = [True if x is not pd.NA else False for x in pd_data["a"]]
    got_mask = mask_to_bools(
        gd_data["a"]._column.base_mask, 0, len(gd_data)
    ).values_host

    np.testing.assert_array_equal(expect_mask, got_mask)
Example #13
def covid_data(granularity):
    """Read a coronavirus data file.

    Note: Impure Function
    Reasons:
        Non-deterministic: The return value relies on file contents, which can
            change arbitrarily.

    Args:
        granularity (<enum "Granularity">): The granularity of the data file.

    Returns:
        pd.DataFrame: The contents of the data file, with correct dtypes for
            "date" and "fips" columns.
    """
    return pipe(
        granularity, _covid_filepath,
        curry(pd.read_csv,
              parse_dates=["date"],
              dtype={"fips": pd.StringDtype()}))
Example #14
def electricity_planning_areas(pudl_settings):
    """Electric Planning Area geometries from HIFLD."""
    gdb_path = pathlib.Path(
        pudl_settings["data_dir"],
        "local/hifld/electric_planning_areas.gdb"
    )

    gdf = (
        geopandas.read_file(gdb_path)
        .assign(
            SOURCEDATE=lambda x: pd.to_datetime(x.SOURCEDATE),
            VAL_DATE=lambda x: pd.to_datetime(x.VAL_DATE),
            ID=lambda x: pd.to_numeric(x.ID),
            NAICS_CODE=lambda x: pd.to_numeric(x.NAICS_CODE),
            YEAR=lambda x: pd.to_numeric(x.YEAR),
        )
        # Hack to work around a geopandas issue fixed as of v0.8.0
        # https://github.com/geopandas/geopandas/issues/1366
        .assign(
            ID=lambda x: x.ID.astype(pd.Int64Dtype()),
            NAME=lambda x: x.NAME.astype(pd.StringDtype()),
            COUNTRY=lambda x: x.COUNTRY.astype(pd.StringDtype()),
            NAICS_CODE=lambda x: x.NAICS_CODE.astype(pd.Int64Dtype()),
            NAICS_DESC=lambda x: x.NAICS_DESC.astype(pd.StringDtype()),
            SOURCE=lambda x: x.SOURCE.astype(pd.StringDtype()),
            VAL_METHOD=lambda x: x.VAL_METHOD.astype(pd.StringDtype()),
            WEBSITE=lambda x: x.WEBSITE.astype(pd.StringDtype()),
            ABBRV=lambda x: x.ABBRV.astype(pd.StringDtype()),
            YEAR=lambda x: x.YEAR.astype(pd.Int64Dtype()),
            PEAK_LOAD=lambda x: x.PEAK_LOAD.astype(float),
            PEAK_RANGE=lambda x: x.PEAK_RANGE.astype(float),
            SHAPE_Length=lambda x: x.SHAPE_Length.astype(float),
            SHAPE_Area=lambda x: x.SHAPE_Area.astype(float),
        )
    )
    # Need to set these IDs b/c HIFLD geometry uses EIA Balancing Authority IDs
    # (maybe?) FERC 714 is using EIA Utility IDs. This isn't totally resolved
    # and we need to figure out which set of IDs is getting used where.
    gdf.loc[gdf.ID == 2775, "ID"] = 229  # CAISO
    gdf.loc[gdf.ID == 59504, "ID"] = 17690  # Southwest Power Pool
    gdf.loc[gdf.ID == 14379, "ID"] = 14354  # PacifiCorp East + West
    gdf.loc[gdf.ID == 13670, "ID"] = 39347  # Northeast TX Electric Co-op
    return gdf
Example #15
def sql_to_python_type(sql_type: str) -> type:
    """Turn an SQL type into a dataframe dtype"""
    if (sql_type.startswith("CHAR(") or sql_type.startswith("VARCHAR(")
            or sql_type == "VARCHAR"):
        return pd.StringDtype()
    elif sql_type.startswith("INTERVAL"):
        return np.dtype("<m8[ns]")
    elif (sql_type.startswith("TIMESTAMP(") or sql_type.startswith("TIME(")
          or sql_type == "DATE"):
        return np.dtype("<M8[ns]")

    elif sql_type.startswith("DECIMAL("):
        # We use np.float64 always, even though we might
        # be able to use a smaller type
        return np.float64
    else:
        try:
            return _SQL_TO_PYTHON_FRAMES[sql_type]
        except KeyError:  # pragma: no cover
            raise NotImplementedError(
                f"The SQL type {sql_type} is not implemented (yet)")
Example #16
def write_results(n_tiles=4, custom_name=None):
    print(f"Writing for {n_tiles}x{n_tiles}")
    folder = f"report_{n_tiles}"
    if custom_name:
        folder = custom_name
    datafile = f"results/{folder}.csv"
    C = []
    for pred in os.listdir(res_model_folder):
        # if pred.is_dir(): continue
        name = os.path.splitext(pred)[0]
        print('name', name)
        s = pd.Series([], dtype=pd.StringDtype())
        s.name = name

        gt = open_mask(f"{mask_folder}/{name}.png")

        pred_img = open_mask(os.path.join(res_model_folder, pred))
        pred_img = clean_mask(pred_img)

        #     plt.imshow(pred_img)

        s["# cysts"], s["# detected"], s["# missed"], s[
            "# wrong"] = missed_wrong_cysts(gt, pred_img)
        s["# recall"] = s["# detected"] / (s["# detected"] + s["# missed"] +
                                           0.0001)
        s["iou"] = jaccard_score(gt, pred_img, average=avg)
        s["precision"] = precision_score(gt,
                                         pred_img,
                                         average=avg,
                                         zero_division=1)
        s["recall"] = recall_score(gt, pred_img, average=avg, zero_division=1)
        #     s['f1_score'] = f1_score(gt, pred_img, average=avg, zero_division=1)
        # fp fn recall
        C.append(s)

    print(C)

    df_bench = pd.DataFrame(C)
    df_bench.to_csv(datafile)
    # json.dump(df_bench, open(datafile, "w"))
    return
Example #17
def coerce_field_types(df: pd.DataFrame,
                       time_resolution: TimeResolution) -> pd.DataFrame:
    """
    Create a unique dtype mapping for a given list of column names.

    This is needed both to ensure the expected dtypes of the returned DataFrame and to
    restore dtypes after reading data back from a stored .h5 file: the data is stored
    in that file as strings, so the original dtypes have to be reapplied afterwards.

    Args:
        df: the station_data gathered in a pandas.DataFrame
        time_resolution: time resolution of the data as enumeration
    Return:
         station data with converted dtypes
    """

    for column in df.columns:

        # Station ids are handled separately as they are expected to not have any nans
        if column == DWDMetaColumns.STATION_ID.value:
            df[column] = df[column].astype(int)
        elif column in DATE_FIELDS_REGULAR:
            df[column] = pd.to_datetime(
                df[column],
                format=TIME_RESOLUTION_TO_DATETIME_FORMAT_MAPPING[
                    time_resolution],
            )
        elif column in DATE_FIELDS_IRREGULAR:
            df[column] = pd.to_datetime(
                df[column], format=DatetimeFormat.YMDH_COLUMN_M.value)
        elif column in QUALITY_FIELDS or column in INTEGER_FIELDS:
            df[column] = pd.to_numeric(df[column],
                                       errors="coerce").astype(pd.Int64Dtype())
        elif column in STRING_FIELDS:
            df[column] = df[column].astype(pd.StringDtype())
        else:
            df[column] = df[column].astype(float)

    return df
Example #18
def sql_to_python_type(sql_type: str) -> type:
    """Turn an SQL type into a dataframe dtype"""
    if sql_type.startswith("CHAR(") or sql_type.startswith("VARCHAR("):
        return pd.StringDtype()
    elif sql_type.startswith("INTERVAL"):
        return np.dtype("<m8[ns]")
    elif sql_type.startswith("TIMESTAMP(") or sql_type.startswith("TIME("):
        return np.dtype("<M8[ns]")
    elif sql_type.startswith("TIMESTAMP_WITH_LOCAL_TIME_ZONE("):
        # Everything is converted to UTC
        # So far, this has not caused problems
        return pd.DatetimeTZDtype(unit="ns", tz="UTC")
    elif sql_type.startswith("DECIMAL("):
        # We use np.float64 always, even though we might
        # be able to use a smaller type
        return np.float64
    else:
        try:
            return _SQL_TO_PYTHON_FRAMES[sql_type]
        except KeyError:  # pragma: no cover
            raise NotImplementedError(
                f"The SQL type {sql_type} is not implemented (yet)")
Example #19
def load_payments(loc, start_date, end_date):
    """ Load data from plm_payments.csv

        Parameters
        ----------
        loc : str
            location of data dump
        start_date, end_date : datetime-like
            bounds used to filter the 'Date' column (exclusive start, inclusive end)

        Returns
        -------
        DataFrame of payments csv with selected columns
    """
    payments_dtype_dict = {
        'Loan ID': 'Int64',
        'Store': pd.StringDtype(),
        'Type': pd.StringDtype(),
        'Amount': float,
        'Reversed': bool,
        'ACH Return Code': pd.StringDtype(),
        'Days Late': 'Int64',
        'Processed By Admin': pd.StringDtype(),
        'Reversed By Admin': pd.StringDtype(),
        'Comments': pd.StringDtype(),
        'Origin': pd.StringDtype()
    }
    payments_dates = ['Date', 'New Due Date', 'Cleared Date']

    payments_csv = pd.read_csv(loc + '/plm_payments.csv',
                               dtype=payments_dtype_dict,
                               usecols=list(payments_dtype_dict.keys()) +
                               payments_dates,
                               parse_dates=payments_dates)
    # load all payments from period
    payments_csv = payments_csv[(payments_csv['Date'] > start_date)
                                & (payments_csv['Date'] <= end_date)]
    payments_csv.drop(
        payments_csv[(payments_csv['Store'] == 'Test Store')].index,
        inplace=True)
    payments_csv['Date Only'] = payments_csv['Date'].dt.date
    print('Loading Payments CSV Complete')
    return payments_csv.sort_values(['Loan ID', 'Date'])
Example #20
def get_dummy_deltas(employees_path):
    employees = pd.read_csv(employees_path, nrows=10)

    # Create deleted flag
    employees["record_deleted"] = False
    employees["record_deleted"] = employees["record_deleted"].astype(
        pd.BooleanDtype())

    # Cast to new int cols
    for col in ["employee_id", "department_id", "manager_id"]:
        employees[col] = employees[col].astype(pd.Int64Dtype())

    # Cast to new str cols
    for col in ["sex", "forename", "surname"]:
        employees[col] = employees[col].astype(pd.StringDtype())

    # Let's split up the data and make some changes
    day1 = employees[employees.employee_id.isin([1, 2, 3, 4,
                                                 5])].reset_index(drop=True)

    day2 = employees[employees.employee_id.isin([5, 6,
                                                 7])].reset_index(drop=True)
    day2.loc[0, "department_id"] = 2
    day2.loc[0, "manager_id"] = 18

    day3 = employees[employees.employee_id.isin([1, 7, 9, 10,
                                                 11])].reset_index(drop=True)
    day3.department_id = 2
    day3.manager_id = 5

    # Reset this person's values for clarity
    day3.loc[0, "record_deleted"] = True
    day3.loc[0, "department_id"] = 1
    day3.loc[0, "manager_id"] = 17

    deltas = {"day1": day1, "day2": day2, "day3": day3}

    return deltas
Example #21
    def test_concatenazione(self):
        import datetime
        clienti = pd.Series(['paco', 'vittoria', 'francesco'],
                            dtype=pd.StringDtype())
        indirizzi = [np.nan, 'via 1', 'piaza 2']
        eta = [16, 56, 98]
        clienti_df = pd.DataFrame({
            'cliente': clienti,
            'indirizzo': indirizzi,
            'eta': eta
        })
        nome = ['uno', 'altro', 'non so']
        data = [
            datetime.datetime.now(),
            datetime.datetime.now(),
            datetime.datetime(year=2020, month=3, day=23)
        ]
        motivi_df = pd.DataFrame({
            'nome': nome,
            'data': data,
            'usuario': clienti
        })

        df_col = pd.concat([motivi_df, clienti_df], axis=1)
        print("Axis1: \n", df_col)

        df_row = pd.concat([motivi_df, clienti_df], axis=0)
        print("Axis0: \n", df_row)

        print(
            "merge per i clienti/usuari:\n",
            pd.merge(clienti_df,
                     motivi_df,
                     left_on='cliente',
                     right_on='usuario',
                     left_index=True))

        print("join:\n", clienti_df.join(motivi_df))
Example #22
    def prepareSchema(df_covid, df_popul, country_selected):
        """
            Removes, renames columns and changes datatypes for necessary dataframes
        """
        # Rename columns
        df_covid.rename(columns={"Country/Region": "Country"}, inplace=True)
        df_covid.rename(columns={"Province/State": "State"}, inplace=True)
        df_popul.rename(columns={"Country Name": "Country"}, inplace=True)
        df_popul.rename(columns={"Year": "Date"}, inplace=True)
        # Drop columns
        df_covid.drop(columns=['State', "Lat", "Long"],
                      inplace=True,
                      errors="ignore")
        df_popul.drop(columns=['Country Code'], inplace=True, errors="ignore")
        # Drop nan values
        df_covid.dropna(subset=['Confirmed', "Recovered", "Deaths"],
                        inplace=True)

        # Drop all countries not selected
        if len(country_selected) == 0:
            country_selected = list(df_covid["Country"].unique())

        df_covid = df_covid[df_covid["Country"].isin(country_selected)].copy()

        # Convert columns to proper datatypes
        df_covid["Date"] = df_covid["Date"].apply(pd.to_datetime)
        df_covid["Country"] = df_covid["Country"].astype(pd.StringDtype())
        df_covid["Confirmed"] = df_covid["Confirmed"].astype(np.int)
        df_covid["Recovered"] = df_covid["Recovered"].astype(np.int)
        df_covid["Deaths"] = df_covid["Deaths"].astype(np.int)

        df_popul['Date'] = pd.to_datetime(df_popul['Date'], format="%Y")
        df_popul["Value"] = df_popul["Value"].astype(np.int)

        # Group values by Country with different regions
        df_covid = df_covid.groupby(["Date", "Country"], as_index=False).sum()

        return df_covid, df_popul
Example #23
def test_coerce_field_types():
    df = pd.DataFrame({
        "QN": ["1"],
        "RS_IND_01": ["1"],
        "DATE": ["1970010100"],
        "END_OF_INTERVAL": ["1970010100:00"],
        "V_VV_I": ["P"],
    })

    expected_df = pd.DataFrame({
        "QN":
        pd.Series([1], dtype=pd.Int64Dtype()),
        "RS_IND_01":
        pd.Series([1], dtype=pd.Int64Dtype()),
        "DATE": [pd.Timestamp("1970-01-01")],
        "END_OF_INTERVAL": [pd.Timestamp("1970-01-01")],
        "V_VV_I":
        pd.Series(["P"], dtype=pd.StringDtype()),
    })

    df = coerce_field_types(df, DWDObservationResolution.HOURLY)

    assert_frame_equal(df, expected_df)
Example #24
def pd_to_json(
    df: pd.DataFrame,
    output_file: Union[IO, str],
    orient: str = "records",
    lines: bool = True,
    **kwargs,
):
    """Export a dataframe to a json newlines file this package can open identically.

    Converts period data types to datetime strings with precision to the second.

    Args:
        df (pd.DataFrame): a pandas dataframe
        output_file (IO or str): the path you want to export to
        orient (str): standard pandas .to_json orient argument, defaulting to 'records'
        lines (bool): standard pandas .to_json lines argument, defaulting to True
        **kwargs: any other keyword arguments to pass to pandas .to_json
    """
    new = df.copy()

    # Convert date-related columns to strings Arrow can read consistently
    for col in new.columns:
        if pd.api.types.is_period_dtype(new[col]):
            new[col] = new[col].dt.strftime("%Y-%m-%d %H:%M:%S")
        elif any([
                pd.api.types.is_datetime64_any_dtype(new[col]),
                isinstance(
                    new[col][new[col].notnull()].iloc[0],
                    (datetime.datetime, datetime.date),
                ),
        ]):
            new[col] = new[col].astype(pd.StringDtype())
            # Convert pd_timestamp string 'NaT' to NaN so PyArrow can read them
            new[col].replace("NaT", np.nan, regex=False, inplace=True)

    new.to_json(output_file, orient=orient, lines=lines, **kwargs)
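A quick usage sketch (the file name is hypothetical), showing that a period column is written as a plain datetime string in the JSON lines output:

df = pd.DataFrame({
    "month": pd.period_range("2020-01", periods=3, freq="M"),
    "value": [1, 2, 3],
})
pd_to_json(df, "sample.jsonl")
# each line holds the period rendered as a "YYYY-MM-DD HH:MM:SS" string plus the value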
Example #25
def test_df_pdv1_types():
    pdv1_test_mapping = {
        'int8col': {
            'vals': [1, 2, 3],
            'pd_type': pd.Int8Dtype()
        },
        'int16col': {
            'vals': [1, 2, 3],
            'pd_type': pd.Int16Dtype()
        },
        'int32col': {
            'vals': [1, 2, 3],
            'pd_type': pd.Int32Dtype()
        },
        'int64col': {
            'vals': [1, 2, 3],
            'pd_type': pd.Int64Dtype()
        },
        'stringcol': {
            'vals': ['one', 'two', 'three'],
            'pd_type': pd.StringDtype()
        },
        'boolcol': {
            'vals': [True, False, True],
            'pd_type': pd.BooleanDtype()
        }
    }
    pdv1_df = pd.DataFrame({
        col_name: col_meta['vals']
        for col_name, col_meta in pdv1_test_mapping.items()
    })

    pdv1_df = pdv1_df.astype({
        col_name: col_meta['pd_type']
        for col_name, col_meta in pdv1_test_mapping.items()
    })
    return pdv1_df
Example #26
def zero_pad_zips(zip_series, n_digits):
    """
    Retain prefix zeros in zipcodes.

    Args:
        zip_series (pd.Series) : series containing the zipcode values.
        n_digits(int) : zipcode length (likely 4 or 5 digits).

    Returns:
        pandas.Series: a series containing zipcodes with their prefix zeros
        intact and invalid zipcodes rendered as na.

    """

    # Add preceding zeros where necessary and get rid of decimal zeros
    def get_rid_of_decimal(series):
        return series.str.replace(r'[\.]+\d*', '', regex=True)

    zip_series = (
        zip_series.astype(pd.StringDtype()).replace('nan', np.nan).fillna(
            "0").pipe(get_rid_of_decimal).str.zfill(n_digits).replace(
                {n_digits * "0": pd.NA})  # All-zero Zip codes aren't valid.
    )
    return zip_series
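A usage sketch: integers and already-padded strings both come back as fixed-width strings, while missing and all-zero codes become NA.

zips = pd.Series([501, 2134, "06820", None])
zero_pad_zips(zips, 5)  # -> "00501", "02134", "06820", <NA>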
Example #27
def test_coerce_field_types_with_nans():
    df = pd.DataFrame({
        "QN": [pd.NA, np.nan, "1"],
        "RS_IND_01": [pd.NA, np.nan, "1"],
        "V_VV_I": [pd.NA, np.nan, "P"],
    })

    expected_df = pd.DataFrame({
        "QN":
        pd.Series([pd.NA, np.nan, 1], dtype=pd.Int64Dtype()),
        "RS_IND_01":
        pd.Series([pd.NA, np.nan, 1], dtype=pd.Int64Dtype()),
        "V_VV_I":
        pd.Series([pd.NA, np.nan, "P"], dtype=pd.StringDtype()),
    })

    def __init__(self):
        self.tidy_data = False

    with mock.patch.object(DWDObservationData, "__init__", new=__init__):

        df = DWDObservationData()._coerce_parameter_types(df)

    assert_frame_equal(df, expected_df)
Example #28
def load_payments(loc):
    """ Load data from plm_payments.csv

        Parameters
        ----------
        loc : str
            location of data dump

        Returns
        -------
        DataFrame of payments csv with selected columns
    """
    payments_dtype_dict = {'Loan ID': 'Int64', 'Type': pd.StringDtype(),
                           'Amount': float, 'Reversed': bool, 'Other Fee Paid': float,
                           'Late Fee Paid': float, 'Nsf Fee Paid': float, 'CSO/Setup Fee Paid': float,
                           'Back Interest Paid': float, 'Interest Paid': float,
                           'Principal Paid': float, 'Days Late': 'Int64', 'Comments': pd.StringDtype()}
    payments_dates = ['Date', 'New Due Date', 'Cleared Date']
    payments_csv = pd.read_csv(loc + '/plm_payments.csv',
                               dtype=payments_dtype_dict,
                               usecols=list(payments_dtype_dict.keys()) + payments_dates,
                               parse_dates=payments_dates)
    print('Loading Payments CSV Complete')
    return payments_csv.sort_values('Loan ID')
Example #29
    def test_to_table_nullable(self):
        boolean_array = pd.array([True, False, None], dtype=pd.BooleanDtype())
        int8_array = pd.array([1, 2, None], dtype=pd.Int8Dtype())
        int16_array = pd.array([1, 2, None], dtype=pd.Int16Dtype())
        int32_array = pd.array([1, 2, None], dtype=pd.Int32Dtype())
        int64_array = pd.array([1, 2, None], dtype=pd.Int64Dtype())
        float_array = pd.array([1.1, 2.2, None], dtype=pd.Float32Dtype())
        double_array = pd.array([1.1, 2.2, None], dtype=pd.Float64Dtype())
        string_array = pd.array(["s11", "s22", None], dtype=pd.StringDtype())
        object_array = pd.array([pd.NA, "s22", None], dtype=object)

        df = pd.DataFrame({
            "NullableBoolean": boolean_array,
            "NullableInt8": int8_array,
            "NullableInt16": int16_array,
            "NullableInt32": int32_array,
            "NullableInt64": int64_array,
            "NullableFloat": float_array,
            "NullableDouble": double_array,
            "NullableString": string_array,
            "NullableObject": object_array,
        })

        table = to_table(df)
        self.assertIs(table.columns[0].data_type, dtypes.bool_)
        self.assertIs(table.columns[1].data_type, dtypes.int8)
        self.assertIs(table.columns[2].data_type, dtypes.int16)
        self.assertIs(table.columns[3].data_type, dtypes.int32)
        self.assertIs(table.columns[4].data_type, dtypes.int64)
        self.assertIs(table.columns[5].data_type, dtypes.float32)
        self.assertIs(table.columns[6].data_type, dtypes.double)
        self.assertIs(table.columns[7].data_type, dtypes.string)
        self.assertIs(table.columns[8].data_type, dtypes.PyObject)
        self.assertEqual(table.size, 3)
        table_string = table.to_string()
        self.assertEqual(9, table_string.count("null"))
Example #30
def getDataset():

    # Load the file
    dataset = pd.read_csv('default-cards.csv', header=0, sep=';')

    # Assign the correct dtype to each column
    dataset.name = dataset.name.astype(pd.StringDtype())
    dataset.color = dataset.color.astype(pd.StringDtype())
    dataset.cmc = dataset.cmc.astype(pd.Int64Dtype())
    dataset.set = dataset.set.astype(pd.StringDtype())
    dataset.rarity = dataset.rarity.astype(
        pd.CategoricalDtype(
            categories=['common', 'uncommon', 'rare', 'mythic'], ordered=True))
    dataset.text = dataset.text.astype(pd.StringDtype())
    dataset.power = dataset.power.astype('float').astype(pd.Int64Dtype())
    dataset.toughness = dataset.toughness.astype('float').astype(
        pd.Int64Dtype())
    dataset.price_usd = dataset.price_usd.astype('float64')
    dataset.price_eur = dataset.price_eur.astype('float64')
    dataset.type = dataset.type.astype(pd.StringDtype())
    dataset.subtype = dataset.subtype.astype(pd.StringDtype())

    return dataset