Example #1
    def format(cls, df):
        df.columns = df.columns.str.strip()
        df.columns = df.columns.str.lower()
        df = df.apply(lambda x: x.str.lower() if is_string_dtype(x) else x)
        df = df.apply(lambda x: x.str.strip() if is_string_dtype(x) else x)

        check_matrix = [
            cls.is_columns(df.columns),
            is_bool_dtype(df.is_buried),
            cls.is_road_zone(df.road_zone),
            cls.is_road_type(df.road_type),
            cls.is_element_type(df.element_type),
            cls.is_humidity_level(df.humidity_level),
            cls.is_concrete_class(df.concrete_class),
            is_bool_dtype(df.is_consequence_class_three),
            is_bool_dtype(df.is_racc),
            is_bool_dtype(df.is_drcm),
            is_bool_dtype(df.is_prestressed),
            is_bool_dtype(df.is_corrosion_inhibitor),
            is_bool_dtype(df.is_cpf),
            is_bool_dtype(df.is_stainless_steel)
        ]

        if all(check_matrix):
            return df
        else:
            raise ValueError(
                "Invalid format. Please check",
                [cls.input_columns[i]
                 for i, ok in enumerate(check_matrix) if not ok]
            )
Example #2
    def kdeplot(self, **kwargs):
        if self.config == (1, 0, 0):
            sns.kdeplot(self._obj[self.numerical_[0]].dropna(), **kwargs)
        elif self.config == (1, 1, 0):
            categorical = self._obj[self.categorical_[0]]
            if is_bool_dtype(categorical):
                categorical = categorical.astype("category")
            categories = categorical.cat.categories.tolist()

            if len(categories) > 3:
                warnings.warn(
                    "The cardinality of the categorical variable "
                    "is more than 3. This might cause visual clutter.")

            for category in categories:
                sns.kdeplot(self._obj.loc[categorical == category,
                                          self.numerical_[0]].dropna(),
                            shade=True,
                            **kwargs)
            plt.legend(categories)
        elif self.config == (2, 0, 0):
            sns.jointplot(x=self.numerical_[1],
                          y=self.numerical_[0],
                          data=self._obj,
                          kind="kde",
                          **kwargs)
Example #3
    def __getitem__(self, key):
        (row_loc, row_scalar, out_ndim) = self._validate_locator(key)

        sr = self.sr
        if row_scalar:
            result = sr._frame.read_at(row_loc)

        elif isinstance(row_loc, slice):
            if row_loc == slice(None):
                result = sr._frame
            else:
                result = sr._frame.slice_rows_by_slice(row_loc, False)

        else:
            row_loc = sr._ensure_valid_frame(row_loc)

            if not row_loc._is_series:
                raise ValueError("indexer must be 1-dimensional")

            if not is_bool_dtype(row_loc.dtype):
                raise err._unsupported_error(
                    "only boolean indexers are supported now")

            # This may raise an exception if the indexer size doesn't match
            # with the index of the LHS.
            row_loc = row_loc._frame.update_legate_index(sr._raw_index)

            result = sr._frame.select(row_loc)

        try:
            return super().construct_result(result, out_ndim, row_scalar)
        except _NotFoundError:
            raise KeyError(row_loc)
Example #4
    def guess_natsort_alg(cls, dtype: Type[Any]) -> NatsortFlagsAndValue:
        """
        Guesses a good natsorted flag for the dtype.

        Here are some specifics:
            - integers       ⇒ INT and SIGNED
            - floating-point ⇒ FLOAT and SIGNED
            - strings        ⇒ COMPATIBILITYNORMALIZE and GROUPLETTERS
            - datetime       ⇒ GROUPLETTERS (only affects 'Z' vs. 'z'; shouldn't matter)

        Args:
            dtype: Probably from ``pd.Series.dtype``

        Returns:
            A tuple of (set of flags, int) -- see :meth:`exact_natsort_alg`
        """
        st, x = set(), 0
        if is_string_dtype(dtype):
            st.update(["COMPATIBILITYNORMALIZE", "GROUPLETTERS"])
            x |= ns_enum.COMPATIBILITYNORMALIZE | ns_enum.GROUPLETTERS
        elif is_categorical_dtype(dtype):
            pass
        elif is_integer_dtype(dtype) or is_bool_dtype(dtype):
            st.update(["INT", "SIGNED"])
            x |= ns_enum.INT | ns_enum.SIGNED
        elif is_float_dtype(dtype):
            st.update(["FLOAT", "SIGNED"])
            x |= ns_enum.FLOAT | ns_enum.SIGNED  # same as ns_enum.REAL
        return NatsortFlagsAndValue(st, x)
Example #5
def is_discrete(s):
    """
    Returns
    -------
    bool
        True if the given Series should be considered discrete/categorical."""
    return is_bool_dtype(s) or not is_numeric_dtype(s)
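
Since pandas counts bool as a numeric dtype, the explicit is_bool_dtype check is what keeps boolean columns in the discrete bucket. A minimal usage sketch (hypothetical data, real pandas predicates):

import pandas as pd
from pandas.api.types import is_bool_dtype, is_numeric_dtype

assert is_discrete(pd.Series([True, False]))    # bool dtype -> discrete
assert is_discrete(pd.Series(["a", "b"]))       # non-numeric -> discrete
assert not is_discrete(pd.Series([0.5, 1.5]))   # float -> continuous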
Example #6
def rapid_test_reactions(states, contacts, params, seed):  # noqa: U100
    """Make people react to a positive rapid tests by reducing their contacts."""
    contacts = contacts.copy(deep=True)

    # we assume that if you haven't received PCR confirmation within 7 days
    # you go back to having contacts.
    received_rapid_test = states["cd_received_rapid_test"].between(
        -5, 0, inclusive=True)
    pos_rapid_test = states["is_tested_positive_by_rapid_test"]
    quarantine_pool = received_rapid_test & pos_rapid_test

    for col in contacts:
        loc = ("rapid_test_demand", "reaction")
        if col == "households":
            multiplier = params.loc[(*loc, "hh_contacts_multiplier"), "value"]
        else:
            multiplier = params.loc[(*loc, "not_hh_contacts_multiplier"),
                                    "value"]
        refuser = states["quarantine_compliance"] <= multiplier
        not_staying_home = refuser | ~quarantine_pool
        # no need to worry about dtypes because post_process_contacts happens
        # after this function is called.

        if is_bool_dtype(contacts[col]):
            contacts[col] = contacts[col].where(cond=not_staying_home,
                                                other=False)
        else:
            contacts[col] = contacts[col].where(cond=not_staying_home, other=0)

    return contacts
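
The is_bool_dtype branch above only changes the fill value handed to Series.where, which keeps entries where the condition holds and substitutes `other` elsewhere, so each contacts column keeps its own dtype. A small illustration with hypothetical values:

import pandas as pd

col = pd.Series([3, 5, 2])
not_staying_home = pd.Series([True, False, True])
assert col.where(cond=not_staying_home, other=0).tolist() == [3, 0, 2]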
Example #7
    def __getitem__(self, item):
        if isinstance(item, tuple):
            item = unpack_tuple_and_ellipses(item)

        if isinstance(item, numbers.Integral):
            return self.data[item]
        elif isinstance(item, slice) and item == slice(None):
            # Make sure we get a view
            return type(self)(self.data)
        elif isinstance(item, slice):
            # slice
            return type(self)(self.data[item])
        elif not is_list_like(item):
            # e.g. "foo" or 2.5
            # exception message copied from numpy
            raise IndexError(
                r"only integers, slices (`:`), ellipsis (`...`), numpy.newaxis "
                r"(`None`) and integer or boolean arrays are valid indices")
        else:
            item = pd.api.indexers.check_array_indexer(self, item)
            if is_bool_dtype(item.dtype):
                return self._from_sequence(
                    [x for x, m in zip(self, item) if m])
            # integer
            return type(self)([self.data[i] for i in item])
Example #8
    def __getitem__(self, key):
        """If this gets a number as the key it tries to get the row that is
        nearest to this number. If it is something list-like and the elements
        of the lists are numbers then all the elements of the list are looked
        up, sorted and mad unique. Afterwards it gets the rows that are nearest
        to the numbers. Otherwise it defaults to the []-operator of the
        DataFram-class but converts the result to a PyFoamDataFrame

        """
        idx = None
        if isinstance(key, (float, int)):
            idx = [Series(abs(self.index - key)).idxmin()]
        elif pdtypes.is_list_like(key):
            try:
                k = np.array(key)
                if pdtypes.is_numeric_dtype(k) and not pdtypes.is_bool_dtype(k):
                    idx = []
                    for i in k:
                        nx = Series(abs(self.index - i)).idxmin()
                        if nx not in idx:
                            idx.append(nx)
                        idx.sort()
            except TypeError:
                pass

        if idx is not None:
            return PyFoamDataFrame(self.iloc[idx])

        val = DataFrame.__getitem__(self, key)
        if isinstance(val, DataFrame):
            return PyFoamDataFrame(val)
        else:
            return val
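
The nearest-row lookup relies on Series(abs(self.index - key)) receiving a fresh RangeIndex, so idxmin returns a positional index that can safely be passed to .iloc. In isolation, with a hypothetical index:

import pandas as pd

idx = pd.Index([0.0, 0.5, 1.0])
pos = pd.Series(abs(idx - 0.6)).idxmin()  # distances are [0.6, 0.1, 0.4]
assert pos == 1                           # positional, thanks to the RangeIndex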
Example #9
    def __setitem__(self, key: Union[int, np.ndarray, list, slice],
                    value: Any) -> None:
        """
        See docstring in `ExtensionArray` class in `pandas/core/arrays/base.py`
        for information about this method.
        """
        key = check_array_indexer(self, key)
        if isinstance(value, ABCSeries) and isinstance(value.dtype, SpanDtype):
            value = value.values

        if value is None or isinstance(value, Sequence) and len(value) == 0:
            self._begin_tokens[key] = TokenSpan.NULL_OFFSET_VALUE
            self._end_tokens[key] = TokenSpan.NULL_OFFSET_VALUE
        elif isinstance(value, TokenSpan) or \
                ((isinstance(key, slice) or
                  (isinstance(key, np.ndarray) and is_bool_dtype(key.dtype))) and
                 isinstance(value, SpanArray)):
            self._begin_tokens[key] = value.begin_token
            self._end_tokens[key] = value.end_token
        elif isinstance(key, np.ndarray) and len(value) > 0 and len(value) == len(key) and \
                ((isinstance(value, Sequence) and isinstance(value[0], TokenSpan)) or
                 isinstance(value, TokenSpanArray)):
            for k, v in zip(key, value):
                self._begin_tokens[k] = v.begin_token
                self._end_tokens[k] = v.end_token
        else:
            raise ValueError(
                f"Attempted to set element of TokenSpanArray with "
                f"an object of type {type(value)}; current set of "
                f"allowed types is {(TokenSpan, TokenSpanArray)}")

        self._clear_cached_properties()
Example #10
    def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> IndexOpsLike:
        dtype, spark_type = pandas_on_spark_type(dtype)

        if is_integer_dtype(dtype) and not isinstance(dtype, extension_dtypes):
            if index_ops.hasnans:
                raise ValueError(
                    "Cannot convert %s with missing values to integer" % self.pretty_name
                )
        elif is_bool_dtype(dtype) and not isinstance(dtype, extension_dtypes):
            if index_ops.hasnans:
                raise ValueError("Cannot convert %s with missing values to bool" % self.pretty_name)

        if isinstance(dtype, CategoricalDtype):
            return _as_categorical_type(index_ops, dtype, spark_type)
        elif isinstance(spark_type, BooleanType):
            if isinstance(dtype, extension_dtypes):
                scol = index_ops.spark.column.cast(spark_type)
            else:
                scol = F.when(
                    index_ops.spark.column.isNull() | F.isnan(index_ops.spark.column),
                    SF.lit(True),
                ).otherwise(index_ops.spark.column.cast(spark_type))
            return index_ops._with_new_scol(
                scol.alias(index_ops._internal.data_spark_column_names[0]),
                field=index_ops._internal.data_fields[0].copy(dtype=dtype, spark_type=spark_type),
            )
        elif isinstance(spark_type, StringType):
            return _as_string_type(index_ops, dtype, null_str=str(np.nan))
        else:
            return _as_other_type(index_ops, dtype, spark_type)
Example #11
    def __setitem__(self, key: Union[int, np.ndarray], value: Any) -> None:
        """
        See docstring in `ExtensionArray` class in `pandas/core/arrays/base.py`
        for information about this method.
        """

        key = check_array_indexer(self, key)
        if isinstance(value, ABCSeries) and isinstance(value.dtype, SpanDtype):
            value = value.values

        if value is None or isinstance(value, Sequence) and len(value) == 0:
            self._begins[key] = Span.NULL_OFFSET_VALUE
            self._ends[key] = Span.NULL_OFFSET_VALUE
        elif isinstance(value, Span) or \
                ((isinstance(key, slice) or
                  (isinstance(key, np.ndarray) and is_bool_dtype(key.dtype)))
                 and isinstance(value, SpanArray)):
            self._begins[key] = value.begin
            self._ends[key] = value.end
        elif isinstance(key, np.ndarray) and len(value) > 0 and len(value) == len(key) and \
                ((isinstance(value, Sequence) and isinstance(value[0], Span)) or
                 isinstance(value, SpanArray)):
            for k, v in zip(key, value):
                self._begins[k] = v.begin
                self._ends[k] = v.end
        else:
            raise ValueError(f"Attempted to set element of SpanArray with "
                             f"an object of type {type(value)}")
        # We just changed the contents of this array, so invalidate any cached
        # results computed from those contents.
        self.increment_version()
Example #12
def series_dtype(s: pd.Series) -> VarType:
    """
    Computes the type of a pandas series.

    Parameters
    ----------
    s : pd.Series
        The series for which we wish to determine the type.

    Returns
    -------
    VarType
    """
    if is_bool_dtype(s):
        return VarType.TYPE_CAT
    elif is_string_dtype(s):
        return VarType.TYPE_CAT
    elif is_categorical_dtype(s):
        return VarType.TYPE_CAT
    elif is_numeric_dtype(s):
        if numeric_is_continuous(s):
            return VarType.TYPE_NUM
        else:
            return VarType.TYPE_CAT
    else:
        return VarType.TYPE_UNSUPPORTED
Example #13
def test_is_bool_dtype(data):
    assert is_bool_dtype(data)
    assert pd.core.common.is_bool_indexer(data)
    s = pd.Series(range(len(data)))
    result = s[data]
    expected = s[np.asarray(data)]
    tm.assert_series_equal(result, expected)
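
The `data` fixture is a boolean ExtensionArray, and the test asserts that masking a Series with it matches masking with its NumPy conversion. The same equivalence with a concrete mask (a sketch assuming pandas' nullable "boolean" dtype):

import numpy as np
import pandas as pd

s = pd.Series(range(4))
mask = pd.array([True, False, True, False], dtype="boolean")
pd.testing.assert_series_equal(s[mask], s[np.asarray(mask, dtype=bool)])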
Example #14
    def __getitem__(self, item):
        if isinstance(item, tuple):
            if len(item) > 1:
                if item[0] is Ellipsis:
                    item = item[1:]
                elif item[-1] is Ellipsis:
                    item = item[:-1]
            if len(item) > 1:
                raise IndexError("too many indices for array.")
            item = item[0]

        if isinstance(item, numbers.Integral):
            return self.data[item]
        elif isinstance(item, slice) and item == slice(None):
            # Make sure we get a view
            return type(self)(self.data)
        elif isinstance(item, slice):
            # slice
            return type(self)(self.data[item])
        else:
            item = pd.api.indexers.check_array_indexer(self, item)
            if is_bool_dtype(item.dtype):
                return self._from_sequence(
                    [x for x, m in zip(self, item) if m])
            # integer
            return type(self)([self.data[i] for i in item])
Example #15
def _infer_task(df, x, y):
    "Returns str with the name of the inferred task based on the columns x and y"
    if x == y:
        return "predict_itself"

    category_count = df[y].value_counts().count()
    if category_count == 1:
        return "predict_constant"
    if category_count == 2:
        return "classification"
    if category_count == len(df[y]) and (
        is_string_dtype(df[y]) or is_categorical_dtype(df[y])
    ):
        return "predict_id"
    if category_count <= NUMERIC_AS_CATEGORIC_BREAKPOINT and is_numeric_dtype(df[y]):
        return "classification"

    if is_bool_dtype(df[y]) or is_string_dtype(df[y]) or is_categorical_dtype(df[y]):
        return "classification"

    if is_datetime64_any_dtype(df[y]) or is_timedelta64_dtype(df[y]):
        raise Exception(
            f"The target column {y} has the dtype {df[y].dtype} which is not supported. A possible solution might be to convert {y} to a string column"
        )

    # this check needs to be after is_bool_dtype because bool is considered numeric by pandas
    if is_numeric_dtype(df[y]):
        return "regression"

    raise Exception(
        f"Could not infer a valid task based on the target {y}. The dtype {df[y].dtype} is not yet supported"
    )  # pragma: no cover
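
The ordering comment is easy to verify: pandas reports bool columns as numeric, so a boolean target would otherwise fall into the regression branch. A quick confirmation of that behavior:

import pandas as pd
from pandas.api.types import is_bool_dtype, is_numeric_dtype

flags = pd.Series([True, False, True])
assert is_numeric_dtype(flags)  # bool counts as numeric in pandas
assert is_bool_dtype(flags)     # hence the bool check must come first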
Example #16
def series_is_boolean(col: Union[pd.Series, pd.Index]):
    """
    returns:
        None if the column is all None/NaN;
        True if the column is bool dtype, or contains only True, False,
        and None (or only 0, 1, and NaN) with both truth values present;
        False otherwise

    caveat: does not interpret all-zero or all-one columns as boolean"""
    if len(col.unique()) == 1 and col.unique()[0] is None:
        # return None for all-None columns
        return None
    elif col.isna().all():
        return None
    elif is_bool_dtype(col):
        return True
    elif is_object_dtype(col):
        for val in col.unique():
            if val not in [True, False, None]:
                return False
        return False in col.unique() and True in col.unique()
    elif is_integer_dtype(col) or is_float_dtype(col):
        for val in col.unique():
            if pd.isna(val):
                continue
            if val not in [1, 0, None]:
                return False
            if 0 not in col.unique() or 1 not in col.unique():
                return False
        return True
    return False
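
A few sanity checks of the branches above, with hypothetical data:

import pandas as pd

assert series_is_boolean(pd.Series([True, False, None])) is True  # object branch
assert series_is_boolean(pd.Series([0, 1, 1])) is True            # integer branch
assert series_is_boolean(pd.Series([1, 2, 3])) is False           # 2 is not boolean
assert series_is_boolean(pd.Series([1, 1, 1])) is False           # caveat: all-ones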
Example #17
def to_data_table(data: pd.DataFrame):
    """
    Create a BOOM DataTable object from a pandas DataFrame.  The categories of
    any categorical variables will be handled as strings.
    """
    dtypes = data.dtypes
    ans = boom.DataTable()
    for i in range(data.shape[1]):
        dt = dtypes[i]
        vname = data.columns[i]
        if is_numeric_dtype(dt) or is_bool_dtype(dt):
            ans.add_numeric(boom.Vector(data.iloc[:, i].values.astype("float")),
                            vname)
        elif is_categorical_dtype(dt):
            x = data.iloc[:, i]
            values = x.cat.codes
            codes = x.cat.categories
            ans.add_categorical(values, codes, vname)
        elif is_object_dtype(dt):
            labels = data.iloc[:, i].astype("str")
            ans.add_categorical_from_labels(labels.values, vname)
        else:
            raise Exception(
                f"Only numeric or categorical data are supported.  "
                f"Column {i} ({data.columns[i]}) has dtype {dt}."
            )
    return ans
Example #18
    def update_attributes(self, column):
        if is_bool_dtype(column):
            col_type = 'logical'
        elif is_numeric_dtype(column):
            col_type = 'numeric'
        elif is_object_dtype(column):
            col_type = 'categorical'
        else:
            raise Exception('Column type is not supported')
        if self.type is None:
            self.type = col_type
        elif self.type != col_type:
            raise Exception(
                'The same name was used for columns with different type')
        if self.type == 'numeric':
            self.min = min(column.min(),
                           math.inf if self.min is None else self.min)
            self.max = max(column.max(),
                           -math.inf if self.max is None else self.max)
            if len(column.unique()) < 20:
                self.levels = np.unique((self.levels or []) +
                                        column.unique().tolist()).tolist()
            if self.levels is not None and len(self.levels) >= 20:
                self.levels = None
        else:
            self.levels = np.unique((self.levels or []) +
                                    column.unique().tolist()).tolist()
Example #19
    def contains_op(cls, series: pd.Series, state: dict) -> bool:
        is_valid_dtype = (pdt.is_categorical_dtype(series)
                          and not pdt.is_bool_dtype(series))
        if is_valid_dtype:
            return True

        return series_is_string(series)
Example #20
def check_if_series_has_internal_type(series, internal_type):
    """Check if data type of series fits to the internal type of gettsim.

    Parameters
    ----------
    series : pd.Series
        Some data series.
    internal_type : TypeVar
        One of the internal gettsim types.

    Returns
    -------
    out : bool
        Return check variable.
    """
    if internal_type == FloatSeries:
        out = is_float_dtype(series) or is_integer_dtype(series)
    elif internal_type == BoolSeries:
        out = is_bool_dtype(series)
    elif internal_type == IntSeries:
        out = is_integer_dtype(series)
    elif internal_type == DateTimeSeries:
        out = is_datetime64_any_dtype(series)
    else:
        raise ValueError(f"The internal type {internal_type} is not defined.")
    return out
Example #21
    def test_above_100_datatype(self):
        for file in self.list_of_output_files:
            file_path = os.path.join(self.output_path, file)
            data = pd.read_csv(file_path)

            above_100_datatype = data['above_100']
            self.assertTrue(is_bool_dtype(above_100_datatype))
Example #22
    def find_atoms(self, data: pd.DataFrame):
        """
        Find the numeric atoms and categorical levels to be modeled.
        """
        self._dtypes = data.dtypes
        atoms_dict = {}
        levels_dict = {}
        for i in range(data.shape[1]):
            vname = data.columns[i]
            dt = self._dtypes[i]

            if is_numeric_dtype(dt):
                variable = data.iloc[:, i]
                counts = variable.value_counts().sort_values(ascending=False)
                number_observed = counts.sum()
                atom_indicator = counts > 0.05 * number_observed
                atoms = counts[atom_indicator].index.tolist()
                if len(atoms) > 3:
                    atoms = atoms[:3]
                atoms_dict[vname] = atoms
                self._numeric_colnames.append(data.columns[i])

            elif (is_categorical_dtype(dt) or is_object_dtype(dt)
                  or is_bool_dtype(dt)):
                # TODO: put in some cardinality protections.
                levels = data.iloc[:, i].value_counts()
                levels_dict[vname] = levels
                self._categorical_colnames.append(data.columns[i])

            else:
                raise Exception(
                    "Only categorical or numeric types are supported.")

        return atoms_dict, levels_dict
Example #23
    def variable_type(self):
        if is_numeric_dtype(self.data) and not is_bool_dtype(self.data):
            # Only int and float types
            return 'numeric'
        else:
            # Handle bool, string, datetime, etc. as categorical
            self.data = self.data.astype('category')
            return 'categorical'
Example #24
    def contains_op(cls, series: pd.Series, state: dict) -> bool:
        if pdt.is_object_dtype(series):
            try:
                return series.isin({True, False}).all()
            except Exception:
                return False

        return pdt.is_bool_dtype(series)
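
This covers the common case of an object column holding plain Python booleans, which the dtype check alone would miss. For instance, with a hypothetical series:

import pandas as pd
import pandas.api.types as pdt

obj = pd.Series([True, False, True], dtype=object)
assert not pdt.is_bool_dtype(obj)     # object dtype, not bool dtype
assert obj.isin({True, False}).all()  # yet it contains only booleans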
Example #25
def get_formatter(dtype):
    if types.is_datetime64_any_dtype(dtype):
        return DateFormatter(format="%Y-%m-%d %H:%M:%S.%N")
    elif types.is_bool_dtype(dtype):
        return None
        # return BooleanFormatter()
    else:
        return None
Example #26
def _dtype_represents_categories(series) -> bool:
    "Determines if the dtype of the series represents categorical values"
    return (
        is_bool_dtype(series)
        or is_object_dtype(series)
        or is_string_dtype(series)
        or is_categorical_dtype(series)
    )
Example #27
    def test_search(self, mp_wfs, mp_remote_describefeaturetype, mp_remote_md,
                    mp_remote_fc, mp_remote_wfs_feature, mp_dov_xml):
        """Test the search method with only the query parameter.

        Test whether the result is correct.

        Parameters
        ----------
        mp_wfs : pytest.fixture
            Monkeypatch the call to the remote GetCapabilities request.
        mp_remote_describefeaturetype : pytest.fixture
            Monkeypatch the call to a remote DescribeFeatureType.
        mp_remote_md : pytest.fixture
            Monkeypatch the call to get the remote metadata.
        mp_remote_fc : pytest.fixture
            Monkeypatch the call to get the remote feature catalogue.
        mp_remote_wfs_feature : pytest.fixture
            Monkeypatch the call to get WFS features.
        mp_dov_xml : pytest.fixture
            Monkeypatch the call to get the remote XML data.

        """
        df = self.get_search_object().search(
            query=self.get_valid_query_single())

        assert type(df) is DataFrame

        assert list(df) == self.get_df_default_columns()

        datatype = self.get_type()
        allfields = datatype.get_field_names()
        ownfields = datatype.get_field_names(include_subtypes=False)
        subfields = [f for f in allfields if f not in ownfields]

        assert len(df) >= 1

        for field in list(df):
            if field in ownfields:
                assert len(df[field].unique()) == 1
            elif field in subfields:
                assert len(df[field].unique()) >= 1

        # dtype checks of the resulting df columns
        fields = self.get_type().get_fields(source=('wfs', 'xml', 'custom'))

        for field in list(df):
            datatype = fields[field]['type']
            if datatype == 'string':
                assert (is_object_dtype(df[field])
                        or df[field].isnull().values.all())  # all Nan/None
            elif datatype == 'float':
                assert is_float_dtype(df[field])
            elif datatype == 'integer':
                assert is_int64_dtype(df[field])
            elif datatype == 'date':
                assert is_object_dtype(df[field])
            elif datatype == 'boolean':
                assert is_bool_dtype(df[field])
Example #28
    def cond_ind(self, a_col_name, b_col_name, cond_col_names, alpha):
        a = self.df[a_col_name]
        b = self.df[b_col_name]
        conds = [self.df[z] for z in cond_col_names]

        if is_bool_dtype(a) and is_bool_dtype(b) and all([is_bool_dtype(z) for z in conds]):
            stat, pval = cmh(a, b, conds)

        elif (is_categorical_dtype(a) or is_bool_dtype(a)) and \
                (is_categorical_dtype(b) or is_bool_dtype(b)) and \
                all([is_categorical_dtype(z) or is_bool_dtype(z) for z in conds]):
            stat, pval = chisq_3d(a, b, conds)

        elif is_numeric_dtype(a) and \
                is_numeric_dtype(b) and \
                all([is_categorical_dtype(z) or is_bool_dtype(z) for z in conds]):
            stat, pval = blocked_pearson(a, b, conds)

        elif is_numeric_dtype(a) and \
                is_numeric_dtype(b) and \
                all([is_numeric_dtype(z) for z in conds]):
            stat, pval = partial_corr_r(a, b, conds)

        else:
            raise ValueError(
                "Unsupported combination of dtypes for conditional "
                "independence test")

        return pval < alpha
Example #29
    def derive_pack_func(col):
        if pdtypes.is_string_dtype(col):
            return pack_string_func
        elif pdtypes.is_bool_dtype(col):
            return pack_bool_func
        elif pdtypes.is_numeric_dtype(col):
            return pack_numeric_func
        else:
            return pack_null_func
Example #30
def is_nullable_dtype(dtype: Any) -> bool:
    """Wether dtype is a pandas nullable type."""
    from pandas.api.types import is_integer_dtype, is_bool_dtype
    # dtype: pd.core.arrays.numeric.NumericDtype
    nullable_alias = {"Int16", "Int32", "Int64"}
    is_int = is_integer_dtype(dtype) and dtype.name in nullable_alias
    # np.bool has alias `bool`, while pd.BooleanDtype has `boolean`.
    is_bool = is_bool_dtype(dtype) and dtype.name == "boolean"
    return is_int or is_bool
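
A quick check of the name-based distinction (the NumPy bool dtype is named "bool", while the nullable extension dtype is named "boolean"):

import pandas as pd

assert is_nullable_dtype(pd.Series([1, None], dtype="Int64").dtype)
assert is_nullable_dtype(pd.Series([True, None], dtype="boolean").dtype)
assert not is_nullable_dtype(pd.Series([True, False]).dtype)  # plain NumPy bool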
Example #31
    def universal_dataset_check(self, dataset_name, object_headers=None,
                                numeric_headers=None, bool_headers=None,
                                test_func=None):

        # "Hard" integrity checks that take a long time.
        # These tests only run if the MATMINER_DATASET_FULL_TEST
        # environment variable is set to True
        if do_complete_test:
            # Get rid of dataset if it's on the disk already
            data_path = os.path.join(
                self.dataset_dir,
                dataset_name + "." + self.dataset_dict[dataset_name][
                    'file_type'
                ]
            )
            if os.path.exists(data_path):
                os.remove(data_path)

            # Test that dataset can be downloaded
            load_dataset(dataset_name)
            self.assertTrue(os.path.exists(data_path))

            # Test that data is now available and has all its elements
            df = load_dataset(dataset_name, download_if_missing=False)
            self.assertEqual(
                len(df), self.dataset_dict[dataset_name]["num_entries"]
            )

            # Test all columns are there
            self.assertEqual(sorted(list(df)), sorted(
                [header for header in
                 self.dataset_dict[dataset_name]['columns'].keys()]
            ))

            # Test each column for appropriate type
            if object_headers is None:
                object_headers = []
            if numeric_headers is None:
                numeric_headers = []
            if bool_headers is None:
                bool_headers = []

            df = load_dataset(dataset_name, download_if_missing=False)
            if object_headers:
                self.assertTrue(is_object_dtype(df[object_headers].values))
            if numeric_headers:
                self.assertTrue(is_numeric_dtype(df[numeric_headers].values))
            if bool_headers:
                self.assertTrue(is_bool_dtype(df[bool_headers].values))

            # Make sure all columns are accounted for
            column_headers = object_headers + numeric_headers + bool_headers
            self.assertEqual(sorted(list(df)), sorted(column_headers))

            # Run tests unique to the dataset
            if test_func is not None:
                test_func(df)

        # "Soft" check that just makes sure the dataset download page is active
        # This runs when on a system with the CI environment var present
        # (e.g. when running a continuous integration VCS system)
        else:
            download_page = requests.head(
                self.dataset_dict[dataset_name]["url"]
            )
            self.assertTrue(download_page.ok)