def format(cls, df):
    """Normalise ``df`` and validate it against the checks provided by ``cls``.

    Column labels are stripped and lower-cased *in place* on the frame that
    was passed in; string columns are then lower-cased and stripped in a new
    frame, which is what gets validated and returned.

    Parameters
    ----------
    cls
        Object providing the ``is_*`` validators used below and an
        ``input_columns`` sequence used to label failed checks.
    df : pandas.DataFrame
        Frame to normalise and validate.

    Returns
    -------
    pandas.DataFrame
        The normalised frame when every check passes.

    Raises
    ------
    ValueError
        When at least one check fails. ``args[1]`` holds the entries of
        ``cls.input_columns`` at the positions of the failed checks.
    """
    df.columns = df.columns.str.strip().str.lower()
    df = df.apply(
        lambda x: x.str.lower().str.strip() if is_string_dtype(x) else x
    )

    # NOTE(review): every entry is evaluated eagerly, so a missing column
    # surfaces as AttributeError before the ValueError below can be raised.
    check_matrix = [
        cls.is_columns(df.columns),
        is_bool_dtype(df.is_buried),
        cls.is_road_zone(df.road_zone),
        cls.is_road_type(df.road_type),
        cls.is_element_type(df.element_type),
        cls.is_humidity_level(df.humidity_level),
        cls.is_concrete_class(df.concrete_class),
        is_bool_dtype(df.is_consequence_class_three),
        is_bool_dtype(df.is_racc),
        is_bool_dtype(df.is_drcm),
        is_bool_dtype(df.is_prestressed),
        is_bool_dtype(df.is_corrosion_inhibitor),
        is_bool_dtype(df.is_cpf),
        is_bool_dtype(df.is_stainless_steel),
    ]
    if all(check_matrix):
        return df

    # Use truthiness (not ``== False``) so that any result which made
    # ``all()`` fail -- e.g. ``None`` -- is also reported.
    failed = [
        cls.input_columns[i]
        for i, passed in enumerate(check_matrix)
        if not passed
    ]
    raise ValueError("Invalid format. Please check", failed)
def kdeplot(self, **kwargs):
    """Draw a kernel-density plot chosen by ``self.config``.

    * ``(1, 0, 0)`` -- KDE of the first numerical column (NaNs dropped).
    * ``(1, 1, 0)`` -- one shaded KDE of the first numerical column per
      category of the first categorical column, plus a legend.
    * ``(2, 0, 0)`` -- joint KDE of the first two numerical columns.

    Any other configuration draws nothing. ``kwargs`` are forwarded to the
    seaborn call.
    """
    frame = self._obj
    layout = self.config

    if layout == (1, 0, 0):
        sns.kdeplot(frame[self.numerical_[0]].dropna(), **kwargs)
        return

    if layout == (1, 1, 0):
        groups = frame[self.categorical_[0]]
        # Booleans have no ``.cat`` accessor, so promote them first.
        if is_bool_dtype(groups):
            groups = groups.astype("category")
        labels = groups.cat.categories.tolist()
        if len(labels) > 3:
            warnings.warn(
                "The cardinality of the categorical variable "
                "is more than 3. This might cause visual clutter.")
        value_column = self.numerical_[0]
        for label in labels:
            subset = frame.loc[groups == label, value_column].dropna()
            sns.kdeplot(subset, shade=True, **kwargs)
        plt.legend(labels)
        return

    if layout == (2, 0, 0):
        sns.jointplot(x=self.numerical_[1],
                      y=self.numerical_[0],
                      data=frame,
                      kind="kde",
                      **kwargs)
def __getitem__(self, key):
    """Select rows of the wrapped series by scalar, slice or boolean indexer.

    Raises
    ------
    ValueError
        If a non-slice, non-scalar indexer is not 1-dimensional.
    KeyError
        If building the result raises ``_NotFoundError``.
    """
    row_loc, row_scalar, out_ndim = self._validate_locator(key)
    series = self.sr
    frame = series._frame

    if row_scalar:
        selected = frame.read_at(row_loc)
    elif isinstance(row_loc, slice):
        # A full slice needs no work; anything else is a row-range slice.
        selected = (
            frame
            if row_loc == slice(None)
            else frame.slice_rows_by_slice(row_loc, False)
        )
    else:
        row_loc = series._ensure_valid_frame(row_loc)
        if not row_loc._is_series:
            raise ValueError("indexer must be 1-dimensional")
        if not is_bool_dtype(row_loc.dtype):
            raise err._unsupported_error(
                "only boolean indexers are supported now")

        # This may raise an exception if the indexer size doesn't match
        # with the index of the LHS.
        row_loc = row_loc._frame.update_legate_index(series._raw_index)
        selected = frame.select(row_loc)

    try:
        return super().construct_result(selected, out_ndim, row_scalar)
    except _NotFoundError:
        raise KeyError(row_loc)
def guess_natsort_alg(cls, dtype: Type[Any]) -> NatsortFlagsAndValue:
    """
    Guesses a good natsorted flag for the dtype.

    The dtype is tested in this order:

    - strings ⇒ COMPATIBILITYNORMALIZE and GROUPLETTERS
    - categorical ⇒ no flags
    - integers and booleans ⇒ INT and SIGNED
    - floating-point ⇒ FLOAT and SIGNED
    - anything else ⇒ no flags

    NOTE(review): no branch handles datetime dtypes specially; unless they
    match one of the tests above they get no flags.

    Args:
        dtype: Probably from ``pd.Series.dtype``

    Returns:
        A tuple of (set of flags, int) -- see :meth:`exact_natsort_alg`
    """
    if is_string_dtype(dtype):
        names = {"COMPATIBILITYNORMALIZE", "GROUPLETTERS"}
        value = ns_enum.COMPATIBILITYNORMALIZE | ns_enum.GROUPLETTERS
    elif is_categorical_dtype(dtype):
        names, value = set(), 0
    elif is_integer_dtype(dtype) or is_bool_dtype(dtype):
        names = {"INT", "SIGNED"}
        value = ns_enum.INT | ns_enum.SIGNED
    elif is_float_dtype(dtype):
        names = {"FLOAT", "SIGNED"}
        value = ns_enum.FLOAT | ns_enum.SIGNED  # same as ns_enum.REAL
    else:
        names, value = set(), 0
    return NatsortFlagsAndValue(names, value)
def is_discrete(s):
    """Tell whether a Series should be treated as discrete/categorical.

    Returns
    -------
    bool
        True for boolean data and for anything that is not numeric;
        False for non-boolean numeric data.
    """
    if is_bool_dtype(s):
        return True
    return not is_numeric_dtype(s)
def rapid_test_reactions(states, contacts, params, seed):  # noqa: U100
    """Make people react to a positive rapid test by reducing their contacts.

    People whose ``cd_received_rapid_test`` lies in ``[-5, 0]`` and who tested
    positive form the quarantine pool. Pool members whose
    ``quarantine_compliance`` exceeds the relevant multiplier get their
    contacts set to ``False`` (boolean columns) or ``0`` (other columns).

    Parameters
    ----------
    states : pandas.DataFrame
        Must contain ``cd_received_rapid_test``,
        ``is_tested_positive_by_rapid_test`` and ``quarantine_compliance``.
    contacts : pandas.DataFrame
        One column per contact model; it is copied, not modified.
    params : pandas.DataFrame
        Indexed by a three-level key; the ``value`` column is read at
        ``("rapid_test_demand", "reaction", "hh_contacts_multiplier")`` and
        ``("rapid_test_demand", "reaction", "not_hh_contacts_multiplier")``.
    seed
        Unused.

    Returns
    -------
    pandas.DataFrame
        The reduced copy of ``contacts``.
    """
    contacts = contacts.copy(deep=True)

    # we assume that if you haven't received PCR confirmation within 7 days
    # you go back to having contacts.
    # NOTE(review): the window actually used is -5..0 inclusive -- confirm it
    # matches the 7 days mentioned above.
    # ``between`` is closed on both ends by default; passing
    # ``inclusive=True`` is rejected by pandas >= 2.0.
    received_rapid_test = states["cd_received_rapid_test"].between(-5, 0)
    pos_rapid_test = states["is_tested_positive_by_rapid_test"]
    quarantine_pool = received_rapid_test & pos_rapid_test

    loc = ("rapid_test_demand", "reaction")
    for col in contacts:
        if col == "households":
            multiplier = params.loc[(*loc, "hh_contacts_multiplier"), "value"]
        else:
            multiplier = params.loc[
                (*loc, "not_hh_contacts_multiplier"), "value"
            ]

        refuser = states["quarantine_compliance"] <= multiplier
        not_staying_home = refuser | ~quarantine_pool

        # no need to worry about dtypes because post_process_contacts happens
        # after this function is called.
        neutral = False if is_bool_dtype(contacts[col]) else 0
        contacts[col] = contacts[col].where(cond=not_staying_home, other=neutral)

    return contacts
def __getitem__(self, item):
    """Index the array by integer, slice, boolean mask or integer sequence.

    An integer returns the stored element. Slices, masks and integer
    sequences return a new array of the same type. A non-list-like key that
    is neither an integer nor a slice raises ``IndexError``.
    """
    if isinstance(item, tuple):
        item = unpack_tuple_and_ellipses(item)

    if isinstance(item, numbers.Integral):
        return self.data[item]

    if isinstance(item, slice):
        # For a full slice, re-wrap ``self.data`` itself (make sure we get
        # a view); otherwise wrap the sliced data.
        payload = self.data if item == slice(None) else self.data[item]
        return type(self)(payload)

    if not is_list_like(item):
        # e.g. "foo" or 2.5
        # exception message copied from numpy
        raise IndexError(
            r"only integers, slices (`:`), ellipsis (`...`), numpy.newaxis "
            r"(`None`) and integer or boolean arrays are valid indices")

    indexer = pd.api.indexers.check_array_indexer(self, item)
    if is_bool_dtype(indexer.dtype):
        kept = [value for value, keep in zip(self, indexer) if keep]
        return self._from_sequence(kept)

    # integer positions
    return type(self)([self.data[pos] for pos in indexer])
def __getitem__(self, key):
    """Nearest-value row lookup for numeric keys.

    If ``key`` is a number, the row whose index value is nearest to it is
    returned. If ``key`` is list-like and converts to a numeric (non-boolean)
    array, every element is looked up the same way; the resulting row
    positions are made unique and sorted before the rows are fetched.
    Otherwise this defaults to the ``[]`` operator of the ``DataFrame``
    class, converting a ``DataFrame`` result to a ``PyFoamDataFrame``.
    """
    def nearest_position(value):
        # Position of the index entry closest to ``value``.
        return Series(abs(self.index - value)).idxmin()

    positions = None
    if isinstance(key, (float, int)):
        positions = [nearest_position(key)]
    elif pdtypes.is_list_like(key):
        try:
            candidates = np.array(key)
            if (pdtypes.is_numeric_dtype(candidates)
                    and not pdtypes.is_bool_dtype(candidates)):
                positions = []
                for value in candidates:
                    hit = nearest_position(value)
                    if hit not in positions:
                        positions.append(hit)
                positions.sort()
        except TypeError:
            pass

    if positions is not None:
        return PyFoamDataFrame(self.iloc[positions])

    result = DataFrame.__getitem__(self, key)
    return PyFoamDataFrame(result) if isinstance(result, DataFrame) else result
def __setitem__(self, key: Union[int, np.ndarray, list, slice], value: Any) -> None:
    """
    See docstring in `ExtensionArray` class in `pandas/core/arrays/base.py`
    for information about this method.

    ``None`` or an empty sequence stores the null offset at ``key``; a single
    ``TokenSpan`` (or a ``SpanArray`` with a slice/boolean-mask key) is
    assigned in bulk; a same-length sequence of ``TokenSpan`` or a
    ``TokenSpanArray`` with an array key is assigned element by element.
    Anything else raises ``ValueError``.
    """
    def store(target, begin, end):
        self._begin_tokens[target] = begin
        self._end_tokens[target] = end

    key = check_array_indexer(self, key)
    if isinstance(value, ABCSeries) and isinstance(value.dtype, SpanDtype):
        value = value.values

    value_is_empty = value is None or (
        isinstance(value, Sequence) and len(value) == 0
    )
    if value_is_empty:
        store(key, TokenSpan.NULL_OFFSET_VALUE, TokenSpan.NULL_OFFSET_VALUE)
    elif isinstance(value, TokenSpan) or (
        isinstance(value, SpanArray)
        and (
            isinstance(key, slice)
            or (isinstance(key, np.ndarray) and is_bool_dtype(key.dtype))
        )
    ):
        store(key, value.begin_token, value.end_token)
    elif (
        isinstance(key, np.ndarray)
        and len(value) > 0
        and len(value) == len(key)
        and (
            (isinstance(value, Sequence) and isinstance(value[0], TokenSpan))
            or isinstance(value, TokenSpanArray)
        )
    ):
        for position, span in zip(key, value):
            store(position, span.begin_token, span.end_token)
    else:
        raise ValueError(
            f"Attempted to set element of TokenSpanArray with "
            f"an object of type {type(value)}; current set of "
            f"allowed types is {(TokenSpan, TokenSpanArray)}")

    self._clear_cached_properties()
def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> IndexOpsLike:
    """Cast ``index_ops`` to ``dtype``.

    Raises
    ------
    ValueError
        If the target is a non-extension integer or boolean dtype and
        ``index_ops`` has missing values.
    """
    dtype, spark_type = pandas_on_spark_type(dtype)
    is_extension = isinstance(dtype, extension_dtypes)

    if is_integer_dtype(dtype) and not is_extension:
        if index_ops.hasnans:
            raise ValueError(
                "Cannot convert %s with missing values to integer" % self.pretty_name
            )
    elif is_bool_dtype(dtype) and not is_extension:
        if index_ops.hasnans:
            raise ValueError("Cannot convert %s with missing values to bool" % self.pretty_name)

    if isinstance(dtype, CategoricalDtype):
        return _as_categorical_type(index_ops, dtype, spark_type)

    if isinstance(spark_type, BooleanType):
        column = index_ops.spark.column
        if is_extension:
            scol = column.cast(spark_type)
        else:
            # Null and NaN entries become True; everything else is cast.
            scol = F.when(
                column.isNull() | F.isnan(column),
                SF.lit(True),
            ).otherwise(column.cast(spark_type))
        internal = index_ops._internal
        return index_ops._with_new_scol(
            scol.alias(internal.data_spark_column_names[0]),
            field=internal.data_fields[0].copy(dtype=dtype, spark_type=spark_type),
        )

    if isinstance(spark_type, StringType):
        return _as_string_type(index_ops, dtype, null_str=str(np.nan))

    return _as_other_type(index_ops, dtype, spark_type)
def __setitem__(self, key: Union[int, np.ndarray], value: Any) -> None:
    """
    See docstring in `ExtensionArray` class in `pandas/core/arrays/base.py`
    for information about this method.

    ``None`` or an empty sequence stores the null offset at ``key``; a single
    ``Span`` (or a ``SpanArray`` with a slice/boolean-mask key) is assigned
    in bulk; a same-length sequence of ``Span`` or a ``SpanArray`` with an
    array key is assigned element by element. Anything else raises
    ``ValueError``.
    """
    def store(target, begin, end):
        self._begins[target] = begin
        self._ends[target] = end

    key = check_array_indexer(self, key)
    if isinstance(value, ABCSeries) and isinstance(value.dtype, SpanDtype):
        value = value.values

    value_is_empty = value is None or (
        isinstance(value, Sequence) and len(value) == 0
    )
    if value_is_empty:
        store(key, Span.NULL_OFFSET_VALUE, Span.NULL_OFFSET_VALUE)
    elif isinstance(value, Span) or (
        isinstance(value, SpanArray)
        and (
            isinstance(key, slice)
            or (isinstance(key, np.ndarray) and is_bool_dtype(key.dtype))
        )
    ):
        store(key, value.begin, value.end)
    elif (
        isinstance(key, np.ndarray)
        and len(value) > 0
        and len(value) == len(key)
        and (
            (isinstance(value, Sequence) and isinstance(value[0], Span))
            or isinstance(value, SpanArray)
        )
    ):
        for position, span in zip(key, value):
            store(position, span.begin, span.end)
    else:
        raise ValueError(f"Attempted to set element of SpanArray with "
                         f"an object of type {type(value)}")

    # We just changed the contents of this array, so invalidate any cached
    # results computed from those contents.
    self.increment_version()
def series_dtype(s: pd.Series) -> VarType:
    """
    Computes the type of a pandas series.

    Boolean, string and categorical series are categorical. Numeric series
    are numeric when ``numeric_is_continuous`` says so and categorical
    otherwise. Everything else is unsupported.

    Parameters
    ----------
    s : pd.Series
        The series for which we wish to determine the type.

    Returns
    -------
    VarType
    """
    # Booleans are tested before the numeric check so they end up categorical.
    if is_bool_dtype(s) or is_string_dtype(s) or is_categorical_dtype(s):
        return VarType.TYPE_CAT
    if not is_numeric_dtype(s):
        return VarType.TYPE_UNSUPPORTED
    return VarType.TYPE_NUM if numeric_is_continuous(s) else VarType.TYPE_CAT
def test_is_bool_dtype(data):
    """``data`` must count as boolean and index a Series like its ndarray form."""
    assert is_bool_dtype(data)
    assert pd.core.common.is_bool_indexer(data)

    positions = pd.Series(range(len(data)))
    tm.assert_series_equal(positions[data], positions[np.asarray(data)])
def __getitem__(self, item):
    """Index the array by integer, slice, boolean mask or integer sequence.

    A tuple key may carry one leading or trailing ``Ellipsis`` around a
    single index. An integer returns the stored element; slices, masks and
    integer sequences return a new array of the same type.

    Raises
    ------
    IndexError
        If a tuple key holds more than one index, or if the key is a
        non-list-like value that is neither an integer nor a slice
        (e.g. ``"foo"`` or ``2.5``).
    """
    from pandas.api.types import is_list_like

    if isinstance(item, tuple):
        if len(item) > 1:
            if item[0] is Ellipsis:
                item = item[1:]
            elif item[-1] is Ellipsis:
                item = item[:-1]
        if len(item) > 1:
            raise IndexError("too many indices for array.")
        item = item[0]

    if isinstance(item, numbers.Integral):
        return self.data[item]

    if isinstance(item, slice):
        # For a full slice, re-wrap ``self.data`` itself (make sure we get
        # a view); otherwise wrap the sliced data.
        payload = self.data if item == slice(None) else self.data[item]
        return type(self)(payload)

    if not is_list_like(item):
        # Scalars pass through check_array_indexer unchanged and would fail
        # on ``.dtype`` below with AttributeError; reject them explicitly.
        # exception message copied from numpy
        raise IndexError(
            "only integers, slices (`:`), ellipsis (`...`), numpy.newaxis "
            "(`None`) and integer or boolean arrays are valid indices")

    item = pd.api.indexers.check_array_indexer(self, item)
    if is_bool_dtype(item.dtype):
        return self._from_sequence(
            [x for x, m in zip(self, item) if m])

    # integer positions
    return type(self)([self.data[i] for i in item])
def _infer_task(df, x, y):
    """Return the name of the task inferred from the columns ``x`` and ``y``.

    Possible results are ``"predict_itself"``, ``"predict_constant"``,
    ``"predict_id"``, ``"classification"`` and ``"regression"``. An
    ``Exception`` is raised for datetime/timedelta targets and for any dtype
    that matches none of the rules.
    """
    if x == y:
        return "predict_itself"

    target = df[y]
    n_distinct = target.value_counts().count()

    if n_distinct == 1:
        return "predict_constant"
    if n_distinct == 2:
        return "classification"

    all_distinct = n_distinct == len(target)
    if all_distinct and (is_string_dtype(target) or is_categorical_dtype(target)):
        return "predict_id"

    if n_distinct <= NUMERIC_AS_CATEGORIC_BREAKPOINT and is_numeric_dtype(target):
        return "classification"

    if is_bool_dtype(target) or is_string_dtype(target) or is_categorical_dtype(target):
        return "classification"

    if is_datetime64_any_dtype(target) or is_timedelta64_dtype(target):
        raise Exception(
            f"The target column {y} has the dtype {target.dtype} which is not supported. A possible solution might be to convert {y} to a string column"
        )

    # this check needs to be after is_bool_dtype because bool is considered numeric by pandas
    if is_numeric_dtype(target):
        return "regression"

    raise Exception(
        f"Could not infer a valid task based on the target {y}. The dtype {target.dtype} is not yet supported"
    )  # pragma: no cover
def series_is_boolean(col: "pd.Series | pd.Index") -> "bool | None":
    """Decide whether ``col`` holds boolean-like data.

    Returns
    -------
    None
        If every entry is missing (this includes an empty ``col``).
    True
        If ``col`` has a boolean dtype; or it has object dtype, every unique
        value is one of True/False/None and both True and False occur; or it
        has an integer/float dtype whose non-missing unique values are
        exactly 0 and 1.
    False
        Otherwise.

    caveat: does not interpret all-zero or all-one columns as boolean
    """
    # Computed once; every branch below only needs the distinct values.
    uniques = col.unique()

    if len(uniques) == 1 and uniques[0] is None:
        # return None for all-None columns
        return None
    if col.isna().all():
        return None
    if is_bool_dtype(col):
        return True

    if is_object_dtype(col):
        for val in uniques:
            if val not in [True, False, None]:
                return False
        return False in uniques and True in uniques

    if is_integer_dtype(col) or is_float_dtype(col):
        for val in uniques:
            if pd.isna(val):
                continue
            if val not in [1, 0, None]:
                return False
        if 0 not in uniques or 1 not in uniques:
            return False
        return True

    return False
def to_data_table(data: pd.DataFrame):
    """
    Create a BOOM DataTable object from a pandas DataFrame. The categories of
    any categorical variables will be handled as strings.

    Numeric and boolean columns are added as float vectors, categorical
    columns through their codes and categories, and object columns through
    their string labels.

    Raises
    ------
    Exception
        If a column has any other dtype.
    """
    dtypes = data.dtypes
    ans = boom.DataTable()
    for i, vname in enumerate(data.columns):
        # Positional access: ``dtypes`` is indexed by column label, so a
        # plain ``dtypes[i]`` is a label lookup when the labels are integers
        # and a deprecated positional fallback otherwise.
        dt = dtypes.iloc[i]
        column = data.iloc[:, i]
        if is_numeric_dtype(dt) or is_bool_dtype(dt):
            ans.add_numeric(boom.Vector(column.values.astype("float")), vname)
        elif is_categorical_dtype(dt):
            ans.add_categorical(column.cat.codes, column.cat.categories, vname)
        elif is_object_dtype(dt):
            labels = column.astype("str")
            ans.add_categorical_from_labels(labels.values, vname)
        else:
            raise Exception(
                f"Only numeric or categorical data are supported.  "
                f"Column {i} ({data.columns[i]}) has dtype {dt}."
            )
    return ans
def update_attributes(self, column):
    """Fold the dtype, range and distinct values of ``column`` into ``self``.

    ``self.type`` is set on first use to ``'logical'``, ``'numeric'`` or
    ``'categorical'`` and must match on every later call. For numeric columns
    ``self.min``/``self.max`` are widened, and ``self.levels`` is extended
    only while the column has fewer than 20 distinct values and the merged
    list stays below 20 entries (otherwise it is reset to ``None``). For the
    other types the distinct values are always merged into ``self.levels``.

    Raises
    ------
    Exception
        If the dtype is unsupported, or differs from the type recorded by an
        earlier call.
    """
    if is_bool_dtype(column):
        col_type = 'logical'
    elif is_numeric_dtype(column):
        col_type = 'numeric'
    elif is_object_dtype(column):
        col_type = 'categorical'
    else:
        raise Exception('Column type is not supported')

    if self.type is None:
        self.type = col_type
    elif self.type != col_type:
        raise Exception(
            'The same name was used for columns with different type')

    # Distinct values are needed by every branch below; compute them once.
    distinct = column.unique()

    if self.type == 'numeric':
        self.min = min(column.min(),
                       math.inf if self.min is None else self.min)
        self.max = max(column.max(),
                       -math.inf if self.max is None else self.max)
        if len(distinct) < 20:
            merged = np.unique((self.levels or []) + distinct.tolist()).tolist()
            # Too many levels to be useful: drop the list.
            # NOTE(review): a later low-cardinality column starts a fresh
            # list again -- confirm that is intended.
            self.levels = merged if len(merged) < 20 else None
    else:
        self.levels = np.unique(
            (self.levels or []) + distinct.tolist()).tolist()
def contains_op(cls, series: pd.Series, state: dict) -> bool:
    """Accept categorical series that are not boolean, or string series.

    A series that is categorical and not boolean is accepted directly; any
    other series is handed to ``series_is_string``.
    """
    categorical_not_bool = (
        pdt.is_categorical_dtype(series) and not pdt.is_bool_dtype(series)
    )
    return True if categorical_not_bool else series_is_string(series)
def check_if_series_has_internal_type(series, internal_type):
    """Check if data type of series fits to the internal type of gettsim.

    Parameters
    ----------
    series : pd.Series
        Some data series.
    internal_type : TypeVar
        One of the internal gettsim types.

    Returns
    -------
    out : bool
        Return check variable.

    Raises
    ------
    ValueError
        If ``internal_type`` is none of the known internal types.
    """
    if internal_type == FloatSeries:
        # Integer data is accepted wherever floats are expected.
        return is_float_dtype(series) or is_integer_dtype(series)
    if internal_type == BoolSeries:
        return is_bool_dtype(series)
    if internal_type == IntSeries:
        return is_integer_dtype(series)
    if internal_type == DateTimeSeries:
        return is_datetime64_any_dtype(series)
    raise ValueError(f"The internal type {internal_type} is not defined.")
def test_above_100_datatype(self):
    """Every output CSV must have a boolean ``above_100`` column."""
    for file in self.list_of_output_files:
        frame = pd.read_csv(os.path.join(self.output_path, file))
        self.assertTrue(is_bool_dtype(frame['above_100']))
def find_atoms(self, data: pd.DataFrame):
    """
    Find the numeric atoms and categorical levels to be modeled.

    For each numeric column the atoms are the (at most three) most frequent
    values that each account for more than 5% of the non-missing
    observations. For each categorical, object or boolean column the levels
    are its value counts. Column names are appended to
    ``self._numeric_colnames`` / ``self._categorical_colnames`` and the
    dtypes are stored in ``self._dtypes``.

    Returns
    -------
    tuple
        ``(atoms_dict, levels_dict)``, both keyed by column name.

    Raises
    ------
    Exception
        If a column is neither numeric nor categorical/object/boolean.
    """
    self._dtypes = data.dtypes
    atoms_dict = {}
    levels_dict = {}
    for i, vname in enumerate(data.columns):
        # Positional access: ``_dtypes`` is indexed by column label, so a
        # plain ``_dtypes[i]`` reads the wrong entry when labels are integers.
        dt = self._dtypes.iloc[i]
        if is_numeric_dtype(dt):
            # NOTE(review): pandas treats bool as numeric, so boolean columns
            # are handled here and never reach the branch below.
            variable = data.iloc[:, i]
            counts = variable.value_counts().sort_values(ascending=False)
            number_observed = counts.sum()
            frequent = counts[counts > 0.05 * number_observed]
            atoms_dict[vname] = frequent.index.tolist()[:3]
            self._numeric_colnames.append(vname)
        elif (is_categorical_dtype(dt)
              or is_object_dtype(dt)
              or is_bool_dtype(dt)):
            # TODO: put in some cardinality protections.
            levels_dict[vname] = data.iloc[:, i].value_counts()
            self._categorical_colnames.append(vname)
        else:
            raise Exception(
                "Only categorical or numeric types are supported.")

    return atoms_dict, levels_dict
def variable_type(self):
    """Classify ``self.data`` as ``'numeric'`` or ``'categorical'``.

    Non-boolean numeric data is ``'numeric'``. Everything else (bool,
    string, datetime, ...) is ``'categorical'``, and in that case
    ``self.data`` is converted to the ``category`` dtype as a side effect.
    """
    if is_bool_dtype(self.data) or not is_numeric_dtype(self.data):
        self.data = self.data.astype('category')
        return 'categorical'
    # Only int and float types reach this point.
    return 'numeric'
def contains_op(cls, series: pd.Series, state: dict) -> bool:
    """Tell whether ``series`` holds boolean data.

    An object-dtype series qualifies when every element is a member of
    ``{True, False}``; if that membership test raises, the series is
    rejected. Any other series qualifies only if it has a boolean dtype.
    """
    if pdt.is_object_dtype(series):
        try:
            # ``all()`` yields a numpy bool; hand back a plain ``bool``.
            return bool(series.isin({True, False}).all())
        except Exception:
            # Best effort: a failed membership test means "not boolean".
            # ``Exception`` (not a bare ``except``) so KeyboardInterrupt and
            # SystemExit still propagate.
            return False
    return pdt.is_bool_dtype(series)
def get_formatter(dtype):
    """Return a ``DateFormatter`` for datetime dtypes and ``None`` otherwise."""
    if types.is_datetime64_any_dtype(dtype):
        return DateFormatter(format="%Y-%m-%d %H:%M:%S.%N")
    # Every other dtype, booleans included, gets no formatter
    # (a ``BooleanFormatter()`` for booleans is not enabled).
    return None
def _dtype_represents_categories(series) -> bool:
    """Determine whether the dtype of ``series`` represents categorical values.

    True for boolean, object, string and categorical dtypes.
    """
    category_like_checks = (
        is_bool_dtype,
        is_object_dtype,
        is_string_dtype,
        is_categorical_dtype,
    )
    return any(check(series) for check in category_like_checks)
def test_search(self, mp_wfs, mp_remote_describefeaturetype, mp_remote_md,
                mp_remote_fc, mp_remote_wfs_feature, mp_dov_xml):
    """Test the search method with only the query parameter.

    Test whether the result is correct: the default columns are returned,
    own fields hold a single value, subtype fields hold at least one, and
    every column has the dtype declared for its field.

    Parameters
    ----------
    mp_wfs : pytest.fixture
        Monkeypatch the call to the remote GetCapabilities request.
    mp_remote_describefeaturetype : pytest.fixture
        Monkeypatch the call to a remote DescribeFeatureType.
    mp_remote_md : pytest.fixture
        Monkeypatch the call to get the remote metadata.
    mp_remote_fc : pytest.fixture
        Monkeypatch the call to get the remote feature catalogue.
    mp_remote_wfs_feature : pytest.fixture
        Monkeypatch the call to get WFS features.
    mp_dov_xml : pytest.fixture
        Monkeypatch the call to get the remote XML data.

    """
    df = self.get_search_object().search(
        query=self.get_valid_query_single())

    assert type(df) is DataFrame
    assert list(df) == self.get_df_default_columns()

    search_type = self.get_type()
    all_fields = search_type.get_field_names()
    own_fields = search_type.get_field_names(include_subtypes=False)
    sub_fields = [name for name in all_fields if name not in own_fields]

    assert len(df) >= 1

    for column in list(df):
        if column in own_fields:
            assert len(df[column].unique()) == 1
        elif column in sub_fields:
            assert len(df[column].unique()) >= 1

    # dtype checks of the resulting df columns
    field_defs = self.get_type().get_fields(source=('wfs', 'xml', 'custom'))
    dtype_checks = {
        'float': is_float_dtype,
        'integer': is_int64_dtype,
        'date': is_object_dtype,
        'boolean': is_bool_dtype,
    }

    for column in list(df):
        declared = field_defs[column]['type']
        if declared == 'string':
            assert (is_object_dtype(df[column])
                    or df[column].isnull().values.all())  # all Nan/None
        else:
            check = dtype_checks.get(declared)
            if check is not None:
                assert check(df[column])
def cond_ind(self, a_col_name, b_col_name, cond_col_names, alpha):
    """Run a conditional-association test and compare its p-value to ``alpha``.

    The test is chosen from the dtypes of the two columns and of the
    conditioning columns, in this order:

    * all boolean -> ``cmh``
    * all categorical or boolean -> ``chisq_3d``
    * numeric pair, categorical/boolean conditions -> ``blocked_pearson``
    * numeric pair, numeric conditions -> ``partial_corr_r``

    Parameters
    ----------
    a_col_name, b_col_name
        Names of the two columns of ``self.df`` to test.
    cond_col_names
        Names of the columns to condition on (may be empty).
    alpha
        Threshold the p-value is compared against.

    Returns
    -------
    bool-like
        ``pval < alpha``.

    Raises
    ------
    TypeError
        If the dtype combination matches none of the cases above.
    """
    a = self.df[a_col_name]
    b = self.df[b_col_name]
    conds = [self.df[z] for z in cond_col_names]

    def _is_discrete(col):
        # Categorical or boolean column.
        return is_categorical_dtype(col) or is_bool_dtype(col)

    if (is_bool_dtype(a) and is_bool_dtype(b)
            and all(is_bool_dtype(z) for z in conds)):
        stat, pval = cmh(a, b, conds)
    elif (_is_discrete(a) and _is_discrete(b)
            and all(_is_discrete(z) for z in conds)):
        stat, pval = chisq_3d(a, b, conds)
    elif (is_numeric_dtype(a) and is_numeric_dtype(b)
            and all(_is_discrete(z) for z in conds)):
        stat, pval = blocked_pearson(a, b, conds)
    elif (is_numeric_dtype(a) and is_numeric_dtype(b)
            and all(is_numeric_dtype(z) for z in conds)):
        stat, pval = partial_corr_r(a, b, conds)
    else:
        raise TypeError(
            "No conditional independence test available for dtypes "
            f"{a_col_name}={a.dtype}, {b_col_name}={b.dtype}, "
            f"conditions={[str(z.dtype) for z in conds]}"
        )

    return pval < alpha
def derive_pack_func(col):
    """Pick the pack function matching the dtype of ``col``.

    String, boolean and numeric dtypes are tried in that order (booleans
    before numerics); anything else gets ``pack_null_func``.
    """
    dispatch = (
        (pdtypes.is_string_dtype, pack_string_func),
        (pdtypes.is_bool_dtype, pack_bool_func),
        (pdtypes.is_numeric_dtype, pack_numeric_func),
    )
    for matches, packer in dispatch:
        if matches(col):
            return packer
    return pack_null_func
def is_nullable_dtype(dtype: Any) -> bool:
    """Whether dtype is a pandas nullable type.

    Recognised are the nullable integer dtypes named ``Int16``, ``Int32``
    and ``Int64`` and the nullable ``boolean`` dtype.
    """
    from pandas.api.types import is_integer_dtype, is_bool_dtype

    # dtype: pd.core.arrays.numeric.NumericDtype
    if is_integer_dtype(dtype) and dtype.name in {"Int16", "Int32", "Int64"}:
        return True
    # np.bool has alias `bool`, while pd.BooleanDtype has `boolean`.
    return is_bool_dtype(dtype) and dtype.name == "boolean"
def universal_dataset_check(self, dataset_name, object_headers=None,
                            numeric_headers=None, bool_headers=None,
                            test_func=None):
    """Run the shared integrity checks for one dataset.

    When ``do_complete_test`` is falsy only a "soft" check is made: a HEAD
    request against the dataset URL must succeed. Otherwise the dataset file
    is deleted if present, downloaded again, and its row count, column names
    and per-group column dtypes are verified; ``test_func(df)`` is run last
    if given.
    """
    dataset_info = self.dataset_dict[dataset_name]

    # "Soft" check that just makes sure the dataset download page is active
    # This runs when on a system with the CI environment var present
    # (e.g. when running a continuous integration VCS system)
    if not do_complete_test:
        download_page = requests.head(dataset_info["url"])
        self.assertTrue(download_page.ok)
        return

    # "Hard" integrity checks that take a long time.
    # These tests only run if the MATMINER_DATASET_FULL_TEST
    # environment variable is set to True

    # Get rid of dataset if it's on the disk already
    data_path = os.path.join(
        self.dataset_dir,
        dataset_name + "." + dataset_info['file_type']
    )
    if os.path.exists(data_path):
        os.remove(data_path)

    # Test that dataset can be downloaded
    load_dataset(dataset_name)
    self.assertTrue(os.path.exists(data_path))

    # Test that data is now available and has all its elements
    df = load_dataset(dataset_name, download_if_missing=False)
    self.assertEqual(len(df), dataset_info["num_entries"])

    # Test all columns are there
    self.assertEqual(sorted(df), sorted(dataset_info['columns'].keys()))

    # Test each column for appropriate type
    object_headers = [] if object_headers is None else object_headers
    numeric_headers = [] if numeric_headers is None else numeric_headers
    bool_headers = [] if bool_headers is None else bool_headers

    df = load_dataset(dataset_name, download_if_missing=False)
    if object_headers:
        self.assertTrue(is_object_dtype(df[object_headers].values))
    if numeric_headers:
        self.assertTrue(is_numeric_dtype(df[numeric_headers].values))
    if bool_headers:
        self.assertTrue(is_bool_dtype(df[bool_headers].values))

    # Make sure all columns are accounted for
    column_headers = object_headers + numeric_headers + bool_headers
    self.assertEqual(sorted(df), sorted(column_headers))

    # Run tests unique to the dataset
    if test_func is not None:
        test_func(df)