def test_cload_field():
    runner = CliRunner()
    with runner.isolated_filesystem():
        extra_args = ['--field', 'score=7']
        result = _run_cload_pairs(runner, 2, extra_args)
        assert result.exit_code == 0
        pixels = cooler.Cooler('toy.2.cool').pixels()[:]
        assert 'count' in pixels.columns and types.is_integer_dtype(
            pixels.dtypes['count'])
        assert 'score' in pixels.columns and types.is_float_dtype(
            pixels.dtypes['score'])

        extra_args = ['--field', 'count=7']
        result = _run_cload_pairs(runner, 2, extra_args)
        assert result.exit_code == 0
        pixels = cooler.Cooler('toy.2.cool').pixels()[:]
        assert 'count' in pixels.columns and types.is_integer_dtype(
            pixels.dtypes['count'])
        assert np.allclose(pixels['count'][:], 0)

        extra_args = ['--field', 'count=7:dtype=float']
        result = _run_cload_pairs(runner, 2, extra_args)
        assert result.exit_code == 0
        pixels = cooler.Cooler('toy.2.cool').pixels()[:]
        assert 'count' in pixels.columns and types.is_float_dtype(
            pixels.dtypes['count'])
        assert np.allclose(pixels['count'][:], 0.2)

        extra_args = ['--field', 'count=7:agg=min,dtype=float']
        result = _run_cload_pairs(runner, 2, extra_args)
        assert result.exit_code == 0
        pixels = cooler.Cooler('toy.2.cool').pixels()[:]
        assert 'count' in pixels.columns and types.is_float_dtype(
            pixels.dtypes['count'])
        assert np.allclose(pixels['count'][:], 0.1)

def test_dataframe():
    d2 = DirectAccessV2(
        api_key=DIRECTACCESS_API_KEY,
        client_id=DIRECTACCESS_CLIENT_ID,
        client_secret=DIRECTACCESS_CLIENT_SECRET,
        access_token=DIRECTACCESS_TOKEN,
    )
    df = d2.to_dataframe("rigs", pagesize=10000, deleteddate="null")

    # Check index is set to API endpoint "primary key"
    assert df.index.name == "RigID"

    # Check datetime64 dtypes
    assert is_datetime64_ns_dtype(df.CreatedDate)
    assert is_datetime64_ns_dtype(df.DeletedDate)
    assert is_datetime64_ns_dtype(df.SpudDate)
    assert is_datetime64_ns_dtype(df.UpdatedDate)

    # Check Int64 dtypes
    assert is_int64_dtype(df.PermitDepth)
    assert is_int64_dtype(df.FormationDepth)

    # Check float dtypes
    assert is_float_dtype(df.RigLatitudeWGS84)
    assert is_float_dtype(df.RigLongitudeWGS84)

def test_cload_field():
    runner = CliRunner()
    with runner.isolated_filesystem():
        extra_args = ["--field", "score=8"]
        result = _run_cload_pairs(runner, 2, extra_args)
        assert result.exit_code == 0
        pixels = cooler.Cooler("toy.2.cool").pixels()[:]
        assert "count" in pixels.columns and types.is_integer_dtype(
            pixels.dtypes["count"])
        assert "score" in pixels.columns and types.is_float_dtype(
            pixels.dtypes["score"])

        extra_args = ["--field", "count=8"]
        result = _run_cload_pairs(runner, 2, extra_args)
        assert result.exit_code == 0
        pixels = cooler.Cooler("toy.2.cool").pixels()[:]
        assert "count" in pixels.columns and types.is_integer_dtype(
            pixels.dtypes["count"])
        assert np.allclose(pixels["count"][:], 0)

        extra_args = ["--field", "count=8:dtype=float"]
        result = _run_cload_pairs(runner, 2, extra_args)
        assert result.exit_code == 0
        pixels = cooler.Cooler("toy.2.cool").pixels()[:]
        assert "count" in pixels.columns and types.is_float_dtype(
            pixels.dtypes["count"])
        assert np.allclose(pixels["count"][:], 0.2)

        extra_args = ["--field", "count=8:agg=min,dtype=float"]
        result = _run_cload_pairs(runner, 2, extra_args)
        assert result.exit_code == 0
        pixels = cooler.Cooler("toy.2.cool").pixels()[:]
        assert "count" in pixels.columns and types.is_float_dtype(
            pixels.dtypes["count"])
        assert np.allclose(pixels["count"][:], 0.1)

def _validate_fit_data(self):
    """Verifies that T, X, and y are formatted the right way."""
    # Checks for T column
    if not is_float_dtype(self.T):
        raise TypeError("Treatment data must be of type float")

    # Make sure all X columns are float or int
    if isinstance(self.X, pd.Series):
        if not is_numeric_dtype(self.X):
            raise TypeError(
                "All covariate (X) columns must be int or float type "
                "(i.e. must be numeric)")
    elif isinstance(self.X, pd.DataFrame):
        for column in self.X:
            if not is_numeric_dtype(self.X[column]):
                raise TypeError(
                    "All covariate (X) columns must be int or float type "
                    "(i.e. must be numeric)")

    # Checks for y column
    if not (is_float_dtype(self.y) or is_integer_dtype(self.y)):
        raise TypeError("Outcome data must be of type float or integer")

    if is_integer_dtype(self.y) and (not np.array_equal(
            np.sort(self.y.unique()), np.array([0, 1]))):
        raise TypeError(
            "If your outcome data is of type integer (binary outcome), "
            "it should only contain 1's and 0's.")

def coerce_dtypes(df, dtypes):
    """Coerce dataframe to dtypes safely

    Operates in place

    Parameters
    ----------
    df: Pandas DataFrame
    dtypes: dict like {'x': float}
    """
    for c in df.columns:
        if c in dtypes and df.dtypes[c] != dtypes[c]:
            if is_float_dtype(df.dtypes[c]) and is_integer_dtype(dtypes[c]):
                # There is a mismatch between floating and integer columns.
                # Determine all mismatched and error.
                mismatched = sorted(c for c in df.columns
                                    if is_float_dtype(df.dtypes[c])
                                    and is_integer_dtype(dtypes[c]))
                msg = ("Mismatched dtypes found.\n"
                       "Expected integers, but found floats for columns:\n"
                       "%s\n\n"
                       "To fix, specify dtypes manually by adding:\n\n"
                       "%s\n\n"
                       "to the call to `read_csv`/`read_table`.\n\n"
                       "Alternatively, provide `assume_missing=True` to "
                       "interpret all unspecified integer columns as floats.")
                missing_list = '\n'.join('- %r' % c for c in mismatched)
                dtype_list = ('%r: float' % c for c in mismatched)
                missing_dict = 'dtype={%s}' % ',\n '.join(dtype_list)
                raise ValueError(msg % (missing_list, missing_dict))
            df[c] = df[c].astype(dtypes[c])

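# A minimal usage sketch for the coerce_dtypes variant above (not part of the
# original source): requesting an integer dtype for a float column triggers
# the ValueError with the suggested fixes.
import pandas as pd
from pandas.api.types import is_float_dtype, is_integer_dtype

_df = pd.DataFrame({'x': [1.0, 2.0], 'y': ['a', 'b']})
try:
    coerce_dtypes(_df, {'x': 'int64'})
except ValueError as err:
    # Lists column 'x' and suggests dtype={'x': float} or assume_missing=True
    print(err)
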
def test_data_characters_types():
    from pandas.api.types import is_object_dtype
    from pandas.api.types import is_float_dtype
    las = lasio.read(egfn('data_characters.las'))
    assert is_object_dtype(las.df().index.dtype)
    assert is_object_dtype(las.df()['DATE'].dtype)
    assert is_float_dtype(las.df()['DEPT'].dtype)
    assert is_float_dtype(las.df()['ARC_GR_UNC_RT'].dtype)

def _validate_fit_data(self):
    """Verifies that T, M, and y are formatted the right way."""
    # Checks for T column
    if not is_float_dtype(self.T):
        raise TypeError("Treatment data must be of type float")

    # Checks for M column
    if not is_float_dtype(self.M):
        raise TypeError("Mediation data must be of type float")

    # Checks for y column
    if not is_float_dtype(self.y):
        raise TypeError("Outcome data must be of type float")

def guess_natsort_alg(cls, dtype: Type[Any]) -> NatsortFlagsAndValue:
    """
    Guesses a good natsorted flag for the dtype.

    Here are some specifics:
        - integers ⇒ INT and SIGNED
        - floating-point ⇒ FLOAT and SIGNED
        - strings ⇒ COMPATIBILITYNORMALIZE and GROUPLETTERS
        - datetime ⇒ GROUPLETTERS (only affects 'Z' vs. 'z'; shouldn't matter)

    Args:
        dtype: Probably from ``pd.Series.dtype``

    Returns:
        A tuple of (set of flags, int) -- see :meth:`exact_natsort_alg`
    """
    st, x = set(), 0
    if is_string_dtype(dtype):
        st.update(["COMPATIBILITYNORMALIZE", "GROUPLETTERS"])
        x |= ns_enum.COMPATIBILITYNORMALIZE | ns_enum.GROUPLETTERS
    elif is_categorical_dtype(dtype):
        pass
    elif is_integer_dtype(dtype) or is_bool_dtype(dtype):
        st.update(["INT", "SIGNED"])
        x |= ns_enum.INT | ns_enum.SIGNED
    elif is_float_dtype(dtype):
        st.update(["FLOAT", "SIGNED"])
        x |= ns_enum.FLOAT | ns_enum.SIGNED  # same as ns_enum.REAL
    return NatsortFlagsAndValue(st, x)

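# A hedged, self-contained sketch (not in the original source) of what
# guess_natsort_alg above computes for a float dtype, using natsort's public
# `ns` enum directly instead of the excerpt's ns_enum alias.
import pandas as pd
from natsort import natsorted, ns
from pandas.api.types import is_float_dtype

_dtype = pd.Series([10.2, 2.1]).dtype
assert is_float_dtype(_dtype)
_alg = ns.FLOAT | ns.SIGNED  # the flags the guesser picks for floats (== ns.REAL)
print(natsorted(['a10.2', 'a2.1'], alg=_alg))  # ['a2.1', 'a10.2']
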
from typing import Union

def series_is_boolean(col: Union[pd.Series, pd.Index]):
    """
    returns: None if column is all None; True if a pd.Series contains
        True, False, and None; False otherwise
    caveat: does not interpret all-zero or all-one columns as boolean
    """
    if len(col.unique()) == 1 and col.unique()[0] is None:
        # return None for all-None columns
        return None
    elif col.isna().all():
        return None
    elif is_bool_dtype(col):
        return True
    elif is_object_dtype(col):
        for val in col.unique():
            if val not in [True, False, None]:
                return False
        return False in col.unique() and True in col.unique()
    elif is_integer_dtype(col) or is_float_dtype(col):
        for val in col.unique():
            if pd.isna(val):
                continue
            if val not in [1, 0, None]:
                return False
        if 0 not in col.unique() or 1 not in col.unique():
            return False
        return True
    return False

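# A minimal usage sketch for series_is_boolean above (not in the original
# source). The dtype predicates the function relies on are imported here the
# way its home module presumably does.
import pandas as pd
from pandas.api.types import (is_bool_dtype, is_float_dtype,
                              is_integer_dtype, is_object_dtype)

print(series_is_boolean(pd.Series([True, False, None])))  # True
print(series_is_boolean(pd.Series([1, 0, 1])))            # True (0/1 ints)
print(series_is_boolean(pd.Series([1, 2, 3])))            # False
print(series_is_boolean(pd.Series([None, None])))         # None
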
def metric():
    try:
        file = request.files["file"]
        # Note: the original referenced an undefined `f`; the extension
        # comes from the uploaded file's own filename.
        extension = file.filename.split(".")[-1]
        if not file or extension not in app.config["ALLOWED_FILE_EXTENSIONS"]:
            return 'Invalid'

        data = io.BytesIO()
        file.save(data)
        data = data.getvalue().decode('utf-8')
        data = io.StringIO(data)
        df_pred = pd.read_csv(data)

        for col in ['name', *FIELDS]:
            if col not in df_pred.columns:
                return 'Invalid: Lack of column {}'.format(col)
        if not (df_pred.name == df_true.name).all():
            return 'Invalid: names are not correct'
        if not all(is_float_dtype(df_pred[field]) for field in FIELDS):
            return 'Invalid: Expect data to be float'

        y_pred = df_pred[FIELDS].values.reshape(-1, 4, 2)
        rmse = np.sqrt(((y_true - y_pred) ** 2).sum(axis=2).mean()).item()
        print(rmse)
        return jsonify({
            'rmse': round(rmse, 3),
        })
    except Exception:
        return jsonify({
            "error_msg": traceback.format_exc()
        })

def test_cload_field(bins_path, pairs_path):
    kwargs = dict(
        metadata=None,
        assembly="toy",
        chunksize=10,
        zero_based=False,
        comment_char="#",
        input_copy_status="unique",
        no_symmetric_upper=False,
        temp_dir=None,
        no_delete_temp=False,
        storage_options=None,
        no_count=True,
        max_merge=200,
        chrom1=2,
        pos1=3,
        chrom2=4,
        pos2=5,
    )
    cload_pairs.callback(
        bins_path, pairs_path, testcool_path,
        field=("score=8:dtype=float", ),
        **kwargs
    )
    pixels = cooler.Cooler(testcool_path).pixels()[:]
    assert "count" in pixels.columns and types.is_integer_dtype(
        pixels.dtypes["count"])
    assert "score" in pixels.columns and types.is_float_dtype(
        pixels.dtypes["score"])

def prepare_data(
    dataset_df: pd.DataFrame,
    drop_na: bool = False,
    mean_int: bool = True,
    mean_float: bool = True,
    rescale_float: bool = True,
    standardize_float: bool = True,
) -> None:
    """Fill missing values and standardize float columns.

    :author: Robin Courant
    :param dataset_df: dataset to process.
    :param drop_na: whether to drop every row with at least one `NaN` cell.
    :param mean_int: whether to use the mean or the median for missing integers.
    :param mean_float: whether to use the mean or the median for missing floats.
    :param rescale_float: whether to rescale floats (standardize or normalize).
    :param standardize_float: whether to apply standardization or normalization.
    """
    if drop_na:
        # `dropna()` returns a copy by default; `inplace=True` is needed so
        # the mutation is visible to the caller (the function returns None).
        dataset_df.dropna(inplace=True)
        return

    for column_name, column_series in dataset_df.iteritems():
        if is_integer_dtype(column_series):
            if set(column_series.unique()) == {0, 1}:
                dataset_df[column_name] = _prepare_bool(column_series)
            else:
                dataset_df[column_name] = _prepare_int(column_series, mean_int)
        elif is_float_dtype(column_series):
            dataset_df[column_name] = _prepare_float(
                column_series, mean_float, rescale_float, standardize_float)
        # Raise an error if the column's type is not boolean, integer or float
        else:
            raise TypeError(f"Unrecognized type, column: {column_name}")

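# A minimal illustration (not in the original source) of why the in-place
# dropna matters in prepare_data above: the function mutates its argument
# rather than returning a new frame.
import pandas as pd

_frame = pd.DataFrame({"a": [1.0, None]})
_frame.dropna()               # returns a copy; _frame is unchanged
assert len(_frame) == 2
_frame.dropna(inplace=True)   # mutates _frame in place
assert len(_frame) == 1
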
def test_time_formater_is_float(self):
    """Test that the expected time format is created on a given dataframe
    as a float variable."""
    result1 = time_formater(self.df_gr)
    result_1 = self.df_gr_time
    self.assertIsInstance(result1, pd.DataFrame)
    self.assertTrue(ptypes.is_float_dtype(result1["time_hours"]))
    # assertEqual cannot compare DataFrames (their truth value is ambiguous);
    # use pandas' dedicated test helper instead.
    pd.testing.assert_frame_equal(result1, result_1)

def check_if_series_has_internal_type(series, internal_type):
    """Check if the data type of a series fits the internal type of gettsim.

    Parameters
    ----------
    series : pd.Series
        Some data series.
    internal_type : TypeVar
        One of the internal gettsim types.

    Returns
    -------
    out : bool
        Result of the check.

    """
    if internal_type == FloatSeries:
        out = is_float_dtype(series) or is_integer_dtype(series)
    elif internal_type == BoolSeries:
        out = is_bool_dtype(series)
    elif internal_type == IntSeries:
        out = is_integer_dtype(series)
    elif internal_type == DateTimeSeries:
        out = is_datetime64_any_dtype(series)
    else:
        raise ValueError(f"The internal type {internal_type} is not defined.")
    return out

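# A hedged usage sketch (not in the original source) for
# check_if_series_has_internal_type above. It assumes gettsim's internal type
# markers (FloatSeries, IntSeries, BoolSeries) and the pandas dtype predicates
# are bound as in the original module.
import pandas as pd
from pandas.api.types import (is_bool_dtype, is_datetime64_any_dtype,
                              is_float_dtype, is_integer_dtype)

assert check_if_series_has_internal_type(pd.Series([1.0, 2.0]), FloatSeries)
assert check_if_series_has_internal_type(pd.Series([1, 2]), IntSeries)
assert not check_if_series_has_internal_type(pd.Series(["a"]), BoolSeries)
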
def __check_data(self):
    """Check input data type and frequency.

    Also checks the data prerequisites defined in the models.
    """
    if not (ptypes.is_datetime64_any_dtype(self.data.index)
            or ptypes.is_period_dtype(self.data.index)):
        raise TypeError(
            'Input data index should be a datetime or period object. '
            'Received: {} instead.'.format(self.data.index.dtype))

    if ptypes.is_period_dtype(self.data.index):
        self.data.index = self.data.index.to_timestamp()

    for prerequisite in self.Model.input_requirements:
        if prerequisite.name not in self.data:
            raise ValueError(
                'Input data should contain "{}" data! Keyword "{}" not found.'
                .format(prerequisite.name, prerequisite.name))
        elif not ptypes.is_float_dtype(self.data[prerequisite.name]):
            raise ValueError(
                'Input data "{}" should be float! Currently: {}'.format(
                    prerequisite.name,
                    self.data[prerequisite.name].dtype))
        self.__check_for_na_in_inputs(prerequisite.name)
        if prerequisite.positive:
            self.__check_for_negative_values_in_inputs(prerequisite.name)

def _default_transformer(col, train_df):
    if is_integer_dtype(train_df[col]):
        return int
    if is_float_dtype(train_df[col]):
        return float
    if is_string_dtype(train_df[col]):
        return LabelEncoder

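# A minimal usage sketch (not in the original source) for _default_transformer
# above. LabelEncoder is assumed to come from scikit-learn, which the snippet's
# return value suggests.
import pandas as pd
from pandas.api.types import is_float_dtype, is_integer_dtype, is_string_dtype
from sklearn.preprocessing import LabelEncoder

_train = pd.DataFrame({"age": [30, 40], "score": [0.5, 0.7], "city": ["a", "b"]})
print(_default_transformer("age", _train))    # <class 'int'>
print(_default_transformer("score", _train))  # <class 'float'>
print(_default_transformer("city", _train))   # LabelEncoder class
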
def load_metadata(self):
    self._ensure_loaded()
    dtypes = self.dataset._dtypes
    to_feature_type = lambda dt: (
        'int' if pat.is_integer_dtype(dt)
        else 'float' if pat.is_float_dtype(dt)
        else 'number' if pat.is_numeric_dtype(dt)
        else 'category' if pat.is_categorical_dtype(dt)
        else 'string' if pat.is_string_dtype(dt)
        # else 'datetime' if pat.is_datetime64_dtype(dt)
        else 'object')
    features = [
        Feature(i, col, to_feature_type(dtypes[i]))
        for i, col in enumerate(self._ds.columns)
    ]
    for f in features:
        col = self._ds.iloc[:, f.index]
        f.has_missing_values = col.hasnans
        if f.is_categorical():
            f.values = sorted(self._ds.dtypes[f.name].categories.values)
    target = self._find_target_feature(features)
    self._set_feature_as_target(target)
    meta = dict(features=features, target=target)
    log.debug("Metadata for dataset %s: %s", self.path, meta)
    return meta

def _check_op(self, s, op_name, other, exc=None):
    op = self.get_op_from_name(op_name)
    result = op(s, other)

    # compute expected
    mask = s.isna()

    # other array is an Integer
    if isinstance(other, IntegerArray):
        omask = getattr(other, 'mask', None)
        mask = getattr(other, 'data', other)
        if omask is not None:
            mask |= omask

    # float result type or float op
    if (is_float_dtype(other) or is_float(other)
            or op_name in ['__rtruediv__', '__truediv__',
                           '__rdiv__', '__div__']):
        rs = s.astype('float')
        expected = op(rs, other)
        self._check_op_float(result, expected, mask, s, op_name, other)

    # integer result type
    else:
        rs = pd.Series(s.values._data)
        expected = op(rs, other)
        self._check_op_integer(result, expected, mask, s, op_name, other)

def categorical_func(series):
    natural_language_threshold = ww.config.get_option(
        'natural_language_threshold')
    numeric_categorical_threshold = ww.config.get_option(
        'numeric_categorical_threshold')

    if pdtypes.is_string_dtype(series.dtype) and not col_is_datetime(series):
        # heuristics to predict whether this is something other than categorical
        sample = series.sample(min(10000, len(series)))

        # catch cases where object dtype cannot be interpreted as a string
        try:
            avg_length = sample.str.len().mean()
            if avg_length > natural_language_threshold:
                return False
        except AttributeError:
            pass
        return True

    if pdtypes.is_categorical_dtype(series.dtype):
        return True

    if ((pdtypes.is_float_dtype(series.dtype)
         or pdtypes.is_integer_dtype(series.dtype))
            and _is_numeric_categorical(series, numeric_categorical_threshold)):
        return True

    return False

def test_cload_field(bins_path, pairs_path):
    kwargs = dict(
        metadata=None,
        assembly='toy',
        chunksize=10,
        zero_based=False,
        comment_char='#',
        input_copy_status='unique',
        no_symmetric_upper=False,
        temp_dir=None,
        no_delete_temp=False,
        storage_options=None,
        no_count=True,
        max_merge=200,
        chrom1=1,
        pos1=2,
        chrom2=3,
        pos2=4,
    )
    cload_pairs.callback(
        bins_path, pairs_path, testcool_path,
        field=('score=7:dtype=float', ),
        **kwargs
    )
    pixels = cooler.Cooler(testcool_path).pixels()[:]
    assert 'count' in pixels.columns and types.is_integer_dtype(
        pixels.dtypes['count'])
    assert 'score' in pixels.columns and types.is_float_dtype(
        pixels.dtypes['score'])

def double_func(series):
    numeric_categorical_threshold = ww.config.get_option(
        'numeric_categorical_threshold')
    if (pdtypes.is_float_dtype(series.dtype)
            and not _is_numeric_categorical(series,
                                            numeric_categorical_threshold)):
        return True
    return False

def contains_op(self, series):
    if not pdt.is_float_dtype(series):
        return False
    elif series in tenzing_integer:
        return False
    else:
        return True

def dispatch(col):
    dfcol = df[col.factor]
    if type(col) == IndicatorCol:
        assert is_categorical_dtype(dfcol)
        return (dfcol == col.level).to_numpy()
    elif type(col) == NumericCol:
        assert is_float_dtype(dfcol) or is_integer_dtype(dfcol)
        return dfcol.to_numpy()
    elif type(col) == CustomCol:
        assert col.factor in contrasts
        mat = contrasts[col.factor]
        levels = metadata.column(col.factor).levels
        # TODO: This can be triggered in normal use, so turn it into a
        # friendly error. It probably makes sense to check for this
        # earlier. This could possibly happen in `defm` after creating
        # the metadata, though this would also require a separate check
        # for SequentialOED. Is there anywhere sensible to put this that
        # catches both?
        assert len(levels) == mat.shape[0]
        assert col.index < mat.shape[1]
        # TODO: Better asymptotics than using `.index()`
        out = mat[[levels.index(val) for val in dfcol], col.index]
        return out
    else:
        raise Exception('unknown column type')

def categorical_bar_graph(self, params=None):
    df_parameters = self._dataframe_parameters()
    if not params:
        params = list(df_parameters)
    for item in params:
        if is_float_dtype(df_parameters[item]) or is_integer_dtype(
                df_parameters[item]):
            continue
        else:
            for i in range(len(df_parameters[item].index)):
                if (type(df_parameters[item][i]) == str
                        or type(df_parameters[item][i]) == bool):
                    continue
                else:
                    # TODO: enhancement -- an additional bar could indicate
                    # numerical values here.
                    # `drop` returns a copy, so the result must be assigned
                    # back (the original discarded it).
                    df_parameters[item] = df_parameters[item].drop(i)
        try:
            plt.figure(figsize=(20, 8))
            plt.rcParams['font.size'] = 18
            sns.countplot(x=item, data=df_parameters,
                          palette="husl").set_title(
                              '{} bar graph'.format(item))
            path = self._save_to_folder('/bar_parameters',
                                        '{}_bar_graph.pdf'.format(item))
            plt.savefig(path)
            plt.close('all')
        except Exception:
            continue

def categorical_evolution(self, params=None):
    df_parameters = self._dataframe_parameters()
    if not params:
        params = list(df_parameters)
    df_parameters['iteration'] = self.results['iteration']
    for item in params:
        if is_float_dtype(df_parameters[item]) or is_integer_dtype(
                df_parameters[item]):
            continue
        else:
            for i in range(len(df_parameters[item].index)):
                if (type(df_parameters[item][i]) == str
                        or type(df_parameters[item][i]) == bool):
                    continue
                else:
                    df_parameters[item] = df_parameters[item].drop(i)
        try:
            plt.figure(figsize=(20, 8))
            plt.rcParams['font.size'] = 18
            sns.catplot(data=df_parameters, x='iteration',
                        y=item).fig.suptitle(
                            '{} over iterations'.format(item))
            path = self._save_to_folder(
                '/category_evolution',
                '{}_category_iter_graph.pdf'.format(item))
            plt.savefig(path)
            plt.close('all')
        except Exception:
            continue

def load_external_csv_dataset(filename, split_percent, algo_type):
    try:
        filepath = os.path.join(media_root, filename)
        df = pd.read_csv(filepath)
        cols = list(df.columns)
        last = cols[-1]
    except Exception as e:
        return f'{e}', None, None, None, None, None

    if df.isnull().values.any():
        return "NAN VALUES IN DATASET", None, None, None, None, None

    # Check whether all input data is numeric or not
    for column in cols[:-1]:
        if not is_numeric_dtype(df[column]):
            return ("NON NUMERIC VALUE FOUND IN DATASET "
                    "(EXCLUDING OUTPUT COLUMN)"), None, None, None, None, None

    if is_float_dtype(df[last]) and algo_type == "Classification":
        return ("OUTPUT COLUMN DOES NOT CONTAIN CATEGORICAL VALUES",
                None, None, None, None, None)
    if not is_float_dtype(df[last]) and algo_type == "Regression":
        return ("OUTPUT COLUMN DOES NOT CONTAIN CONTINUOUS NUMERIC VALUES",
                None, None, None, None, None)

    status = "VALID DATASET"
    if split_percent < 0:
        train_output = df[last]
        train_input = df.drop([last], axis=1)
        test_input = train_input.copy()
        test_output = train_output.copy()
        return status, train_input, train_output, test_input, test_output, last

    # Randomly shuffle the data first
    np.random.seed(2)
    df = df.sample(frac=1).reset_index(drop=True)

    # Split into training data and testing data
    test_data_size = (df.shape[0] * split_percent) // 100
    r_no = np.random.randint(df.shape[0] - test_data_size)
    test_data = df[r_no:r_no + test_data_size]
    train_data = df.drop(range(r_no, r_no + test_data_size))

    # Split output and input
    train_output = train_data[last]
    test_output = test_data[last]
    train_input = train_data.drop([last], axis=1)
    test_input = test_data.drop([last], axis=1)
    return status, train_input, train_output, test_input, test_output, last

def handle_categorical(df: pd.DataFrame, columns: frozenset) -> None:
    # Materialize the intersection as an ordered list: pandas does not accept
    # a set as an indexer, and a stable order keeps the zip below aligned.
    cols = [c for c in df.columns if c in columns]
    cols = [
        col for col, dtype in zip(cols, df[cols].dtypes)
        if not is_float_dtype(dtype)
    ]
    if cols:
        df[cols] = df[cols].astype("category")

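# A minimal usage sketch (not in the original source) for handle_categorical
# above: non-float columns in the given set become categorical in place.
import pandas as pd
from pandas.api.types import is_float_dtype

_df = pd.DataFrame({"sex": ["m", "f"], "height": [1.8, 1.7], "n": [1, 2]})
handle_categorical(_df, frozenset({"sex", "height", "n"}))
print(_df.dtypes)  # sex and n become category; height stays float64
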
def save_as_spss(data_frame: pandas.DataFrame, out_path: str,
                 labels: dict = None, find=None, repl=None) -> None:
    """
    caastools.utils.save_as_spss(data_frame: pandas.DataFrame, out_path: str) -> None
    saves data_frame as an SPSS dataset at out_path
    :param data_frame: the pandas DataFrame to save
    :param out_path: the path at which to save the file
    :param labels: a dictionary mapping column labels in the data frame to a
        variable label in the SPSS dataset
    :param find: a sequence of characters within variable names to be replaced
        with other values. Default None
    :param repl: a sequence of characters with which to replace corresponding
        entries in find, or a function which yields their replacements.
        Default None
    :return: None
    :raise ValueError: if either find/repl is None and the other is not
    :raise ValueError: if find and repl are sequences of unequal length
    """
    cols = data_frame.columns  # type: pandas.Index
    is_multi_index = isinstance(cols, pandas.MultiIndex)
    var_names = []
    var_types = {}
    var_formats = {}
    var_labels = {} if labels is None else labels

    # Construct the various information that the SPSS dictionary will contain
    # about each variable
    for col in cols:
        var_name = sanitize_for_spss(
            ".".join(str(i) for i in col) if is_multi_index else str(col),
            find=find, repl=repl)
        var_names.append(var_name)

        # Need to know the data type and format of each column so that the
        # SPSS file can be written properly. 0 is a numeric type, any
        # positive integer is a string type where the number represents the
        # number of bytes the string can hold.
        if pandas.api.types.is_string_dtype(data_frame[col]):
            lens = list(
                filter(lambda x: pandas.notna(x) and x is not None,
                       set(data_frame[col].str.len())))
            var_types[var_name] = int(max(lens)) * 2 if len(lens) > 0 else 255
        else:
            var_types[var_name] = 0

        var_formats[var_name] = "F10.2" if ptypes.is_float_dtype(data_frame[col].dtype) else \
            "ADATE8" if ptypes.is_datetime64_any_dtype(data_frame[col]) else \
            "F12.0"

    # Sometimes savReaderWriter has trouble writing a whole dataframe in at
    # once, so writing row by row seems to work without issue
    with SavWriter(out_path, var_names, var_types, formats=var_formats,
                   varLabels=var_labels, ioUtf8=True) as writer:
        for row in data_frame.index:
            writer.writerow(data_frame.loc[row, :].values)

def test_search(self, mp_wfs, mp_remote_describefeaturetype, mp_remote_md,
                mp_remote_fc, mp_remote_wfs_feature, mp_dov_xml):
    """Test the search method with only the query parameter.

    Test whether the result is correct.

    Parameters
    ----------
    mp_wfs : pytest.fixture
        Monkeypatch the call to the remote GetCapabilities request.
    mp_remote_describefeaturetype : pytest.fixture
        Monkeypatch the call to a remote DescribeFeatureType.
    mp_remote_md : pytest.fixture
        Monkeypatch the call to get the remote metadata.
    mp_remote_fc : pytest.fixture
        Monkeypatch the call to get the remote feature catalogue.
    mp_remote_wfs_feature : pytest.fixture
        Monkeypatch the call to get WFS features.
    mp_dov_xml : pytest.fixture
        Monkeypatch the call to get the remote XML data.
    """
    df = self.get_search_object().search(
        query=self.get_valid_query_single())

    assert type(df) is DataFrame
    assert list(df) == self.get_df_default_columns()

    datatype = self.get_type()
    allfields = datatype.get_field_names()
    ownfields = datatype.get_field_names(include_subtypes=False)
    subfields = [f for f in allfields if f not in ownfields]

    assert len(df) >= 1

    for field in list(df):
        if field in ownfields:
            assert len(df[field].unique()) == 1
        elif field in subfields:
            assert len(df[field].unique()) >= 1

    # dtype checks of the resulting df columns
    fields = self.get_type().get_fields(source=('wfs', 'xml', 'custom'))

    for field in list(df):
        datatype = fields[field]['type']
        if datatype == 'string':
            assert (is_object_dtype(df[field])
                    or df[field].isnull().values.all())  # all NaN/None
        elif datatype == 'float':
            assert is_float_dtype(df[field])
        elif datatype == 'integer':
            assert is_int64_dtype(df[field])
        elif datatype == 'date':
            assert is_object_dtype(df[field])
        elif datatype == 'boolean':
            assert is_bool_dtype(df[field])

def _check_op(self, s, op_name, other, exc=None):
    op = self.get_op_from_name(op_name)
    result = op(s, other)

    # compute expected
    mask = s.isna()

    # if s is a DataFrame, squeeze to a Series
    # for comparison
    if isinstance(s, pd.DataFrame):
        result = result.squeeze()
        s = s.squeeze()
        mask = mask.squeeze()

    # other array is an Integer
    if isinstance(other, IntegerArray):
        omask = getattr(other, 'mask', None)
        mask = getattr(other, 'data', other)
        if omask is not None:
            mask |= omask

    # 1 ** na is na, so need to unmask those
    if op_name == '__pow__':
        mask = np.where(s == 1, False, mask)
    elif op_name == '__rpow__':
        mask = np.where(other == 1, False, mask)

    # float result type or float op
    if (is_float_dtype(other) or is_float(other)
            or op_name in ['__rtruediv__', '__truediv__',
                           '__rdiv__', '__div__']):
        rs = s.astype('float')
        expected = op(rs, other)
        self._check_op_float(result, expected, mask, s, op_name, other)

    # integer result type
    else:
        rs = pd.Series(s.values._data)
        expected = op(rs, other)
        self._check_op_integer(result, expected, mask, s, op_name, other)

def coerce_dtypes(df, dtypes):
    """Coerce dataframe to dtypes safely

    Operates in place

    Parameters
    ----------
    df: Pandas DataFrame
    dtypes: dict like {'x': float}
    """
    bad_dtypes = []
    bad_dates = []
    errors = []
    for c in df.columns:
        if c in dtypes and df.dtypes[c] != dtypes[c]:
            actual = df.dtypes[c]
            desired = dtypes[c]
            if is_float_dtype(actual) and is_integer_dtype(desired):
                bad_dtypes.append((c, actual, desired))
            elif is_object_dtype(actual) and is_datetime64_any_dtype(desired):
                # This can only occur when parse_dates is specified, but an
                # invalid date is encountered. Pandas then silently falls back
                # to object dtype. Since `object_array.astype(datetime)` will
                # silently overflow, error here and report.
                bad_dates.append(c)
            else:
                try:
                    df[c] = df[c].astype(dtypes[c])
                except Exception as e:
                    bad_dtypes.append((c, actual, desired))
                    errors.append((c, e))

    if bad_dtypes:
        if errors:
            ex = '\n'.join("- %s\n  %r" % (c, e) for c, e in
                           sorted(errors, key=lambda x: str(x[0])))
            exceptions = ("The following columns also raised exceptions on "
                          "conversion:\n\n%s\n\n") % ex
            extra = ""
        else:
            exceptions = ""
            # All mismatches are int->float, also suggest `assume_missing=True`
            extra = ("\n\nAlternatively, provide `assume_missing=True` "
                     "to interpret\n"
                     "all unspecified integer columns as floats.")
        bad_dtypes = sorted(bad_dtypes, key=lambda x: str(x[0]))
        table = asciitable(['Column', 'Found', 'Expected'], bad_dtypes)
        dtype_kw = ('dtype={%s}' % ',\n'
                    '       '.join("%r: '%s'" % (k, v)
                                   for (k, v, _) in bad_dtypes))
        dtype_msg = (
            "{table}\n\n"
            "{exceptions}"
            "Usually this is due to dask's dtype inference failing, and\n"
            "*may* be fixed by specifying dtypes manually by adding:\n\n"
            "{dtype_kw}\n\n"
            "to the call to `read_csv`/`read_table`."
            "{extra}").format(table=table, exceptions=exceptions,
                              dtype_kw=dtype_kw, extra=extra)
    else:
        dtype_msg = None

    if bad_dates:
        also = " also " if bad_dtypes else " "
        cols = '\n'.join("- %s" % c for c in bad_dates)
        date_msg = (
            "The following columns{also}failed to properly parse as dates:\n\n"
            "{cols}\n\n"
            "This is usually due to an invalid value in that column. To\n"
            "diagnose and fix it's recommended to drop these columns from the\n"
            "`parse_dates` keyword, and manually convert them to dates later\n"
            "using `dd.to_datetime`.").format(also=also, cols=cols)
    else:
        date_msg = None

    if bad_dtypes or bad_dates:
        rule = "\n\n%s\n\n" % ('-' * 61)
        msg = ("Mismatched dtypes found in `pd.read_csv`/`pd.read_table`.\n\n"
               "%s" % (rule.join(filter(None, [dtype_msg, date_msg]))))
        raise ValueError(msg)