def format(blob, state: str):
    """
    Format the fields of interest of a raw sites frame.

    :param blob: A DataFrame of raw sites data (presumably a dask DataFrame,
                 since dd.to_numeric is applied to its columns)
    :param state: The 2 digit state code, in string form.
    :return: The frame that was passed in, with 'TRIFID' trimmed, 'LATITUDE' &
             'LONGITUDE' coerced to numeric, and 'STATEGEOID' set to `state`
    """

    # 'TRIFID': trim surrounding white space first, then turn the cells that
    # ended up empty into np.nan
    trimmed = blob['TRIFID'].str.strip()
    blob['TRIFID'] = trimmed.replace(to_replace='', value=np.nan)

    # Coordinates: values that cannot be parsed as numbers become NaN
    for coordinate in ('LATITUDE', 'LONGITUDE'):
        blob[coordinate] = dd.to_numeric(arg=blob[coordinate], errors='coerce')

    # Every record carries the same state code
    blob['STATEGEOID'] = state

    return blob
def test_to_numeric_on_dask_dataframe_series_with_meta():
    """to_numeric on a dask Series, given explicit ``meta``, yields a float64 dask Series."""
    raw = pd.Series(["1.0", "2", -3, -5.1])
    meta = pd.Series([], dtype="float64")

    result = to_numeric(from_pandas(raw, npartitions=2), meta=meta)

    assert result.dtype == "float64"
    assert isinstance(result, Series)
    # The computed values must agree with what pandas produces for the same input
    assert list(result.compute()) == list(pd.to_numeric(raw))
def format(self, blob: dd.DataFrame):
    """
    Clean the string fields and convert 'ZIP_CODE' to a numeric column.

    :param blob: The frame to format; it is copied, the copy is what gets edited
    :return: The formatted copy
    """
    frame = blob.copy()

    # String fields: trim surrounding white space, missing values become ''
    for name in self.getstringfields():
        stripped = frame[name].str.strip()
        frame[name] = stripped.fillna(value='')

    # 'ZIP_CODE': drop every non-digit character, substitute '0' for codes that
    # are empty after that, and finally coerce to numeric
    zip_code = frame['ZIP_CODE'].replace(to_replace='[^0-9]', value='', regex=True)
    zip_code = zip_code.replace(to_replace='', value='0', regex=False)
    frame['ZIP_CODE'] = dd.to_numeric(arg=zip_code, errors='coerce')

    return frame
def convert_variable_data(df, column_id, new_type, **kwargs):
    """Convert one column of a dataframe to the dtype implied by a variable type.

    The converted column is assigned back onto ``df`` under the same name.

    Args:
        df: The dataframe holding the column. pandas and dask DataFrames are
            handled explicitly, as is the ``DataFrame`` type of the ``ks``
            module (presumably Koalas -- confirm against the file's imports).
        column_id: Name of the column to convert.
        new_type: Target variable type. ``vtypes.Numeric`` and
            ``vtypes.Boolean`` are matched by equality, ``vtypes.Datetime``
            and ``vtypes.Discrete`` by ``issubclass``.
        **kwargs: ``format`` is forwarded to ``to_datetime`` for datetime
            targets; ``true_val`` / ``false_val`` name the values that map to
            ``True`` / ``False`` for boolean targets.

    Returns:
        ``df``. It is returned untouched when it is a pandas DataFrame whose
        column is empty, or when ``new_type`` is a ``vtypes.Discrete``
        subclass that matched none of the other branches.

    Raises:
        TypeError: pandas path only -- the column had non-null values before a
            numeric conversion and none afterwards.
        Exception: ``new_type`` is not one of the supported targets.
    """
    # Only pandas frames are checked for emptiness here.
    empty = df[column_id].empty if isinstance(df, pd.DataFrame) else False
    if empty:
        return df

    if new_type == vtypes.Numeric:
        if isinstance(df, dd.DataFrame):
            df[column_id] = dd.to_numeric(df[column_id], errors='coerce')
        elif is_instance(df, ks, 'DataFrame'):
            df[column_id] = ks.to_numeric(df[column_id])
        else:
            orig_nonnull = df[column_id].dropna().shape[0]
            # This will convert strings to nans
            df[column_id] = pd.to_numeric(df[column_id], errors='coerce')
            # If column contained all strings, then we should
            # just raise an error, because that shouldn't have
            # been converted to numeric
            nonnull = df[column_id].dropna().shape[0]
            if nonnull == 0 and orig_nonnull != 0:
                raise TypeError(
                    "Attempted to convert all string column {} to numeric".format(column_id))
    elif issubclass(new_type, vtypes.Datetime):
        # Local name chosen so that the builtin ``format`` is not shadowed.
        date_format = kwargs.get("format", None)
        # TODO: if float convert to int?
        if isinstance(df, dd.DataFrame):
            df[column_id] = dd.to_datetime(df[column_id], format=date_format,
                                           infer_datetime_format=True)
        elif is_instance(df, ks, 'DataFrame'):
            df[column_id] = ks.to_datetime(df[column_id], format=date_format,
                                           infer_datetime_format=True)
        else:
            df[column_id] = pd.to_datetime(df[column_id], format=date_format,
                                           infer_datetime_format=True)
    elif new_type == vtypes.Boolean:
        map_dict = {
            kwargs.get("true_val", True): True,
            kwargs.get("false_val", False): False,
            True: True,
            False: False,
        }
        # TODO: what happens to nans?
        # Builtin ``bool`` replaces the ``np.bool`` alias, which was deprecated
        # in NumPy 1.20 and removed in 1.24; the alias pointed at ``bool``, so
        # the resulting dtype is unchanged.
        df[column_id] = df[column_id].map(map_dict).astype(bool)
    elif not issubclass(new_type, vtypes.Discrete):
        raise Exception("Cannot convert column %s to %s" %
                        (column_id, new_type))

    return df
def remove_invalid(self, df):
    """
    Drop invalid observations from a dataframe.

    Non-numeric entries in the numeric features are coerced to NaN, and rows
    whose target value is not one of the expected labels are filtered out.

    Parameters
    ----------
    df: Dask dataframe
        A dataframe for which invalid observations must be removed

    Returns
    -------
    df: Dask dataframe
        The dataframe restricted to rows with an expected target value
    """
    for feature in self.numeric_features:
        df[feature] = dd.to_numeric(df[feature], errors='coerce')

    # Targets are compared against the string form of each label
    valid_targets = [str(label) for label in self.labels]
    keep = df[self.target_feature].isin(valid_targets)
    return df[keep]
def remove_invalid_duplicates(self, df):
    """
    Drop invalid observations and duplicated rows from a dataframe.

    Non-numeric entries in the numeric features are coerced to NaN, rows whose
    target value is not one of the expected labels are filtered out, and
    duplicates are dropped. The result is also stored on ``self.df``.

    Parameters
    ----------
    df : Dask dataframe
        A dataframe for which invalid rows and duplicates must be removed

    Returns
    -------
    df : Dask dataframe
        The filtered dataframe with duplicates, if any, removed
    """
    for feature in self.numeric_features:
        df[feature] = dd.to_numeric(df[feature], errors='coerce')

    # Targets are compared against the string form of each label
    valid_targets = [str(label) for label in self.labels]
    keep = df[self.target_feature].isin(valid_targets)

    deduplicated = df[keep].drop_duplicates()
    self.df = deduplicated
    return deduplicated
def coerce_code(v: dd.Series, codes: List[int]) -> dd.Series:
    """Map a series onto the expected codes, using -1 for everything else.

    Values that cannot be parsed as numbers, and numbers not listed in
    ``codes``, are treated as missing and come out as -1. The result is cast
    to ``int8``.
    """
    numeric = dd.to_numeric(v, errors="coerce")
    is_expected = numeric.isin(codes)
    # Unexpected codes join the unparseable values as NaN ...
    cleaned = numeric.where(is_expected, np.nan)
    # ... so that a single fill marks both as missing
    return cleaned.fillna(-1).astype("int8")
def test_to_numeric_on_dask_dataframe_dataframe_raises_error():
    """to_numeric rejects a whole dask DataFrame with a TypeError."""
    values = pd.Series(["1.0", "2", -3, -5.1])
    frame = from_pandas(pd.DataFrame({"a": values, "b": values}), npartitions=2)

    with pytest.raises(TypeError, match="arg must be a list, tuple, dask."):
        to_numeric(frame)
def test_to_numeric_on_dask_array():
    """to_numeric on a dask Array returns a dask Array holding the parsed numbers."""
    wanted = np.array([1.0, 2.0, -3.0, 5.1])

    result = to_numeric(from_array(["1.0", "2", -3, 5.1]))

    assert isinstance(result, Array)
    assert list(result.compute()) == list(wanted)
def test_to_numeric_on_scalars(arg):
    """to_numeric on a scalar returns a Delayed object that computes to 5."""
    result = to_numeric(arg)

    assert isinstance(result, Delayed)
    assert result.compute() == 5
columns = pd.read_csv(root_dir + file, nrows=0).columns column_dtypes = [ dtype_dict[x] for x in [column_dict[x] for x in list(columns)] ] #dtype_dummy_dict = dict(zip(columns, ['object']*len(columns))) #df = dd.read_csv(root_dir + file, dtype=dtype_dummy_dict, engine="python", encoding="utf-8", error_bad_lines=False) #df = dd.read_csv(root_dir + file, dtype=dtype_dummy_dict, delimiter=',', encoding='utf-8', header=0, quoting=csv.QUOTE_NONE, error_bad_lines=False) df = pd.read_csv(root_dir + file) df = dd.from_pandas(df, npartitions=4) for column in columns: if column_dict[column] in ["int", "float"]: df[column] = dd.to_numeric(df[column], errors='coerce') elif column_dict[column] == "date": df[column] = dd.to_datetime(df[column]) elif column_dict[column] == "bool": df[column] = df[column].apply(convert_bool, meta=(column, 'bool')) else: df[column] = df[column].apply(normalize_string, meta=(column, 'object')) df[column].astype(dtype_dict[column_dict[column]]) if column == "Project Essay": res = df.apply(get_polarity, analyzer=sentiment_analyzer, axis=1, result_type='expand',
def test_to_numeric_raises():
    """to_numeric rejects an unknown ``errors`` value, and ``meta`` for a scalar argument."""
    # An unsupported value for ``errors``
    with pytest.raises(ValueError, match="invalid error value"):
        to_numeric("10", errors="invalid")

    # ``meta`` supplied alongside a scalar argument
    float_meta = pd.Series([], dtype="float64")
    with pytest.raises(KeyError, match="``meta`` is not allowed"):
        to_numeric("10", meta=float_meta)