Example #1
    def format(blob, state: str):
        """
        :param blob:
            A DataFrame of raw sites data

        :param state:
            The 2 digit state code, in string form.

        :return:
            A DataFrame wherein the fields of interest have been appropriately
            formatted; w.r.t. expected type
        """

        # For 'TRIFID', this (a) removes leading & trailing spaces, and subsequently (b) replaces
        # empty cells with np.nan
        blob['TRIFID'] = blob['TRIFID'].str.strip().replace(to_replace='', value=np.nan)

        # Enforce type float
        for field in ['LATITUDE', 'LONGITUDE']:
            blob[field] = dd.to_numeric(arg=blob[field], errors='coerce')

        # State
        blob['STATEGEOID'] = state

        return blob
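
A minimal usage sketch for the function above, assuming it is reachable as a plain function and that its own module imports numpy as np and dask.dataframe as dd; the column values and the state code '36' are illustrative only.

import pandas as pd
import dask.dataframe as dd

raw = pd.DataFrame({
    'TRIFID': ['  A001 ', ''],              # leading/trailing spaces and an empty cell
    'LATITUDE': ['40.71', 'not-a-number'],  # the bad value becomes NaN via errors='coerce'
    'LONGITUDE': ['-74.00', '-118.24'],
})
blob = dd.from_pandas(raw, npartitions=1)
formatted = format(blob, state='36')        # '36' is a hypothetical two-digit state code
print(formatted.compute())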
Example #2
def test_to_numeric_on_dask_dataframe_series_with_meta():
    s = pd.Series(["1.0", "2", -3, -5.1])
    arg = from_pandas(s, npartitions=2)
    expected = pd.to_numeric(s)
    output = to_numeric(arg, meta=pd.Series([], dtype="float64"))
    assert output.dtype == "float64"
    assert isinstance(output, Series)
    assert list(output.compute()) == list(expected)
Example #3
    def format(self, blob: dd.DataFrame):

        data = blob.copy()

        # Strip surrounding whitespace and replace missing values with '' in every string field
        for field in self.getstringfields():
            data[field] = data[field].str.strip().fillna(value='')

        # ZIP_CODE: keep digits only, map empty strings to '0', then cast to a numeric type
        data['ZIP_CODE'] = data['ZIP_CODE'].replace(to_replace='[^0-9]', value='', regex=True)
        data['ZIP_CODE'] = data['ZIP_CODE'].replace(to_replace='', value='0', regex=False)
        data['ZIP_CODE'] = dd.to_numeric(arg=data.ZIP_CODE, errors='coerce')

        return data
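
The ZIP_CODE handling above can be sketched in isolation; this assumes pandas and dask.dataframe imported as pd and dd, and uses made-up ZIP values.

import pandas as pd
import dask.dataframe as dd

zips = dd.from_pandas(pd.Series(['12345-6789', ' ', 'N/A']), npartitions=1)
zips = zips.replace(to_replace='[^0-9]', value='', regex=True)  # keep digits only
zips = zips.replace(to_replace='', value='0', regex=False)      # empty strings become '0'
zips = dd.to_numeric(zips, errors='coerce')
print(zips.compute().tolist())                                  # [123456789, 0, 0]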
Example #4
def convert_variable_data(df, column_id, new_type, **kwargs):
    """Convert dataframe's variable to different type.
    """
    empty = df[column_id].empty if isinstance(df, pd.DataFrame) else False
    if empty:
        return df
    if new_type == vtypes.Numeric:
        if isinstance(df, dd.DataFrame):
            df[column_id] = dd.to_numeric(df[column_id], errors='coerce')
        elif is_instance(df, ks, 'DataFrame'):
            df[column_id] = ks.to_numeric(df[column_id])
        else:
            orig_nonnull = df[column_id].dropna().shape[0]
            df[column_id] = pd.to_numeric(df[column_id], errors='coerce')
            # This will convert strings to nans
            # If column contained all strings, then we should
            # just raise an error, because that shouldn't have
            # been converted to numeric
            nonnull = df[column_id].dropna().shape[0]
            if nonnull == 0 and orig_nonnull != 0:
                raise TypeError(
                    "Attempted to convert all string column {} to numeric".
                    format(column_id))
    elif issubclass(new_type, vtypes.Datetime):
        format = kwargs.get("format", None)
        # TODO: if float convert to int?
        if isinstance(df, dd.DataFrame):
            df[column_id] = dd.to_datetime(df[column_id],
                                           format=format,
                                           infer_datetime_format=True)
        elif is_instance(df, ks, 'DataFrame'):
            df[column_id] = ks.to_datetime(df[column_id],
                                           format=format,
                                           infer_datetime_format=True)
        else:
            df[column_id] = pd.to_datetime(df[column_id],
                                           format=format,
                                           infer_datetime_format=True)
    elif new_type == vtypes.Boolean:
        map_dict = {
            kwargs.get("true_val", True): True,
            kwargs.get("false_val", False): False,
            True: True,
            False: False
        }
        # TODO: what happens to nans?
        df[column_id] = df[column_id].map(map_dict).astype(bool)
    elif not issubclass(new_type, vtypes.Discrete):
        raise Exception("Cannot convert column %s to %s" %
                        (column_id, new_type))
    return df
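
The guard in the Numeric branch relies on errors='coerce' turning non-numeric strings into NaN; a small pandas-only sketch of that behaviour, with made-up values:

import pandas as pd

col = pd.Series(['apple', 'banana', 'cherry'])
coerced = pd.to_numeric(col, errors='coerce')
print(coerced.isna().all())  # True: every value became NaN, the case
                             # convert_variable_data rejects with a TypeError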
Example #5
    def remove_invalid(self, df):
        """
        Method that removes invalid observations. Examples of invalid observations are string characters in
        numeric values, as well as unexpected target values

        Parameters
        ----------
        df: Dask dataframe
            A dataframe for which invalid observations must be removed
        
        Returns
        -------
        df: Dask dataframe
            A dataframe for which invalid observations are removed
        """

        for num in self.numeric_features:
            df[num] = dd.to_numeric(df[num], errors='coerce')
        df = df[df[self.target_feature].isin(list((map(str, self.labels))))]
        return df
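
A sketch of the two cleaning steps in isolation, assuming pandas and dask.dataframe imported as pd and dd; the column names 'age' and 'label' and the label set [0, 1] are stand-ins for the instance attributes.

import pandas as pd
import dask.dataframe as dd

df = dd.from_pandas(pd.DataFrame({'age': ['25', 'abc'], 'label': ['1', '9']}), npartitions=1)
df['age'] = dd.to_numeric(df['age'], errors='coerce')  # 'abc' becomes NaN
df = df[df['label'].isin(list(map(str, [0, 1])))]      # drop rows with unexpected targets
print(df.compute())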
Example #6
    def remove_invalid_duplicates(self, df):
        """
        Method that finds AND removes duplicated observations in a dataframe
        
        Parameters
        ----------
        dataframe : Dask dataframe
            A dataframe for which duplicates must be removed

        Returns
        ----------
        df : Dask dataframe
            A dataframe with duplicates, if any, are removed.
        
        """
        for num in self.numeric_features:
            df[num] = dd.to_numeric(df[num], errors='coerce')
        df = df[df[self.target_feature].isin(list((map(str, self.labels))))]
        df = df.drop_duplicates()
        self.df = df
        return df
Example #7
def coerce_code(v: dd.Series, codes: List[int]) -> dd.Series:
    # Set non-ints and unexpected codes to missing (-1)
    v = dd.to_numeric(v, errors="coerce")
    v = v.where(v.isin(codes), np.nan)
    return v.fillna(-1).astype("int8")
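
A quick check of coerce_code on a tiny Dask Series, assuming pandas and dask.dataframe imported as pd and dd; the code list [1, 2, 3] is illustrative.

import pandas as pd
import dask.dataframe as dd

v = dd.from_pandas(pd.Series(['1', '2', '7', 'x']), npartitions=1)
out = coerce_code(v, codes=[1, 2, 3])
print(out.compute().tolist())  # [1, 2, -1, -1]: '7' is not an expected code, 'x' is not numeric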
Example #8
def test_to_numeric_on_dask_dataframe_dataframe_raises_error():
    s = pd.Series(["1.0", "2", -3, -5.1])
    df = pd.DataFrame({"a": s, "b": s})
    arg = from_pandas(df, npartitions=2)
    with pytest.raises(TypeError, match="arg must be a list, tuple, dask."):
        to_numeric(arg)
Example #9
def test_to_numeric_on_dask_array():
    arg = from_array(["1.0", "2", -3, 5.1])
    expected = np.array([1.0, 2.0, -3.0, 5.1])
    output = to_numeric(arg)
    assert isinstance(output, Array)
    assert list(output.compute()) == list(expected)
Example #10
def test_to_numeric_on_scalars(arg):
    output = to_numeric(arg)
    assert isinstance(output, Delayed)
    assert output.compute() == 5
Example #11
    columns = pd.read_csv(root_dir + file, nrows=0).columns
    column_dtypes = [
        dtype_dict[x] for x in [column_dict[x] for x in list(columns)]
    ]

    #dtype_dummy_dict = dict(zip(columns, ['object']*len(columns)))
    #df = dd.read_csv(root_dir + file, dtype=dtype_dummy_dict, engine="python", encoding="utf-8", error_bad_lines=False)
    #df = dd.read_csv(root_dir + file, dtype=dtype_dummy_dict, delimiter=',', encoding='utf-8', header=0, quoting=csv.QUOTE_NONE, error_bad_lines=False)

    df = pd.read_csv(root_dir + file)
    df = dd.from_pandas(df, npartitions=4)

    for column in columns:
        if column_dict[column] in ["int", "float"]:
            df[column] = dd.to_numeric(df[column], errors='coerce')
        elif column_dict[column] == "date":
            df[column] = dd.to_datetime(df[column])
        elif column_dict[column] == "bool":
            df[column] = df[column].apply(convert_bool, meta=(column, 'bool'))
        else:
            df[column] = df[column].apply(normalize_string,
                                          meta=(column, 'object'))

        # Cast to the configured dtype (the result must be assigned back to take effect)
        df[column] = df[column].astype(dtype_dict[column_dict[column]])

        if column == "Project Essay":
            res = df.apply(get_polarity,
                           analyzer=sentiment_analyzer,
                           axis=1,
                           result_type='expand',
Example #12
def test_to_numeric_raises():
    with pytest.raises(ValueError, match="invalid error value"):
        to_numeric("10", errors="invalid")
    with pytest.raises(KeyError, match="``meta`` is not allowed"):
        to_numeric("10", meta=pd.Series([], dtype="float64"))