Code Example #1
def relocate(data, cols, before=None, after=None):
    """Use relocate() to change column positions

    Parameters
    ----------
    data: pandas or pyspark DataFrame
        A dataframe
    cols: str or list
         Columns to move
    before: str, default=None
        Column before which the moved columns are inserted. If both before and after are None, the columns are moved to the very left.
    after: str, default=None
        Column after which the moved columns are inserted. If both before and after are None, the columns are moved to the very left.

    Returns
    -------
    Our dataframe, but with the columns repositioned
    """
    is_pandas = _check_df_type(data, 'relocate')
    if isinstance(cols, str):
        cols = _get_str_columns(data, cols, is_pandas=is_pandas)
    elif isinstance(cols, list):
        cols = _get_list_columns(data, cols, is_pandas)
    if is_pandas:
        if before is None and after is None:
            # Move the selected columns to the front, keeping the remaining columns in their original order
            move_cols = [cols] if isinstance(cols, str) else list(cols)
            new_cols = move_cols + [c for c in data.columns if c not in move_cols]
            return_data = data[new_cols]
        elif isinstance(before, str):
            if isinstance(after, str):
                raise ValueError(
                    "Only one of before or after can be specified")
            if isinstance(cols, str):
                cols_to_move = data.pop(cols).values
                data.insert(data.columns.get_loc(before), cols, cols_to_move)
            else:
                for col in cols:
                    cols_to_move = data.pop(col).values
                    data.insert(
                        data.columns.get_loc(before), col, cols_to_move)
            return_data = data.copy()
        elif isinstance(after, str):
            if isinstance(cols, str):
                cols_to_move = data.pop(cols).values
                data.insert(
                    data.columns.get_loc(after) + 1, cols, cols_to_move)
            else:
                for i, col in enumerate(cols):
                    cols_to_move = data.pop(col).values
                    data.insert(
                        data.columns.get_loc(after) + (i + 1), col,
                        cols_to_move)
            return_data = data.copy()
        else:
            raise TypeError(
                "One of before or after must be in string format, or both set to None"
            )
        return return_data
    else:
        ...
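
# A minimal sketch of what the pandas branch above does (the sample frame and
# column names are made up for illustration): repositioning one column is a
# pop followed by an insert at the target location.
import pandas as pd

df = pd.DataFrame({'a': [1, 2], 'b': [3, 4], 'c': [5, 6], 'd': [7, 8]})
# Equivalent of relocate(df, 'c', before='b'): remove 'c', then insert it
# at the position currently occupied by 'b'.
moved = df.pop('c')
df.insert(df.columns.get_loc('b'), 'c', moved)
print(df.columns.tolist())  # ['a', 'c', 'b', 'd']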
Code Example #2
def add_count(data, cols, wt=None, sort=False, name=None):
    """Count observations by group

    Parameters
    ----------
    data: pandas or pyspark DataFrame
        A dataframe
    cols: str or list
         Variables to group by when counting. If None, all variables are used.
    wt: str or list, default is None
         Frequency weights. Can be a variable (or combination of variables) or None. wt is computed once for each unique
         combination of the counted variables.
    sort: bool, default is False
        If True, will show the largest groups at the top.
    name: str, default is None
        The name of the new column in the output. If omitted, it will default to n. If there's already a column called n,
        it will error, and require you to specify the name.

    Returns
    -------
    Our dataframe, but with an additional column including the sum or count
    """
    is_pandas = _check_df_type(data, "add_count")
    if isinstance(cols, str):
        distinct_cols = _get_str_columns(data, cols, is_pandas=is_pandas)
    elif isinstance(cols, list):
        distinct_cols = _get_list_columns(data, cols, is_pandas=is_pandas)
    else:
        if is_pandas:
            distinct_cols = data.columns
        else:
            distinct_cols = data.schema.names
    if name is None:
        name = 'n'
    if is_pandas:
        if wt is None:
            # While it would be nice to simply have df[name] = df.groupby([col])[col].transform('count'), that only
            # works when we know that col is one column. Otherwise, we will be trying to set one column with 2+ columns,
            # which will throw an error. So we instead create our Series/DataFrame and then perform a check to ensure
            # that we don't try and create a new column with several.
            groupby_data = data.groupby(
                distinct_cols)[distinct_cols].transform('count')
            if isinstance(groupby_data, pd.Series):
                data[name] = groupby_data
            else:
                data[name] = groupby_data.iloc[:, 0]
        else:
            # We know that wt has to be a single column, so we can afford to automatically create a new column and not
            # have to worry about errors caused by setting a column from several.
            data[name] = data.groupby(distinct_cols)[wt].transform('sum')
        if sort:
            data = data.sort_values(by=name, ascending=False)
    else:
        ...
    return data
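
# Illustrative sketch of the pandas branch of add_count() above, with a
# made-up sample frame: the per-group count is broadcast back onto every row,
# and a weight column turns the count into a per-group sum.
import pandas as pd

df = pd.DataFrame({'g': ['x', 'x', 'y'], 'w': [2, 3, 4]})
df['n'] = df.groupby('g')['g'].transform('count')    # add_count(df, 'g')
df['n_wt'] = df.groupby('g')['w'].transform('sum')   # add_count(df, 'g', wt='w')
print(df)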
Code Example #3
def arrange(data, cols):
    """Arrange rows by column values

    Parameters
    ----------
    data: pandas or pyspark DataFrame
        A dataframe
    cols: str or list
        The columns we are sorting the dataframe on

    Returns
    -------
    Our sorted dataframe

    For example, suppose we had a dataframe like
    a b
    1 2
    3 4
    5 6

    Then running arrange(df, 'desc(a)') will return
    a b
    5 6
    3 4
    1 2
    """
    is_pandas = _check_df_type(data, "arrange")
    if isinstance(cols, str):
        columns = _get_str_columns(data, cols, is_pandas=is_pandas)
    elif isinstance(cols, list):
        columns = _get_list_columns(data, cols, is_pandas)
    else:
        raise TypeError("Cannot determine method for determining column types")
    sorting_cols = []
    ascending_cols = []
    for c in columns:
        if re.search(r'desc\(', c):
            # Strip the desc() wrapper (and any whitespace inside it) so only the column name remains, e.g. 'desc( col1 )' -> 'col1'
            sorting_cols.append(re.sub(r'desc|\(|\)|\s+', r'', c))
            ascending_cols.append(False)
        else:
            sorting_cols.append(c)
            ascending_cols.append(True)
    if is_pandas:
        # Reset the index after sorting and drop the old one, since R's implementation also renumbers the rows
        return data.sort_values(
            sorting_cols, ascending=ascending_cols).reset_index(drop=True)
    else:
        return data.orderBy(sorting_cols, ascending=ascending_cols)
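
# Quick sketch of what arrange() resolves to for pandas, on a made-up frame:
# arrange(df, 'desc(a)') is a descending sort on 'a' with the old index dropped.
import pandas as pd

df = pd.DataFrame({'a': [1, 3, 5], 'b': [2, 4, 6]})
print(df.sort_values('a', ascending=False).reset_index(drop=True))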
Code Example #4
def distinct(data, cols=None, keep_all=False):
    """Subset distinct/unique rows

    Parameters
    ----------
    data: pandas or pyspark DataFrame
        A dataframe
    cols: str or list, default is None
         Optional variables to use when determining uniqueness. If there are multiple rows for a given combination of inputs,
         only the first row will be preserved. If None, will use all variables.
    keep_all: bool, default is False
        If True, keep all variables in data. If a combination of cols is not distinct, this keeps the first row of values.

    Returns
    -------
    Our dataframe but with unique rows specified
    """
    is_pandas = _check_df_type(data, "distinct")
    if isinstance(cols, str):
        distinct_cols = _get_str_columns(data, cols, is_pandas=is_pandas)
    elif isinstance(cols, list):
        distinct_cols = _get_list_columns(data, cols, is_pandas=is_pandas)
    else:
        if is_pandas:
            distinct_cols = data.columns
        else:
            distinct_cols = data.schema.names
    if is_pandas:
        if keep_all:
            # Here, we need to find the distinct values, and then perform a left merge on the distinct values with the
            # remaining columns to ensure that we have all available columns. We then reset the index as that is what
            # R's implementation does.
            dropped_data = data[distinct_cols].drop_duplicates(keep='first')
            dropped_data = pd.merge(dropped_data,
                                    data.drop(distinct_cols, axis=1),
                                    left_index=True,
                                    right_index=True,
                                    how='left')
            dropped_data = dropped_data.reset_index(drop=True)
            return dropped_data
        else:
            return data[distinct_cols].drop_duplicates(keep='first',
                                                       ignore_index=True)
    else:
        ...
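
# Sketch of the pandas branch of distinct() on a made-up frame: keep the first
# row per unique value of 'a', and for keep_all=True re-attach the remaining
# columns for the surviving rows via an index-aligned merge.
import pandas as pd

df = pd.DataFrame({'a': [1, 1, 2], 'b': [9, 8, 7]})
print(df[['a']].drop_duplicates(keep='first', ignore_index=True))   # distinct(df, 'a')

kept = df[['a']].drop_duplicates(keep='first')                      # distinct(df, 'a', keep_all=True)
print(pd.merge(kept, df.drop('a', axis=1),
               left_index=True, right_index=True, how='left'))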
Code Example #5
def select(data, cols):
    """Select variables in a data frame

    Parameters
    ----------
    data: pandas or pyspark DataFrame
        A dataframe
    cols: str or list
         Columns to select

    Returns
    -------
    Our dataframe, but with the selected columns
    """
    is_pandas = _check_df_type(data, "select")
    if isinstance(cols, str):
        cols = _get_str_columns(data, cols, is_pandas=is_pandas)
    elif isinstance(cols, list):
        cols = _get_list_columns(data, cols, is_pandas)
    if is_pandas:
        return data.loc[:, cols]
    else:
        return data.select(*cols)
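
# Minimal sketch of select() for the pandas branch, using made-up data:
# once the column names are resolved, it is a plain label-based selection.
import pandas as pd

df = pd.DataFrame({'a': [1, 2], 'b': [3, 4], 'c': [5, 6]})
print(df.loc[:, ['a', 'c']])   # select(df, ['a', 'c'])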
Code Example #6
def count(data, cols, wt=None, sort=False, name=None, drop=True):
    """Count observations by group

    Parameters
    ----------
    data: pandas or pyspark DataFrame
        A dataframe
    cols: str or list
         Variables to group by when counting. If None, all variables are used.
    wt: str or list, default is None
         Frequency weights. Can be a variable (or combination of variables) or None. wt is computed once for each unique
         combination of the counted variables.
    sort: bool, default is False
        If True, will show the largest groups at the top.
    name: str, default is None
        The name of the new column in the output. If omitted, it will default to n. If there's already a column called n,
        it will error, and require you to specify the name.
    drop: bool, default is True
        If False, will include counts for empty groups (i.e., for levels of factors that don't exist in the data).

    Returns
    -------
    A dataframe with one row per group and a column containing the counts or sums
    """
    is_pandas = _check_df_type(data, "count")
    if isinstance(cols, str):
        distinct_cols = _get_str_columns(data, cols, is_pandas=is_pandas)
    elif isinstance(cols, list):
        distinct_cols = _get_list_columns(data, cols, is_pandas=is_pandas)
    else:
        if is_pandas:
            distinct_cols = data.columns
        else:
            distinct_cols = data.schema.names
    if name is None:
        name = 'n'
    if is_pandas:
        # Here, we treat our groupby columns as factors/categorical data, so we need to convert them. This also
        # gives the drop argument something to do, since empty categories only appear once the columns are categorical.
        data[distinct_cols] = data[distinct_cols].astype('category')
        if wt is None:
            count_df = data.groupby(distinct_cols).count()
        else:
            count_df = data.groupby(distinct_cols)[wt].sum()
        # We don't need multiple columns containing the same counts, so we choose the first one and rename it
        if isinstance(count_df, pd.DataFrame):
            count_df = count_df.rename({count_df.columns[0]: name}, axis=1)
            count_df = count_df.iloc[:, 0]
        # We have one column/Series, so all we need to do is rename it
        else:
            count_df = count_df.rename(name)
        # Drops any group whose count is NaN (i.e., categories that don't have any observations)
        if drop:
            count_df = count_df.dropna()
        # Replaces the NaNs (i.e., categories that don't have any observations) with 0
        else:
            count_df = count_df.fillna(0)
        # Since we know that count_df is going to be a Series, we don't need to specify the axis or the column names, as
        # those are already known for a Series
        if sort:
            count_df = count_df.sort_values(ascending=False)
        count_df = count_df.reset_index()
    else:
        ...
    return count_df
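
# Sketch of the pandas branch of count() on a made-up frame: one row per
# group, with either a row count or (given a weight column) a per-group sum.
import pandas as pd

df = pd.DataFrame({'g': ['x', 'x', 'y'], 'w': [2, 3, 4]})
print(df.groupby('g')['g'].count().rename('n').reset_index())   # count(df, 'g')
print(df.groupby('g')['w'].sum().rename('n').reset_index())     # count(df, 'g', wt='w')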
Code Example #7
def filter(data, cols):
    """Filters data based on arguments from cols

    Parameters
    ----------
    data: pandas or pyspark DataFrame
        The dataframe we are filtering
    cols: str or list
        The filter conditions we are applying on our dataframe

    Returns
    -------
    filtered_data: pandas or pyspark DataFrame
        The dataframe, after we've applied all filtering conditions

    For example, suppose we had a dataframe like
    a b
    1 2
    3 4
    5 6

    Then running filter(df, "a >= median(a)") will return
    a b
    3 4
    5 6
    """
    is_pandas = _check_df_type(data, "filter")
    if isinstance(cols, str):
        cols = _get_str_columns(data, cols, is_pandas=is_pandas)
    elif isinstance(cols, list):
        cols = _get_list_columns(data, cols, is_pandas)
    query_result = []
    for c in cols:
        if "mean(" in c.casefold():
            mean_col = re.search(r'(?<=mean\()[a-zA-Z]+', c).group(0)
            if is_pandas:
                val = data[mean_col].mean()
            else:
                ...
            comparison = re.search(r'([<>]=?|==)', c).group(0)
            result = '{} {} {}'.format(mean_col, comparison, val)
        elif "median(" in c.casefold():
            median_col = re.search(r'(?<=median\()[a-zA-Z]+', c).group(0)
            if is_pandas:
                val = data[median_col].median()
            else:
                ...
            comparison = re.search(r'([<>]=?|==)', c).group(0)
            result = '{} {} {}'.format(median_col, comparison, val)
        elif "min(" in c.casefold():
            min_col = re.search(r'(?<=min\()[a-zA-Z]+', c).group(0)
            if is_pandas:
                val = data[min_col].min()
            else:
                ...
            comparison = re.search(r'([<>]=?|==)', c).group(0)
            result = '{} {} {}'.format(min_col, comparison, val)
        elif "max(" in c.casefold():
            max_col = re.search(r'(?<=max\()[a-zA-Z]+', c).group(0)
            if is_pandas:
                val = data[max_col].max()
            else:
                ...
            comparison = re.search(r'([<>]=?|==)', c).group(0)
            result = '{} {} {}'.format(max_col, comparison, val)
        elif "quantile(" in c.casefold():
            quantile_col = re.search(r'(?<=quantile\()[a-zA-Z]+', c).group(0)
            if re.search('probs=', c):
                quantile_percent = float(
                    re.search(r'(?<=probs\=)\s*\d*\.\d+', c).group(0))
            else:
                quantile_percent = float(
                    re.search(r'(?<=,)\s*\d*\.\d+', c).group(0))
            if quantile_percent > 1:
                raise ValueError("Cannot have a percentile greater than 1")
            comparison = re.search(r'([<>]=?|==)', c).group(0)
            if is_pandas:
                val = data[quantile_col].quantile(quantile_percent)
            else:
                ...
            result = '{} {} {}'.format(quantile_col, comparison, val)
        else:
            result = c
        query_result.append(result)
    if is_pandas:
        return data.query(' & '.join(query_result)).reset_index(drop=True)
    else:
        return data.filter(' and '.join(query_result))
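
# Sketch of what filter() does for pandas, on a made-up frame: the
# 'median(a)' token is replaced by the computed value and the resulting
# condition string is handed to DataFrame.query.
import pandas as pd

df = pd.DataFrame({'a': [1, 3, 5], 'b': [2, 4, 6]})
median_a = df['a'].median()   # 3.0
print(df.query('a >= {}'.format(median_a)).reset_index(drop=True))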