Esempio n. 1
0
def top_series_quantile_ls(df_ls, quant=0.75):
    """
    Apply top_series_quantile to every DataFrame in a list and collect the results.

    Each DataFrame is processed independently: its columns are averaged along the index, the [quant] quantile of
        those column means is computed for that DataFrame alone, and only the columns whose mean is strictly greater
        than that quantile are kept. The value of quant given here is always forwarded explicitly, so the default of
        top_series_quantile itself is never used.

    Parameters:
    :param df_ls: A list of pandas DataFrames to filter, one at a time.
    :param quant: The quantile (as a fraction, e.g. 0.75) that a column's mean value must exceed to be kept.
    :return: A list with one filtered DataFrame per element of df_ls, in the same order.
    """
    # Validate the inputs up front so the per-DataFrame selection below cannot fail on bad types.
    _ec.check_ls(ls=df_ls)
    _ec.check_dfs(values=df_ls)
    _ec.check_numeric(values=[quant])
    _ec.check_threshold(values=[quant], thresh=1.0, how='under')

    # One filtered DataFrame per input DataFrame, order preserved.
    return [top_series_quantile(df=frame, quant=quant) for frame in df_ls]
Esempio n. 2
0
def norm_by_factors(df_ls, factor_ls=[0.0]):
    """
    This function takes in a list of pandas DataFrames and divides every column of each DataFrame by the
        normalization factor found at the same position in factor_ls. That is, df_ls[0] is divided by factor_ls[0],
        ... df_ls[n] by factor_ls[n]. The two lists are validated for equal length with _ec.check_eq_ls_len before
        any division takes place.

        The input DataFrames are never modified: each result is a new DataFrame produced by the division.

        Note that the default factor_ls of [0.0] only lines up with a single-element df_ls and divides by zero
        (the result then follows pandas' division-by-zero semantics), so callers should normally pass the factors
        explicitly. The default list is only read, never mutated, so sharing it between calls is harmless.

    Parameters:
    :param df_ls: List of pandas DataFrames for which you wish the columns to be normalized.
    :param factor_ls: List of numeric normalization factors, one per DataFrame in df_ls, in the same order.
    :return: df_ls_out: A new list containing the normalized DataFrames, in the same order as df_ls.
    """
    # Check for equal length of passed lists and data types to prevent error in normalization.
    _ec.check_ls(ls=df_ls)
    _ec.check_ls(ls=factor_ls)
    _ec.check_eq_ls_len(list_ls=[df_ls, factor_ls])
    _ec.check_dfs(values=df_ls)
    _ec.check_numeric(values=factor_ls)

    # Dividing a DataFrame by a scalar returns a brand-new DataFrame, so the caller's frames stay untouched.
    # (A shallow copy of the list would still alias the same DataFrame objects, and column-wise assignment
    # through the removed `.ix` indexer would then overwrite the inputs in place.)
    df_ls_out = [df_in / factor for df_in, factor in zip(df_ls, factor_ls)]
    return df_ls_out
Esempio n. 3
0
def top_series_quantile(df, quant=0.9):
    """
    Select the columns of a DataFrame whose mean value lies above a given quantile of all column means.

    The DataFrame is averaged along the index (one mean per column), the [quant] quantile of those means is found
        with the pandas Series.quantile method, and every column whose mean is strictly greater than that quantile
        is kept. This is meaningful when all columns (e.g. time series) share the same scale.

    Parameters:
    :param df: A pandas DataFrame whose columns are the series to be ranked by mean value.
    :param quant: The quantile (as a fraction, e.g. 0.9) that a column's mean must exceed to be selected.
    :return: A DataFrame holding only the selected columns of df, in the order their means appear.
    """
    # Validate: df must be a DataFrame, quant must be numeric and pass the threshold check against 1.0.
    _ec.check_dfs(values=[df])
    _ec.check_numeric(values=[quant])
    _ec.check_threshold(values=[quant], thresh=1.0, how='under')

    # One mean per column, then the cutoff value those means are compared against.
    col_means = df.mean(axis=0)
    cutoff = col_means.quantile(quant)

    # Labels of the columns whose mean is strictly above the cutoff.
    keep = [label for label in col_means.index if col_means[label] > cutoff]

    return df[keep]
Esempio n. 4
0
def concat_ls(df_ls1, df_ls2, axis=0, join='inner'):
    """
    This function takes two lists of pandas DataFrames and concatenates them pairwise with pandas.concat; that is,
        df_ls1[i] is concatenated with df_ls2[i], with df_ls1[i] placed first. The axis and join parameters follow
        the pandas convention and are passed straight through to pandas.concat.

        Before concatenating, each pair is checked for a shape mismatch along the dimension that is expected to
        agree (columns for axis=0, rows for axis=1). A mismatch only prints a warning; the concatenation is still
        attempted.

    Parameters:
    :param df_ls1: A list of DataFrames; each one comes first in its concatenated pair.
    :param df_ls2: A list of DataFrames to concatenate onto the corresponding DataFrames of df_ls1.
    :param axis: The axis along which the DataFrames will be concatenated. axis=0 stacks row-wise (vertically),
        axis=1 places the DataFrames side-by-side (column-wise).
    :param join: The join method handed to pandas.concat. Must be 'inner' or 'outer'.
    :return: df_ls_concat: A list of DataFrames where element i is df_ls1[i] concatenated with df_ls2[i].
    """
    # Check data types to prevent errors during processing.
    _ec.check_ls(ls=df_ls1)
    _ec.check_ls(ls=df_ls2)
    _ec.check_dfs(values=df_ls1)
    _ec.check_dfs(values=df_ls2)
    _ec.check_eq_ls_len(list_ls=[df_ls1, df_ls2])
    _ec.check_numeric(values=[axis])
    _ec.param_exists_in_set(value=axis, val_set=[0, 1])
    _ec.check_string(values=[join])
    _ec.param_exists_in_set(value=join, val_set=['inner', 'outer'])

    # Check row or column lengths of each pair to make sure they're the same.  If not, tell user, but try to proceed.
    if axis == 0:
        for df1, df2 in zip(df_ls1, df_ls2):
            if df1.shape[1] != df2.shape[1]:
                print('WARNING: You chose concatenation in row dimension (i.e., vertical stacking) with '
                      'parameter axis=0,\n but some DataFrame pairs have different numbers of columns.  Proceeding...')
    elif axis == 1:
        for df1, df2 in zip(df_ls1, df_ls2):
            if df1.shape[0] != df2.shape[0]:
                # Trailing space after "but" keeps the two adjacent literals from fusing into "butsome".
                print('WARNING: You chose to concatenate in column dimension (side-by-side) with axis=1, but '
                      'some DataFrame pairs have different number of rows.  Proceeding...')
    else:
        # Defensive fallback; presumably unreachable because the axis membership check above already
        # restricts axis to 0 or 1 (depends on how _ec.param_exists_in_set reports failures).
        print('ERROR: Parameter axis must be set to 0 or 1')
        sys.exit()

    # Pairwise concatenation, order preserved.
    df_ls_concat = [pd.concat([df1, df2], axis=axis, join=join) for df1, df2 in zip(df_ls1, df_ls2)]

    return df_ls_concat
Esempio n. 5
0
def idx0(df):
    """
    Return a copy of a DataFrame whose index has been shifted so that its first value becomes zero.

    The first index value is subtracted from every value in the index, which turns e.g. a time index into a
        "time elapsed" (t_0) index starting at t = 0. No unit handling is performed; the subtraction is applied to
        the raw index values. Intended for time series, but any numeric index is handled the same way. The
        DataFrame passed in is left unchanged.

    Parameters:
    :param df: pandas DataFrame whose index is to be offset.
    :return: A copy of df carrying the offset index.
    """
    # The input must be a DataFrame and its index values must be numeric for the subtraction to make sense.
    _ec.check_dfs(values=[df])
    _ec.check_numeric(values=df.index.values)

    shifted = df.copy()
    raw_index = df.index.values
    # Offset every index entry by the first one so the new index starts at zero.
    shifted.index = raw_index - raw_index[0]

    return shifted
Esempio n. 6
0
def concat_trans_ls(df_ls1, df_ls2, axis=0, join='inner', pad=True, rep_colnames=True, pad_name=''):
    """
    This function takes two lists of pandas DataFrames and concatenates them pairwise, after transposing the
        DataFrame from the second list; that is, df_ls1[i] is concatenated with df_ls2[i].T. The axis and join
        parameters follow the pandas convention and are passed straight through to pandas.concat. Optionally, a
        two-row padding block is inserted between the two DataFrames (see pad, pad_name and rep_colnames).

        Before concatenating, each pair is checked for a shape mismatch along the dimension that is expected to
        agree. A mismatch only prints a warning; the concatenation is still attempted, and pandas may then raise or
        drop data depending on the join method.

    Parameters:
    :param df_ls1: list of DataFrames on which to concatenate the second list, df_ls2
    :param df_ls2: list of DataFrames to transpose and concatenate
    :param axis: axis=1 for columns, 0 for rows (standard pandas DataFrame syntax). Ex: if axis=0, DataFrames will
        be concatenated in the row dimension (i.e., stacked). If axis=1, will be concatenated in the column
        dimension (i.e., side-by-side)
    :param join: Join method as used by pandas.concat. Must be 'inner' or 'outer'.
    :param pad: Whether to insert a padding block between the two DataFrames. The block has the columns of the
        first DataFrame and two rows, indexed '' and pad_name; the first row is left empty (NaN).
    :param rep_colnames: If True (and pad is True), the second padding row is filled with the column names of the
        first DataFrame; otherwise it is left empty (NaN) as well.
    :param pad_name: Index label given to the second padding row. Defaults to an empty string.
    :return: df_concat_ls - list of DataFrames where element i is df_ls1[i] concatenated with the (optional)
        padding and the transposed df_ls2[i]
    """
    # Check parameter data types, list lengths, and values to prevent errors during processing
    _ec.check_ls(ls=df_ls1)
    _ec.check_ls(ls=df_ls2)
    _ec.check_dfs(values=df_ls1)                    # DataFrame lists
    _ec.check_dfs(values=df_ls2)
    _ec.check_eq_ls_len(list_ls=[df_ls1, df_ls2])
    _ec.check_numeric(values=[axis])                # axis
    _ec.param_exists_in_set(value=axis, val_set=[0, 1])
    _ec.check_string(values=[join])                 # join, validated the same way concat_ls does
    _ec.param_exists_in_set(value=join, val_set=['inner', 'outer'])
    _ec.check_bool(values=[pad])
    _ec.check_bool(values=[rep_colnames])
    _ec.check_string(values=[pad_name])

    # Check row or column lengths of each pair to make sure they're the same.  If not, tell user, but try to proceed.
    # The transposed second DataFrame has shape (df2.shape[1], df2.shape[0]), so no actual transpose is needed here.
    if axis == 0:
        for df1, df2 in zip(df_ls1, df_ls2):
            if df1.shape[1] != df2.shape[0]:
                print('WARNING: You chose concatenation in row dimension (i.e., stacking) with parameter axis=0,\n'
                      'but some DataFrame pairs have different numbers of columns.  Proceeding...')
    elif axis == 1:
        for df1, df2 in zip(df_ls1, df_ls2):
            if df1.shape[0] != df2.shape[1]:
                # Trailing space after "but" keeps the two adjacent literals from fusing into "butsome".
                print('WARNING: You chose to concatenate in column dimension (side by side) with axis=1, but '
                      'some DataFrame pairs have different number of rows.  Proceeding...')
    else:
        # Defensive fallback; presumably unreachable because the axis membership check above already
        # restricts axis to 0 or 1 (depends on how _ec.param_exists_in_set reports failures).
        print('ERROR: Parameter axis must be set to 0 or 1')
        sys.exit()

    # Proceed with concatenation
    df_concat_ls = []
    for df1, df2 in zip(df_ls1, df_ls2):
        pieces = [df1]
        # Create pad rows if selected, and pad b/t the two DataFrames in current pair
        if pad:
            padding = pd.DataFrame(index=['', pad_name], columns=df1.columns)
            if rep_colnames:
                # Positional assignment: the two index labels are identical when pad_name is '', and writing
                # through `.values` is not reliable (it may be a copy or a read-only view of the data).
                padding.iloc[1] = df1.columns.values
            pieces.append(padding)
        pieces.append(df2.T)
        df_concat_ls.append(pd.concat(pieces, axis=axis, join=join))
    return df_concat_ls