Esempio n. 1
0
def top_series_max(df, n_series=10):
    """
    Select the [n_series] columns of a pandas DataFrame that have the highest column-wise maximum values.

    The maximum of every column is computed over all indices, and the columns are then picked one at a time
        in descending order of that maximum. The function was initially written for DataFrames containing time
        series, but could work for any index if the maximums of the columns are comparable on the same scale.
        Otherwise, the results may not make sense in the context of the data being processed.

    Parameters:
    :param df: A pandas DataFrame whose columns are the series to rank.
    :param n_series: The user-specified number of series to choose as the 'top' series; that is, the series
        with the top [n_series] maximum values in the set. If it exceeds the number of columns available,
        all columns are returned (ordered by descending maximum) instead of raising an error.
    :return:df_top: A pandas DataFrame that is a subset of the original DataFrame and contains the selected
        columns, ordered from the highest column maximum to the lowest.
            :max_idx_list: The column labels of the selected columns, in the same order as in df_top.
    """
    # Validate input types through the project's _ec helpers before doing any work.
    _ec.check_dfs(values=[df])
    _ec.check_int(values=[n_series])

    # Column-wise maximums; this Series is indexed by the column labels of df.
    max_idx_list = []
    df_max = df.max(axis=0)

    for _ in range(n_series):
        # Stop early when there are fewer candidate columns than requested;
        # idxmax() on an empty Series would raise.
        if df_max.empty:
            break
        max_idx = df_max.idxmax()
        max_idx_list.append(max_idx)
        # Remove the winner so the next iteration finds the next-highest maximum.
        df_max = df_max.drop(max_idx)

    # Label-based column selection (.ix was removed from pandas; .loc is the label-based replacement).
    df_top = df.loc[:, max_idx_list]

    return df_top, max_idx_list
Esempio n. 2
0
def top_series_max_ls(df_ls, n_series=10):
    """
    For each pandas DataFrame in a list, select the [n_series] columns with the highest column-wise maximums.

    For every DataFrame, the maximum of each column is computed over all indices and the columns are picked
        one at a time in descending order of that maximum. A smaller DataFrame is produced for each DataFrame
        in the list. The function was initially written for DataFrames containing time series, but could work
        for any index if the maximums of the columns are comparable on the same scale. Otherwise, the results
        may not make sense in the context of the data being processed.

    Parameters:
    :param df_ls: A list of pandas DataFrames whose columns are the series to rank.
    :param n_series: The user-specified number of series to choose as the 'top' series in each DataFrame; that
        is, the series with the top [n_series] maximum values. If a DataFrame has fewer columns than this, all
        of its columns are returned (ordered by descending maximum) instead of raising an error.
    :return:df_top_ls: A list with one DataFrame per input DataFrame, each a subset of the corresponding
        original containing the selected columns, ordered from the highest column maximum to the lowest.
            :max_idx_ls_all: A list of lists holding, for each input DataFrame, the column labels of the
        selected columns in the same order as in the returned DataFrame.
    """
    print("Picking the top %i time series by maximum column values for %i DataFrames" % (n_series, len(df_ls)))
    # Validate input types through the project's _ec helpers before entering the loops.
    _ec.check_ls(ls=df_ls)
    _ec.check_dfs(values=df_ls)
    _ec.check_int(values=[n_series])

    df_top_ls = []
    max_idx_ls_all = []
    for df in df_ls:
        max_idx_list = []
        # Column-wise maximums; this Series is indexed by the column labels of df.
        df_max = df.max(axis=0)
        for _ in range(n_series):
            # Stop early when there are fewer candidate columns than requested;
            # idxmax() on an empty Series would raise.
            if df_max.empty:
                break
            max_idx = df_max.idxmax()
            max_idx_list.append(max_idx)
            # Remove the winner so the next iteration finds the next-highest maximum.
            df_max = df_max.drop(max_idx)
        # Label-based column selection (.ix was removed from pandas; .loc is the label-based replacement).
        df_top_ls.append(df.loc[:, max_idx_list])
        max_idx_ls_all.append(max_idx_list)

    return df_top_ls, max_idx_ls_all
Esempio n. 3
0
def truncate_dfs(df_ls, min_rows=1000):
    """
    Trim every pandas DataFrame in a list, in place, so that it keeps at most min_rows rows.

    DataFrames that already have min_rows rows or fewer are left untouched. For longer DataFrames the surplus
        rows are taken from the end of the index and removed with DataFrame.drop.

    Parameters:
    :param df_ls: List of pandas DataFrames which will be truncated.
    :param min_rows: The number of rows to truncate each DataFrame to. Defaults to 1000.
    :return: Always 0. The original DataFrames are modified in-place.
    """
    # Validate input types through the project's _ec helpers.
    _ec.check_ls(ls=df_ls)
    _ec.check_int(values=[min_rows])
    _ec.check_dfs(values=df_ls)

    for frame in df_ls:
        surplus = frame.shape[0] - min_rows
        if surplus <= 0:
            # Already short enough; nothing to remove.
            continue
        # NOTE(review): drop() removes rows by label, so this presumably assumes the index labels are
        # unique -- confirm against callers.
        frame.drop(frame.index[-surplus:], inplace=True)
    return 0
Esempio n. 4
0
def find_min_rows(df_ls, max_len=99999999999):
    """
    Return the smallest row count found among a list of pandas DataFrames, capped at max_len.

    The result can then be used, for example, as a common length to which all of the DataFrames are truncated.

    Parameters:
    :param df_ls: list of pandas DataFrames in which to find the minimum number of rows in the set.
    :param max_len: Upper bound on the returned value. If every DataFrame has at least max_len rows (or the
        list is empty), max_len itself is returned.
    :return: nrows_min: The number of rows in the DataFrame with the smallest number of rows, or max_len if
        that is smaller.
    """
    # Validate input types through the project's _ec helpers.
    _ec.check_ls(ls=df_ls)
    _ec.check_dfs(values=df_ls)
    _ec.check_int(values=[max_len])

    # Seed the candidates with max_len so it acts as the cap and as the answer for an empty list.
    row_counts = [frame.shape[0] for frame in df_ls]
    return min([max_len] + row_counts)