def __init__(self, file_path: str = ''):
        """Load, clean and index the CSV at *file_path*.

        Builds a flat frame (``df``), a two-level-column copy (``df_multi``
        indexed by (header-type value, column name)), and convenience lists
        of header names/labels per :class:`HeaderType`.

        :param file_path: Path to the CSV file to load.
            (The previous default was a mutable ``[]`` — wrong type and a
            shared mutable default; an empty string keeps "no path" as the
            falsy default without either problem.)
        """
        self.file_path = file_path
        # Load and clean once; df_multi is an independent copy of the same
        # cleaned data (the original re-read and re-cleaned the file).
        self.df = clean_data(load_csv(self.file_path))
        self.df_multi = self.df.copy()

        # Filename pieces: stem, full basename, and containing directory.
        self.base = os.path.basename(self.file_path).split('.')[0]
        self.base_ext = os.path.basename(self.file_path)
        self.dirname = os.path.dirname(self.file_path)

        self.sample_headers = process_header_data(self.df, HeaderType.SAMPLE)
        self.sample_headers_dict = {
            'name': [sample.name for sample in self.sample_headers],
            'label': [sample.label for sample in self.sample_headers]
        }

        self.sample_headers_name = [i.name for i in self.sample_headers]
        self.sample_headers_label = [i.label for i in self.sample_headers]
        self.depth_headers = process_header_data(self.df, HeaderType.DEPTH)
        self.depth_headers_name = [i.name for i in self.depth_headers]
        self.depth_headers_label = [i.label for i in self.depth_headers]
        self.year_headers = process_header_data(self.df, HeaderType.YEARS)
        self.year_headers_name = [i.name for i in self.year_headers]
        self.year_headers_label = [i.label for i in self.year_headers]
        self.sample_df = self.df[self.sample_headers_name]
        self.depth_df = self.df[self.depth_headers_name]
        self.headers = process_header_data(self.df)
        self.names = [i.name for i in self.headers]
        self.value = [i.htype.value for i in self.headers]

        # Two-level columns: (header-type value, column name).
        self.df_multi.columns = [self.value, self.names]

        # Summary statistics of the sample columns only.
        self.dx = self.df_multi.xs('Sample', axis=1).describe().T
    def testProcessHeaderData(self):

        unknown_headers = ['test_sample (ppb)', 'test2 Not in (ppb)']

        df = DataFrame(numpy.random.randn(10, 2), columns=unknown_headers)

        logging.disable(logging.CRITICAL)
        headers = process_header_data(df)

        self.assertEqual(HeaderType.UNKNOWN, headers[0].htype)
        self.assertEqual(HeaderType.UNKNOWN, headers[1].htype)

        unknown_parsed_headers = process_header_data(df, HeaderType.UNKNOWN)

        self.assertEqual(2, len(unknown_parsed_headers))

        mixed_headers = ["Dat011216V2", "depth (m we) ", "Sr (ng/L)"]
        df = DataFrame(numpy.random.randn(10, 3), columns=mixed_headers)

        year_h = process_header_data(df, HeaderType.YEARS)
        depth_h = process_header_data(df, HeaderType.DEPTH)
        sample_h = process_header_data(df, HeaderType.SAMPLE)

        self.assertEqual(HeaderType.YEARS, year_h[0].htype)
        self.assertEqual(HeaderType.DEPTH, depth_h[0].htype)
        self.assertEqual(HeaderType.SAMPLE, sample_h[0].htype)
def univariate_spline(df: DataFrame, var=45, depth: str = 'depth (m abs)'):
    '''
    Resample each sample column of *df* onto an evenly spaced depth grid
    using a univariate smoothing spline.

    :param df: The data to resample
    :param var: Number of points in the evenly spaced depth grid
    :param depth: Name of the depth column used as the spline x-axis
        (previously hard-coded; the default preserves the old behaviour)
    :return: A new DataFrame containing the depth grid and the splined
        sample columns
    '''
    sample_header_names = [
        h.name for h in process_header_data(df, HeaderType.SAMPLE)
    ]

    x = df[depth]
    xs = np.linspace(min(x), max(x), var)
    xs_dict = {depth: pandas.Series(xs)}

    # Evaluate one smoothing spline per sample column on the new grid.
    for sample_header_name in sample_header_names:
        spl = UnivariateSpline(x, df[sample_header_name])
        xs_dict[sample_header_name] = pandas.Series(spl(xs))

    spline_df = pandas.concat(xs_dict, axis=1)

    # Rotate the last column to the front (preserves the original layout).
    colnames = spline_df.columns.tolist()
    spline_df = spline_df[colnames[-1:] + colnames[:-1]]
    return spline_df
def normalize_data(df: DataFrame):
    '''
    Min-max normalize every sample column of *df* in place.

    :param df: The data to normalize
    :return: The same DataFrame with its sample columns rescaled
    '''
    names = [h.name for h in process_header_data(df, HeaderType.SAMPLE)]

    def _min_max(col):
        # (x - min) / (max - min), computed per column.
        lo = col.min(axis=0)
        hi = col.max(axis=0)
        return (col - lo) / (hi - lo)

    df[names] = df[names].transform(_min_max)
    return df
def robust_scaler(df: DataFrame) -> DataFrame:
    '''
    Robust-scale every sample column of *df* in place using
    preprocessing.robust_scale.

    :param df: The data to scale
    :return: The same DataFrame with its sample columns scaled
    '''
    names = [h.name for h in process_header_data(df, HeaderType.SAMPLE)]

    def _robust(col):
        # robust_scale expects 2-D input; flatten back to a column.
        return preprocessing.robust_scale(col.to_frame()).flatten()

    df[names] = df[names].transform(_robust)
    return df
def scaler(df: DataFrame) -> DataFrame:
    '''
    Standard-scale every sample column of *df* in place using
    preprocessing.scale.

    :param df: The data to scale
    :return: The same DataFrame with its sample columns scaled
    '''
    names = [h.name for h in process_header_data(df, HeaderType.SAMPLE)]
    # preprocessing.scale is applied column-wise; no wrapper needed.
    df[names] = df[names].transform(preprocessing.scale)
    return df
    def __init__(self, df: DataFrame):
        """Cache header metadata and convenience column views for *df*."""
        self.df = df

        # Classify headers once per type (same call order as before).
        self.sample_headers = process_header_data(df, HeaderType.SAMPLE)
        self.sample_headers_name = [h.name for h in self.sample_headers]
        self.sample_headers_label = [h.label for h in self.sample_headers]
        self.sample_headers_class = [h.hclass for h in self.sample_headers]

        self.depth_headers = process_header_data(df, HeaderType.DEPTH)
        self.depth_headers_name = [h.name for h in self.depth_headers]
        self.depth_headers_label = [h.label for h in self.depth_headers]

        self.year_headers = process_header_data(df, HeaderType.YEARS)
        self.year_headers_name = [h.name for h in self.year_headers]
        self.year_headers_label = [h.label for h in self.year_headers]

        # Column-subset views of the frame.
        self.sample_df = df[self.sample_headers_name]
        self.depth_df = df[self.depth_headers_name]
        self.year_df = df[self.year_headers_name]
        self.sample_year_df = df[
            self.year_headers_name + self.sample_headers_name
        ]
        self.year_sample_headers = self.year_headers + self.sample_headers
def quantile_transform_scaler(df: DataFrame) -> DataFrame:
    '''
    Quantile-transform every sample column of *df* in place using
    preprocessing.quantile_transform.

    :param df: The data to transform
    :return: The same DataFrame with its sample columns transformed
    '''
    names = [h.name for h in process_header_data(df, HeaderType.SAMPLE)]

    def _quantile(col):
        # quantile_transform expects 2-D input; flatten back to a column.
        return preprocessing.quantile_transform(col.to_frame()).flatten()

    df[names] = df[names].transform(_quantile)
    return df
def lfilter_filter(df: DataFrame) -> DataFrame:
    '''
    Apply a 2nd-order Butterworth IIR filter (via lfilter) to every
    sample column of *df* in place.

    :param df: The data to filter
    :return: The same DataFrame with its sample columns filtered
    '''
    # Filter coefficients: order 2, normalized cutoff 0.1.
    b, a = butter(2, 0.1)
    names = [h.name for h in process_header_data(df, HeaderType.SAMPLE)]
    df[names] = df[names].transform(lambda col: lfilter(b, a, col))
    return df
def normalize_min_max_scaler(df: DataFrame) -> DataFrame:
    '''
    Normalize dataframe by min and max
    doesn't take nan values
    :param df:
    '''
    names = [h.name for h in process_header_data(df, HeaderType.SAMPLE)]
    # minmax_scale is applied column-wise; no wrapper needed.
    df[names] = df[names].transform(preprocessing.minmax_scale)
    return df
def wiener_filter(df: DataFrame):
    '''
    Apply the Wiener filter to the columns of the supplied data.
    The filter is only applied to columns that appear as samples in the
    default header dictionary. Modifications occur in-place.

    (The previous docstring incorrectly described this as a "spline"
    filter; the code applies scipy.signal.wiener.)

    :param df: The data to filter
    :return: The filtered data
    '''
    sample_header_names = [
        h.name for h in process_header_data(df, HeaderType.SAMPLE)
    ]
    # wiener is applied column-wise directly; the old wrapper lambda
    # added nothing.
    df[sample_header_names] = df[sample_header_names].transform(wiener)

    return df
def replace_outliers(df: DataFrame,
                     val: float64 = np.nan,
                     num_std: float = 3) -> DataFrame:
    '''
    Replace the outliers in the data on a column based calculation.  The mean 
    and standard deviation for each column is calculated to use.
    
    :param df: The data to replace outliers in
    :param val: The new value to use (the default is :data:`np.nan`)
    :param num_std: The number of standard deviations to use as a threshold
    :return: Data with values outside the threshold replaced
    '''
    names = [h.name for h in process_header_data(df, HeaderType.SAMPLE)]

    def _replace_col(series):
        # Delegate per-column replacement to the shared helper.
        return replace(series, val, num_std)

    df[names] = df[names].transform(_replace_col)
    return df
def savgol_smooth_filter(df: DataFrame, window_length: int = 7):
    '''
    Apply the Savitzky-Golay filter (polynomial order 1) to the columns of
    the supplied data. The filter is only applied to columns that appear as
    samples in the default header dictionary. Modifications occur in-place.

    :param df: The data to filter
    :param window_length: Size of the filter window. Even values are
        reduced by one (savgol_filter requires an odd window), and values
        below 3 are raised to 3 — previously window_length <= 2 produced a
        window of 1, which savgol_filter rejects because the polynomial
        order (1) must be strictly less than the window length.
    :return: The resampled data
    '''
    if window_length % 2 == 0:  # window_length must be odd
        window_length = window_length - 1
    if window_length < 3:
        # Smallest odd window strictly greater than polyorder 1.
        window_length = 3

    sample_header_names = [
        h.name for h in process_header_data(df, HeaderType.SAMPLE)
    ]
    df[sample_header_names] = df[sample_header_names].transform(
        lambda x: savgol_filter(x, window_length, 1))

    return df