def __init__(self, file_path: str = ''):
    '''
    Load and clean the CSV at ``file_path`` and precompute header views.

    Fixes: the previous default was the mutable ``[]``, which contradicted
    the ``str`` annotation and is a shared-mutable-default pitfall.

    :param file_path: Path of the CSV file to load.
    '''
    self.file_path = file_path
    self.df = clean_data(load_csv(self.file_path))
    # Copy the cleaned frame instead of re-reading and re-cleaning the
    # same file a second time (assumes load/clean are deterministic).
    self.df_multi = self.df.copy()
    self.base = os.path.basename(self.file_path).split('.')[0]
    self.base_ext = os.path.basename(self.file_path)
    self.dirname = os.path.dirname(self.file_path)
    self.sample_headers = process_header_data(self.df, HeaderType.SAMPLE)
    self.sample_headers_dict = {
        'name': [sample.name for sample in self.sample_headers],
        'label': [sample.label for sample in self.sample_headers]
    }
    self.sample_headers_name = [i.name for i in self.sample_headers]
    self.sample_headers_label = [i.label for i in self.sample_headers]
    self.depth_headers = process_header_data(self.df, HeaderType.DEPTH)
    self.depth_headers_name = [i.name for i in self.depth_headers]
    self.depth_headers_label = [i.label for i in self.depth_headers]
    self.year_headers = process_header_data(self.df, HeaderType.YEARS)
    self.year_headers_name = [i.name for i in self.year_headers]
    self.year_headers_label = [i.label for i in self.year_headers]
    self.sample_df = self.df[self.sample_headers_name]
    self.depth_df = self.df[self.depth_headers_name]
    self.headers = process_header_data(self.df)
    self.names = [i.name for i in self.headers]
    self.value = [i.htype.value for i in self.headers]
    # Two-level columns: (header-type value, original column name).
    self.df_multi.columns = [self.value, self.names]
    # Summary statistics over the 'Sample'-typed columns only.
    self.dx = self.df_multi.xs('Sample', axis=1).describe().T
def testProcessHeaderData(self):
    '''
    Headers matching no known pattern parse as UNKNOWN; a mixed set of
    headers is routed to the matching HeaderType buckets.
    '''
    unknown_cols = ['test_sample (ppb)', 'test2 Not in (ppb)']
    frame = DataFrame(numpy.random.randn(10, 2), columns=unknown_cols)
    # Silence the warnings emitted for unrecognised headers.
    logging.disable(logging.CRITICAL)
    parsed = process_header_data(frame)
    for header in parsed[:2]:
        self.assertEqual(HeaderType.UNKNOWN, header.htype)
    self.assertEqual(2, len(process_header_data(frame, HeaderType.UNKNOWN)))

    mixed_cols = ["Dat011216V2", "depth (m we) ", "Sr (ng/L)"]
    frame = DataFrame(numpy.random.randn(10, 3), columns=mixed_cols)
    self.assertEqual(
        HeaderType.YEARS,
        process_header_data(frame, HeaderType.YEARS)[0].htype)
    self.assertEqual(
        HeaderType.DEPTH,
        process_header_data(frame, HeaderType.DEPTH)[0].htype)
    self.assertEqual(
        HeaderType.SAMPLE,
        process_header_data(frame, HeaderType.SAMPLE)[0].htype)
def univariate_spline(df: DataFrame, var=45, depth_column: str = 'depth (m abs)'):
    '''
    Resample each sample column onto an evenly spaced depth grid using a
    univariate smoothing spline.

    Fixes: the old list rotation (``colnames[-1:] + colnames[:-1]``) only
    placed the depth column first when ``pandas.concat`` sorted the dict
    keys; on pandas versions that preserve insertion order it rotated a
    sample column to the front instead. The depth column is now moved to
    the front explicitly. The depth column name is also parameterised.

    :param df: Input data containing a depth column and sample columns.
    :param var: Number of evenly spaced depth points to evaluate at.
    :param depth_column: Name of the depth column to interpolate against.
    :return: New DataFrame — depth column first, then the splined samples.
    '''
    sample_header_names = [
        h.name for h in process_header_data(df, HeaderType.SAMPLE)
    ]
    x = df[depth_column]
    xs = np.linspace(min(x), max(x), var)
    columns = {depth_column: pandas.Series(xs)}
    for name in sample_header_names:
        spl = UnivariateSpline(x, df[name])
        columns[name] = pandas.Series(spl(xs))
    spline_df = pandas.concat(columns, axis=1)
    # Explicitly put the depth column first, regardless of concat ordering.
    ordered = [depth_column] + [c for c in spline_df.columns
                                if c != depth_column]
    return spline_df[ordered]
def normalize_data(df: DataFrame):
    '''
    Min-max normalise every sample column of ``df`` in-place.

    :param df: The data to normalise.
    :return: The same DataFrame with sample columns rescaled to [0, 1].
    '''
    cols = [h.name for h in process_header_data(df, HeaderType.SAMPLE)]

    def min_max(series):
        lo = series.min(axis=0)
        return (series - lo) / (series.max(axis=0) - lo)

    df[cols] = df[cols].transform(min_max)
    return df
def robust_scaler(df: DataFrame) -> DataFrame:
    '''
    Scale every sample column in-place with ``preprocessing.robust_scale``.

    :param df: The data to scale.
    :return: The same DataFrame with sample columns scaled.
    '''
    cols = [h.name for h in process_header_data(df, HeaderType.SAMPLE)]

    def _robust(series):
        return preprocessing.robust_scale(series.to_frame()).flatten()

    df[cols] = df[cols].transform(_robust)
    return df
def scaler(df: DataFrame) -> DataFrame:
    '''
    Standardise every sample column in-place with ``preprocessing.scale``.

    :param df: The data to scale.
    :return: The same DataFrame with sample columns scaled.
    '''
    cols = [h.name for h in process_header_data(df, HeaderType.SAMPLE)]
    df[cols] = df[cols].transform(preprocessing.scale)
    return df
def __init__(self, df: DataFrame):
    '''
    Precompute per-header-type views (sample / depth / year) of ``df``.

    :param df: The cleaned data to slice by header type.
    '''
    self.df = df

    # Local helpers for the repeated name/label extraction.
    def names_of(headers):
        return [h.name for h in headers]

    def labels_of(headers):
        return [h.label for h in headers]

    self.sample_headers = process_header_data(self.df, HeaderType.SAMPLE)
    self.sample_headers_name = names_of(self.sample_headers)
    self.sample_headers_label = labels_of(self.sample_headers)
    self.sample_headers_class = [h.hclass for h in self.sample_headers]

    self.depth_headers = process_header_data(self.df, HeaderType.DEPTH)
    self.depth_headers_name = names_of(self.depth_headers)
    self.depth_headers_label = labels_of(self.depth_headers)

    self.year_headers = process_header_data(self.df, HeaderType.YEARS)
    self.year_headers_name = names_of(self.year_headers)
    self.year_headers_label = labels_of(self.year_headers)

    self.sample_df = self.df[self.sample_headers_name]
    self.depth_df = self.df[self.depth_headers_name]
    self.year_df = self.df[self.year_headers_name]
    self.sample_year_df = self.df[self.year_headers_name +
                                  self.sample_headers_name]
    self.year_sample_headers = self.year_headers + self.sample_headers
def quantile_transform_scaler(df: DataFrame) -> DataFrame:
    '''
    Transform every sample column in-place with
    ``preprocessing.quantile_transform``.

    :param df: The data to transform.
    :return: The same DataFrame with sample columns transformed.
    '''
    cols = [h.name for h in process_header_data(df, HeaderType.SAMPLE)]

    def _quantile(series):
        return preprocessing.quantile_transform(series.to_frame()).flatten()

    df[cols] = df[cols].transform(_quantile)
    return df
def lfilter_filter(df: DataFrame) -> DataFrame:
    '''
    Apply a 2nd-order Butterworth low-pass filter (cutoff 0.1, via
    ``lfilter``) to every sample column in-place.

    :param df: The data to filter.
    :return: The same DataFrame with sample columns filtered.
    '''
    numerator, denominator = butter(2, 0.1)
    cols = [h.name for h in process_header_data(df, HeaderType.SAMPLE)]

    def _filter(series):
        return lfilter(numerator, denominator, series)

    df[cols] = df[cols].transform(_filter)
    return df
def normalize_min_max_scaler(df: DataFrame) -> DataFrame:
    '''
    Rescale every sample column in-place with ``preprocessing.minmax_scale``.

    Note: doesn't take nan values.

    :param df: The data to normalise.
    :return: The same DataFrame with sample columns rescaled.
    '''
    cols = [h.name for h in process_header_data(df, HeaderType.SAMPLE)]
    df[cols] = df[cols].transform(preprocessing.minmax_scale)
    return df
def wiener_filter(df: DataFrame):
    '''
    Apply the Wiener filter to the columns of the supplied data.

    The filter is only applied to columns that appear as samples in the
    default header dictionary. Modifications occur in-place.

    Fixes: the old docstring incorrectly described this as a "spline
    filter" returning "resampled" data; it applies ``scipy.signal.wiener``.

    :param df: The data to filter
    :return: The filtered data
    '''
    sample_header_names = [
        h.name for h in process_header_data(df, HeaderType.SAMPLE)
    ]
    # Pass the function directly; the lambda wrapper added nothing.
    df[sample_header_names] = df[sample_header_names].transform(wiener)
    return df
def replace_outliers(df: DataFrame, val: float64 = np.nan,
                     num_std: float = 3) -> DataFrame:
    '''
    Replace the outliers in the data on a column based calculation. The
    mean and standard deviation for each column is calculated to use.

    :param df: The data to replace outliers in
    :param val: The new value to use (the default is :data:`np.nan`)
    :param num_std: The number of standard deviations to use as a threshold
    :return: Data with values outside the threshold replaced
    '''
    cols = [h.name for h in process_header_data(df, HeaderType.SAMPLE)]

    def _replace(series):
        return replace(series, val, num_std)

    df[cols] = df[cols].transform(_replace)
    return df
def savgol_smooth_filter(df: DataFrame, window_length: int = 7):
    '''
    Apply the Savitzky-Golay filter to the columns of the supplied data.
    The filter is only applied to columns that appear as samples in the
    default header dictionary. Modifications occur in-place.

    Fixes: a ``window_length`` of 2 previously collapsed to 1, which
    ``savgol_filter`` rejects since the window must exceed polyorder (1);
    the window is now clamped to an odd value of at least 3.

    :param df: The data to filter
    :param window_length: Filter window length; coerced to odd and >= 3
    :return: The resampled data
    '''
    if window_length % 2 == 0:
        # window_length must be odd
        window_length = window_length - 1
    # savgol_filter requires window_length > polyorder (1 here).
    window_length = max(window_length, 3)
    sample_header_names = [
        h.name for h in process_header_data(df, HeaderType.SAMPLE)
    ]
    df[sample_header_names] = df[sample_header_names].transform(
        lambda col: savgol_filter(col, window_length, 1))
    return df