def check_values(dfs, threshold) -> bool:
    """
    Compare two dataframes column by column, tolerating small numeric differences.

    Column names and row indices are assumed to be the same in both dataframes.
    Every differing pair of values is logged:
      * numeric difference below ``threshold``  -> warning (tolerated)
      * numeric difference at/above ``threshold`` -> error
      * values that cannot be converted to float  -> error

    :param dfs: sequence of exactly two dataframes
    :param threshold: absolute numeric difference below which a mismatch is tolerated
    :return: True if no error was logged (frames equal, or all differences are below
             ``threshold``); False otherwise
    """
    assert len(dfs) == 2, "check_values compares exactly two dataframes"
    left_df, right_df = dfs

    all_ok = True
    for c in left_df.columns:
        left, right = left_df[c], right_df[c]
        if left.equals(right):
            continue

        # Boolean mask of rows whose values differ.  NaN != NaN evaluates to
        # True, so rows missing on BOTH sides are masked out explicitly to stay
        # consistent with ``equals`` (which treats them as equal).
        differs = (left != right) & ~(left.isna() & right.isna())

        # Both sides are filtered with the same mask, so pairing by position is
        # safe even when index labels are duplicated.
        for a, b in zip(left[differs], right[differs]):
            try:
                x, y = float(a), float(b)
            except (TypeError, ValueError):
                # not numeric - any difference is a real mismatch
                Log.err("%s != %s" % (a, b))
                all_ok = False
                continue

            delta = abs(x - y)
            if delta < threshold:
                Log.warn("abs(%f - %f) = %f < %f" % (x, y, delta, threshold))
            else:
                # also reached when exactly one side is NaN (delta is NaN)
                Log.err("abs(%f - %f) = %f >= %f" % (x, y, delta, threshold))
                all_ok = False
    return all_ok
def check_column_is_in_dataframes(dfs, col_name) -> bool:
    """
    Check whether all given columns are in all given dataframes.

    Dataframes are checked in order; checking stops at the first dataframe that
    lacks one or more of the requested columns, and each missing column of that
    dataframe is logged as an error.

    :param dfs: a dataframe or a list of dataframes
    :param col_name: a column name or a list of column names to look for
                     (a non-list value, e.g. a tuple, is treated as ONE name)
    :return: True if all columns are found in all dataframes; False otherwise
    """
    if not isinstance(dfs, list):
        dfs = [dfs]
    if not isinstance(col_name, list):
        col_name = [col_name]

    # De-duplicate once, keeping the caller's order so log output is stable.
    wanted = list(dict.fromkeys(col_name))

    for df in dfs:
        present = set(df.columns)
        # we only care that all given column names are in the DF
        missing = [c for c in wanted if c not in present]
        if missing:
            for c in missing:
                # wrap in a 1-tuple: a tuple column name would otherwise be
                # unpacked as several format arguments and raise TypeError
                Log.err("Col [%s] in List but not in DF" % (c,))
            return False
    return True
def _infer_timestamp_formats(sample):
    """
    Infer strptime formats for a timestamp string.

    Accepts YYYY mm dd [HH MM SS [fff...]] where each pair of fields is separated
    by at most one non-digit character.

    :param sample: timestamp string to inspect
    :return: (parse_format, display_format), or None if the layout is not recognised.
             parse_format reproduces the separators actually found in ``sample``;
             display_format is the fixed '/', ':' and '.' layout used on the plot axis
    """
    pattern = (r'^[0-9]{4}([^0-9]?)[0-9]{2}([^0-9]?)[0-9]{2}'
               r'(?:([^0-9]?)[0-9]{2}([^0-9]?)[0-9]{2}([^0-9]?)[0-9]{2}'
               r'(?:([^0-9]?)[0-9]{3,})?)?$')
    found = re.match(pattern, sample)
    if found is None:
        return None

    # a literal '%' separator must be escaped inside a strptime format
    seps = [s if s is None else s.replace('%', '%%') for s in found.groups()]

    parse_fmt = '%Y' + seps[0] + '%m' + seps[1] + '%d'
    display_fmt = '%Y/%m/%d'
    if seps[2] is not None:
        # time-of-day part is present
        parse_fmt += seps[2] + '%H' + seps[3] + '%M' + seps[4] + '%S'
        display_fmt = '%Y/%m/%d %H:%M:%S'
        if seps[5] is not None:
            # fractional seconds are present
            parse_fmt += seps[5] + '%f'
            display_fmt = '%Y/%m/%d %H:%M:%S.%f'
    return parse_fmt, display_fmt


def plot_timeseries(dfs, idx_name, unit=None):
    """
    Plot timeseries given curated dataframes

    The timestamp layout is inferred from the first row of the first dataframe
    and applied to every dataframe. The caller's dataframes are not modified;
    conversion and NaN filling happen on copies.

    :param dfs: dataframe or list of dataframes - assumed to consist of the idx column
                and all columns to be plotted
    :param idx_name: column name of index, assumed to be some timestamp format.
                     must exist in all dataframes
    :param unit: if index column is integer, the time unit these values are in -
                 ie ['D', 's', 'ms', 'us', 'ns']
    :return: nothing - just pretty plots
    """
    # convert single dataframe into list
    if not isinstance(dfs, list):
        dfs = [dfs]

    if not dfh.check_column_is_in_dataframes(dfs, idx_name):
        return

    # work on copies so timestamp conversion never touches the caller's data
    frames = [df.copy() for df in dfs]

    if frames[0].empty:
        Log.err("Empty dataframe - cannot infer format for index %s" % idx_name)
        return

    # Infer timestamp format from the first value.  Read it column-wise so the
    # value keeps the column's dtype (row-wise access may upcast it).
    sample = frames[0][idx_name].iloc[0]

    if pd.api.types.is_integer(sample) and unit in ('D', 's', 'ms', 'us', 'ns'):
        # numeric epoch values (python or numpy integers)
        for df in frames:
            df[idx_name] = pd.to_datetime(df[idx_name], unit=unit)
        if unit in ('ms', 'us', 'ns'):
            fmt = '%Y/%m/%d %H:%M:%S.%f'
        elif unit == 's':
            fmt = '%Y/%m/%d %H:%M:%S'
        else:
            fmt = '%Y/%m/%d'
    elif isinstance(sample, str):
        formats = _infer_timestamp_formats(sample)
        if formats is None:
            Log.err("Unknown format for index %s : [%s]" % (idx_name, sample))
            return
        parse_fmt, fmt = formats
        # parse with the separators actually present in the data
        for df in frames:
            df[idx_name] = pd.to_datetime(df[idx_name], format=parse_fmt)
    else:
        Log.err("Unknown dtype for %s : [%s]" %
                (idx_name, frames[0][idx_name].dtype))
        return

    # format timestamp in plot
    date_form = matplotlib.dates.DateFormatter(fmt)

    # plot every non-index column of every dataframe
    _fig, ax = plt.subplots(figsize=(8.8, 8))
    for n, df in enumerate(frames):
        for c in df.columns:
            if c == idx_name:
                continue
            # fill nans - fwd then back; a single pass of each fills everything
            # that can be filled (an all-NaN column simply stays empty)
            values = df[c].ffill().bfill()
            # label lines so the legend has entries; tag with the frame number
            # when several dataframes are drawn on the same axes
            label = str(c) if len(frames) == 1 else "%s [%d]" % (c, n)
            ax.plot(df[idx_name], values, label=label)

    # set the date format
    ax.xaxis.set_major_formatter(date_form)
    plt.setp(ax.get_xticklabels(), rotation=30, ha="right", fontsize=8)
    # make bottom margin larger to accommodate timestamp length
    plt.subplots_adjust(bottom=0.15)
    plt.legend(loc='best')
    plt.show()