def build_ladder(df, size_standard, label_name):
    """Pick the combination of candidate peaks that best fits a size ladder.

    Reduces the candidate peak positions for *label_name*, then scores every
    len(size_standard)-sized combination of candidates by a linear fit against
    the known size standard, and returns the best-ranked combination whose
    values pass Grubbs' outlier test (alpha=0.05) with no removals.

    NOTE(review): assumes len(choices) >= len(size_standard); otherwise
    combinations() yields nothing, the loop never runs, and `ladder` is
    unbound at the final return (UnboundLocalError) — confirm callers
    guarantee enough candidates.
    """
    choices, std = reduce_choices(df, label_name)
    ss = np.array(size_standard)
    if len(choices) < len(size_standard):
        print('\tWARNING: len(choices) = {}, k = {}'.format(
            len(choices), len(size_standard)))
    # Rows of X: every k-combination of candidate positions, sorted ascending.
    X = np.array(
        [sorted(list(c)) for c in combinations(choices, len(size_standard))])
    # print('\t{} choose {} -> {:,} combos'.format(len(choices), len(size_standard), len(X)))
    # Fit size -> position for ALL combinations at once; with full=True,
    # pfit_zx[1] holds the per-column sums of squared residuals.
    pfit_zx = np.polyfit(ss, X.T, deg=1, full=True)
    residuals_zx = pfit_zx[1]
    X_mean = np.expand_dims(np.mean(X, axis=1), axis=1)
    # Pseudo R^2 used only for RANKING. NOTE(review): residuals_zx is
    # already a sum of squares, so np.square() squares it again, and the
    # denominator is one scalar over ALL combinations — the ordering by
    # residual is preserved, but these are not true per-fit R^2 values.
    R_sq_zx = 1.0 - (np.square(residuals_zx) / np.sum(np.square(X - X_mean)))
    # i = np.argmax(R_sq_zx)
    # np.unique sorts ascending; reversing its first-occurrence indices walks
    # candidate ladders from best fit to worst.
    ranked_R_sq, indices = np.unique(R_sq_zx, return_index=True)
    indices = indices.tolist()
    indices.reverse()
    for i in indices:
        ladder = X[i]
        Y = df[ladder]
        # print('len(ladder) = {}'.format(len(ladder)))
        # Accept the first ladder from which Grubbs' test removes nothing.
        Ygrubb = grubbs.test(Y.tolist(), alpha=0.05)
        if len(Y) == len(Ygrubb):
            break
    return ladder
def grubbs_out(df, parameter):
    """Remove outliers from each named column via Grubbs' test (alpha=0.05).

    Starts from the timestamp column ('记录时间'), outer-joins the cleaned
    series for every column name in *parameter*, and finally drops all rows
    containing NaN, so only timestamps where every column survived remain.
    """
    result = df[['记录时间']]
    for column in parameter:
        print('正在去除({})参数的异常值'.format(column))
        cleaned = pd.DataFrame(grubbs.test(df[column], alpha=0.05))
        result = result.join(cleaned, how='outer')
    print('丢弃所有的空值')
    return result.dropna()
def extract_wave_tracer_features(df):
    # Build per-image (ROI) histogram features from wave-tracer trajectory
    # lengths: count rows per WaveTracerID, drop outlier lengths with Grubbs'
    # test (alpha=0.05), bin the cleaned lengths with global Freedman bins,
    # and emit one feature row per image. Also writes samplesWaveTracer.csv
    # into OUTPUT_DIR. Returns (features_dataframe, histogram_label_list).
    #
    # NOTE(review): only the BINNING_TYPE == "freedman_std" path is
    # implemented; for any other value the final pd.concat/return fails on
    # undefined names — confirm BINNING_TYPE is always "freedman_std".
    # NOTE(review): this function appears twice in this file; the later
    # definition wins at import time.
    df_columns = ['ImageNumber', 'total_movement']  # NOTE(review): unused
    dataTab_noOutliers = pd.DataFrame()
    data_df = pd.DataFrame()
    global_df = pd.DataFrame()  # NOTE(review): unused
    # NOTE(review): chained assignment binds all three names to the SAME
    # list; vec is rebound inside the loop and dfRes is never used, so in
    # practice only `samples` matters.
    samples = vec = dfRes = []
    total_length = 0
    if BINNING_TYPE == "freedman_std":
        for ROI, data in df.groupby('ImageNumber'):
            dataTab = []
            for ROI2, data2 in data.groupby('WaveTracerID'):
                # Trajectory length = number of rows for this WaveTracerID.
                dataTab.append(len(data2['WaveTracerID']))
            print ROI
            data4grubbs = pd.Series(dataTab)
            total_length += len(data4grubbs.index)
            # Remove outlier trajectory lengths for this image.
            grubbsResult = grubbs.test(data4grubbs, alpha=0.05)
            vec = len(grubbsResult) * [ROI]
            dataTab_noOutliers = pd.DataFrame({
                'ImageNumber': vec,
                'total_movement': grubbsResult.tolist()
            })
            data_df = data_df.append(dataTab_noOutliers, ignore_index=True)
        print "SIZE BEFORE GRUBBS : " + str(total_length)
        print "SIZE AFTER GRUBBS : " + str(len(data_df.index))
        print "TRAJECTORY REMOVED : " + str(total_length - len(data_df.index))
        print "TRAJECTORY Keeped : " + str(
            (len(data_df.index) * 100) / total_length) + "%"
        # print "MAX TRAJECTORY SIZE (afg): "+str(data_df['total_movement'].max())
        data_df['total_movement'].plot(kind='line')
        # Global Freedman-Diaconis bins over ALL cleaned trajectory lengths,
        # shared by every image so feature vectors are comparable.
        da_global, bins_global = freedman_bin_width(data_df['total_movement'],
                                                    True)
        WTRACER_MIN = bins_global[0]
        WTRACER_MAX = bins_global[len(bins_global) - 1]
        WTRACER_HISTOGRAM_BINS = bins_global
        WTRACER_HISTOGRAM_LABELS = [
            "HIST_WTRACER_%f" % _ for _ in WTRACER_HISTOGRAM_BINS[:-1]
        ]
        print WTRACER_HISTOGRAM_LABELS
        # One histogram feature vector per image, indexed by ROI.
        for ROI, data in data_df.groupby('ImageNumber'):
            wtracer_features = generate_feature_vector(
                data['total_movement'], WTRACER_MIN, WTRACER_MAX,
                WTRACER_HISTOGRAM_BINS, WTRACER_HISTOGRAM_LABELS)
            wtracer_features['index'] = ROI
            wtracer_features = wtracer_features.set_index('index')
            samples.append(wtracer_features)
    pd.concat(samples).to_csv(os.path.join(OUTPUT_DIR,
                                           "samplesWaveTracer.csv"),
                              sep=",")
    return (pd.concat(samples), WTRACER_HISTOGRAM_LABELS)
def comparisonPlot(logname, dirs, folder, warm_start, mean_size=0):
    """
    Creates a comparative plot between multiple runs, to find optimizing hyperparameter sets.
    @param logname: string, the name of the logfile to be compared
    @param dirs: string list, contains the names of the folders containing the logfiles for each run
    @param folder: string, name of the folder to contain the comparative results
    @param warm_start: int, drops the first warm_start data points to allow useful visual presentation
    @param mean_size: int, builds a mean point over this many data points
    """
    fig = go.Figure()
    xstring = "Epoch x100"
    ystring = "Value"
    fig.update_layout(
        title=go.layout.Title(text=logname, xref="paper", x=0),
        xaxis=go.layout.XAxis(title=go.layout.xaxis.Title(text=xstring, )),
        yaxis=go.layout.YAxis(title=go.layout.yaxis.Title(text=ystring, )),
        font=dict(family="Arial", size=16, color="#505050"))
    for i in range(0, len(dirs)):
        dataframe = pd.read_csv("results/" + dirs[i] + "/logs/" + logname +
                                "_log.txt",
                                header=None,
                                index_col=False)
        data = dataframe.to_numpy().flatten()
        data = data[warm_start:]  # Ignore first warm_start epochs
        data = grubbs.test(data, alpha=1.0)  # 0.999)  # Removes outliers
        if len(data) == 0:
            # Nothing left after warm_start/outlier removal; skip this run
            # instead of dividing by zero below.
            continue
        if mean_size != 0:
            # Pad with the last value so len(data) is a multiple of mean_size,
            # then average each mean_size-sized chunk into one point.
            remainder = len(data) % mean_size
            while remainder != 0 and remainder != mean_size:
                data = np.append(data, data[-1])
                remainder += 1
            spline_data = np.mean(data.reshape(-1, mean_size), axis=1)
            fig.add_trace(
                go.Scatter(x=list(range(0, len(data))[::mean_size]),
                           y=spline_data,
                           name=dirs[i],
                           line_shape='linear'))  # line_shape='spline'
        else:
            avg = sum(data) / len(data)
            # Smooth extreme spikes (> 100x the mean) by replacing them with
            # the average of their neighbours.
            # BUGFIX: the original indexed data[idx + 1] unconditionally,
            # raising IndexError when the spike was the LAST point (and
            # silently wrapped to data[-1] when it was the first); clamp to
            # the nearest existing neighbour instead.
            for idx in range(len(data)):
                if data[idx] > avg * 100:
                    left = data[idx - 1] if idx > 0 else data[idx + 1]
                    right = data[idx + 1] if idx < len(data) - 1 else data[idx - 1]
                    data[idx] = (left + right) / 2  # Avg
            fig.add_trace(
                go.Scatter(x=list(range(0, len(data))),
                           y=data,
                           mode="lines",
                           name=dirs[i]))
    plotly.offline.plot(fig,
                        filename="results/" + folder + "/comparison_" +
                        logname + ".html")  # includes fig.show()
def filter_numeric_data(data_frame, parameter):
    """Remove NaN values from a column and run Grubbs' outlier test on it.

    Uses alpha=0.05 for removal of outliers.

    Arguments
    ---
    data_frame : pd.DataFrame
        Data frame to be parsed
    parameter : str
        Name of column in data_frame to be checked

    Returns
    ---
    grubbs.test : pd.Series
        Result from Grubbs' test, telling about outliers removed

    Raises
    ---
    AttributeError
        If *parameter* is not a column of *data_frame*.
    ValueError
        If the column cannot be converted to numeric values.
    KeyError
        If the underlying smirnov_grubbs implementation fails on the input.
    """
    if parameter not in data_frame.columns:
        raise AttributeError(f"Could not find {parameter} in data frame.")
    # Keep only the rows where the target column is not NaN.
    without_nan = data_frame[pd.notnull(data_frame[parameter])]
    column = pd.Series(without_nan[parameter])
    # The index must be reset for the Grubbs implementation to work.
    column.reset_index(drop=True, inplace=True)
    try:
        # Grubbs' test only operates on numeric data.
        numeric_column = pd.to_numeric(column)
    except ValueError:
        raise ValueError(
            "Grubbs' test cannot be performed due to non-numeric data. Please check your data."
        )
    # smirnov_grubbs.py occasionally raises KeyError on certain large
    # outliers (root cause unknown); translate it into a clearer message.
    try:
        return grubbs.test(numeric_column, alpha=0.05)
    except KeyError:
        raise KeyError(
            """Grubbs' test cannot be performed due to input data. Try removing manually any outlier that is largely deviating from the distribution"""
        )
def abnormal_x6_x8(l_data, data):
    """Classify a window as type 8 (jitter) or type 6.

    Computes the absolute first differences of *data* and runs Grubbs' test
    (alpha=0.01) over them; if any difference is rejected as an outlier the
    window is classified as 8, otherwise 6.

    l_data is accepted for interface compatibility but not used here.

    NOTE: a single outlier difference flags the whole window as jitter,
    which may be too coarse for long windows (original author's caveat).
    """
    first_diffs = [
        abs(float(data.iloc[pos].values) - float(data.iloc[pos + 1].values))
        for pos in range(len(data) - 1)
    ]
    kept = set(grubbs.test(first_diffs, alpha=0.01))
    # The differences Grubbs' test rejected as outliers.
    rejected = set(first_diffs).difference(kept)
    return 8 if len(rejected) > 0 else 6
def func_jump_error(value):
    """Return the rows of *value* whose 'b' values Grubbs' test rejects.

    *value* is a single-column-of-interest DataFrame; Grubbs' test
    (alpha=0.01) keeps the inliers of column 'b', and every row holding an
    inlier value is filtered out, leaving only the outlier (jump) rows.
    """
    outlier_rows = value.copy()
    series_b = pd.Series(value['b'].values, index=value['b'].index)
    inliers = list(set(grubbs.test(series_b, alpha=0.01)))
    # Drop each surviving (inlier) value from the copy, one at a time.
    for kept_value in inliers:
        outlier_rows = outlier_rows[~outlier_rows['b'].isin([kept_value])]
    # What remains are the detected outlier rows.
    return outlier_rows
def abnormal_x6_x8(data, data_list):
    # Classify a window: 0 when the sigma_ precondition holds or there are no
    # usable differences, 8 when Grubbs' test rejects any successive absolute
    # difference as an outlier.
    #
    # NOTE(review): when Grubbs' test removes nothing, the function falls
    # through and implicitly returns None — there is no `return 6` here,
    # unlike the similarly named variant elsewhere in this file. Confirm
    # whether that is intended.
    # Shared check for both anomaly types.
    if sigma_(data) == True:
        return 0
    # if data.std().values[0]<data.mean().values[0]*0.2:  # skip the x6/x8
    #     return 0  # check if short bursts stay below some threshold
    # Distinguishing step: absolute differences of consecutive points,
    # skipping any pair that contains a None.
    D_value = []
    for item_x6 in range(len(data_list) - 1):
        if (data_list[item_x6] == None) or (data_list[item_x6 + 1] == None):
            continue
        D_value.append(
            abs(float(data_list[item_x6]) - float(data_list[item_x6 + 1])))
    if len(D_value) == 0:
        return 0
    else:
        # Grubbs' test (alpha=0.01) returns the inlier differences.
        data_x6 = set(grubbs.test(D_value, alpha=0.01))
        # Differences rejected as outliers.
        data_x6_remove = list(set(D_value).difference(data_x6))
        if (len(data_x6_remove) > 0):
            return 8
def extract_wave_tracer_features(df):
    # Per-image (ROI) feature extraction from wave-tracer trajectory lengths:
    # length = rows per WaveTracerID; outlier lengths removed via Grubbs'
    # test (alpha=0.05); cleaned lengths binned with one global set of
    # Freedman bins; one feature row emitted per image. Side effect: writes
    # samplesWaveTracer.csv under OUTPUT_DIR.
    # Returns (features_dataframe, histogram_label_list).
    #
    # NOTE(review): duplicate of an earlier definition in this file — this
    # later one shadows it at import time. Only the
    # BINNING_TYPE == "freedman_std" path is implemented; other values make
    # the final pd.concat/return fail on undefined names.
    df_columns = ['ImageNumber', 'total_movement']  # NOTE(review): unused
    dataTab_noOutliers = pd.DataFrame()
    data_df = pd.DataFrame()
    global_df = pd.DataFrame()  # NOTE(review): unused
    # NOTE(review): all three names alias ONE list; vec is rebound in the
    # loop and dfRes never used, so effectively only `samples` matters.
    samples = vec = dfRes = []
    total_length = 0
    if BINNING_TYPE == "freedman_std":
        for ROI, data in df.groupby('ImageNumber'):
            dataTab = []
            for ROI2, data2 in data.groupby('WaveTracerID'):
                # Trajectory length for this WaveTracerID.
                dataTab.append(len(data2['WaveTracerID']))
            print ROI
            data4grubbs = pd.Series(dataTab)
            total_length += len(data4grubbs.index)
            # Drop outlier trajectory lengths for this image.
            grubbsResult = grubbs.test(data4grubbs, alpha=0.05)
            vec = len(grubbsResult) * [ROI]
            dataTab_noOutliers = pd.DataFrame({
                'ImageNumber': vec,
                'total_movement': grubbsResult.tolist()
            })
            data_df = data_df.append(dataTab_noOutliers, ignore_index=True)
        print "SIZE BEFORE GRUBBS : " + str(total_length)
        print "SIZE AFTER GRUBBS : " + str(len(data_df.index))
        print "TRAJECTORY REMOVED : " + str(total_length - len(data_df.index))
        print "TRAJECTORY Keeped : " + str(
            (len(data_df.index) * 100) / total_length) + "%"
        # print "MAX TRAJECTORY SIZE (afg): "+str(data_df['total_movement'].max())
        data_df['total_movement'].plot(kind='line')
        # Freedman-Diaconis bins computed once over all cleaned lengths so
        # per-image histograms share the same bin edges.
        da_global, bins_global = freedman_bin_width(data_df['total_movement'],
                                                    True)
        WTRACER_MIN = bins_global[0]
        WTRACER_MAX = bins_global[len(bins_global) - 1]
        WTRACER_HISTOGRAM_BINS = bins_global
        WTRACER_HISTOGRAM_LABELS = [
            "HIST_WTRACER_%f" % _ for _ in WTRACER_HISTOGRAM_BINS[:-1]
        ]
        print WTRACER_HISTOGRAM_LABELS
        # Build one histogram feature vector per image, indexed by ROI.
        for ROI, data in data_df.groupby('ImageNumber'):
            wtracer_features = generate_feature_vector(
                data['total_movement'], WTRACER_MIN, WTRACER_MAX,
                WTRACER_HISTOGRAM_BINS, WTRACER_HISTOGRAM_LABELS)
            wtracer_features['index'] = ROI
            wtracer_features = wtracer_features.set_index('index')
            samples.append(wtracer_features)
    pd.concat(samples).to_csv(os.path.join(OUTPUT_DIR,
                                           "samplesWaveTracer.csv"),
                              sep=",")
    return (pd.concat(samples), WTRACER_HISTOGRAM_LABELS)
def grubbsfunc(ratiodff, columns):
    """
    Take a dataframe of RLU values for replicates, find any outliers based
    on the Grubb's test and return modified
    """
    ratiodict = {}
    for row_idx in xrange(len(ratiodff)):
        # NaN-filled single-row frame (transposed) used as the template the
        # cleaned values are merged into.
        template = pd.DataFrame.transpose(
            pd.DataFrame(np.nan, index=[row_idx + 1], columns=columns))
        replicate_row = ratiodff.loc[int(row_idx + 1)]
        if len(list(replicate_row)) > 2:
            # Grubbs' test needs more than 2 values to detect outliers.
            template.update(grubbs.test(replicate_row, alpha=0.05))
        else:
            # Too few replicates: keep the raw values untouched.
            template.update(replicate_row)
        ratiodict.update(template.to_dict())
    return pd.DataFrame.from_dict(ratiodict).transpose()
def X8_X9(data):
    # Decide between anomaly types X8 and X9 from column 1 of *data* and
    # record the affected ratio in the module-level dict_rate, which is
    # returned.
    #
    # NOTE(review): grubbs.test returns the INLIERS, which by construction
    # come from data.values[:, 1], so the membership check below seemingly
    # can never increment count (leaving count == 0, always flagging X9
    # with rate 0) unless NaN-like values are involved — verify intent.
    # data_copy = data.values[:,1].copy()
    # data_series = pd.Series(value['b'].values, index=value['b'].index)
    temp = set(grubbs.test(data.values[:, 1], alpha=0.01))
    # print(temp)
    count = 0
    flag8, flag9 = 0, 0
    # all_right_data = func_jump_error((data_series))
    all_right_data = list(temp)
    for item_right in all_right_data:
        # Count inliers that do not appear in the source column.
        if item_right not in data.values[:, 1]:
            count += 1
    # Fewer than 10 such values -> X9, otherwise X8; the losing flag's rate
    # is zeroed afterwards so only one of the two keys is non-zero.
    if count < 10:
        flag9 = 1
        dict_rate['X9'] = count / len(data)
    if count >= 10:
        flag8 = 1
        dict_rate['X8'] = count / len(data)
    if flag9 == 1:
        dict_rate['X8'] = 0
    if flag8 == 1:
        dict_rate['X9'] = 0
    return dict_rate
def reject_outliers(self, val_array):
    """Return *val_array* with outliers removed via Grubbs' test (alpha=0.05)."""
    return grubbs.test(val_array, alpha=0.05)
import json

import plotly.graph_objects as go
from outliers import smirnov_grubbs as grubbs

# Load the raw comment counts from disk.
with open('./boxplotNumberComments.json') as source:
    comment_counts = json.load(source)

# Strip outliers (Grubbs' test, alpha=0.20) before plotting.
cleaned_counts = grubbs.test(comment_counts, alpha=0.20)

figure = go.Figure()
figure.add_trace(go.Box(y=cleaned_counts, name='comments'))
figure.show()