Code example #1
import numpy as np
from itertools import combinations
from outliers import smirnov_grubbs as grubbs


def build_ladder(df, size_standard, label_name):
    # reduce_choices is defined elsewhere in the project; it yields the
    # candidate peak positions for this label.
    choices, std = reduce_choices(df, label_name)
    ss = np.array(size_standard)
    if len(choices) < len(size_standard):
        print('\tWARNING: len(choices) = {}, k = {}'.format(
            len(choices), len(size_standard)))
    # Every sorted combination of candidate peaks, one peak per ladder step.
    X = np.array(
        [sorted(c) for c in combinations(choices, len(size_standard))])
    # Fit a line size_standard -> peak position for all combinations at once;
    # full=True also returns the sum of squared residuals per combination.
    pfit_zx = np.polyfit(ss, X.T, deg=1, full=True)
    residuals_zx = pfit_zx[1]
    X_mean = np.expand_dims(np.mean(X, axis=1), axis=1)
    R_sq_zx = 1.0 - (np.square(residuals_zx) / np.sum(np.square(X - X_mean)))
    # np.unique sorts ascending; reverse to try the best-fitting ladders first.
    ranked_R_sq, indices = np.unique(R_sq_zx, return_index=True)
    indices = indices.tolist()
    indices.reverse()
    for i in indices:
        ladder = X[i]
        Y = df[ladder]
        # Accept the first ladder whose values pass Grubbs' test unchanged,
        # i.e. no outliers were removed.
        Ygrubb = grubbs.test(Y.tolist(), alpha=0.05)
        if len(Y) == len(Ygrubb):
            break
    return ladder
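The combinatorial fit above is easier to see on synthetic data: enumerate candidate-peak combinations, score each with a linear fit against the known size standard, and keep the best. A minimal sketch; the helper name best_ladder and the data are invented, not part of the project above:

import numpy as np
from itertools import combinations

def best_ladder(candidates, size_standard):
    """Pick the subset of candidate peaks that fits the size standard best."""
    ss = np.asarray(size_standard, dtype=float)
    best, best_ssr = None, np.inf
    for combo in combinations(sorted(candidates), len(size_standard)):
        # With full=True, polyfit also returns the sum of squared residuals.
        _, ssr, *_ = np.polyfit(ss, np.asarray(combo, dtype=float),
                                deg=1, full=True)
        ssr = ssr[0] if ssr.size else 0.0
        if ssr < best_ssr:
            best, best_ssr = combo, ssr
    return best

standard = [50, 100, 150, 200]
peaks = [105, 205, 305, 405, 130, 377]  # true peaks follow 2*size + 5, plus two spurious ones
print(best_ladder(peaks, standard))  # expected: (105, 205, 305, 405)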
Code example #2
File: train.py  Project: weather319/SRNN
import pandas as pd
from outliers import smirnov_grubbs as grubbs


def grubbs_out(df, parameter):
    # '记录时间' is the record-time column of the source data.
    df_r = df[['记录时间']]
    for name in parameter:
        print('Removing outliers from parameter ({})'.format(name))
        # grubbs.test keeps the original index, so rows removed as
        # outliers become NaN after the outer join.
        df_i = pd.DataFrame(grubbs.test(df[name], alpha=0.05))
        df_r = df_r.join(df_i, how='outer')
    print('Dropping all rows that contain null values')
    return df_r.dropna()
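A minimal usage sketch (data invented; '记录时间' is the record-time column the function expects):

import pandas as pd

df = pd.DataFrame({
    '记录时间': pd.date_range('2020-01-01', periods=8, freq='h'),
    'temp': [20.1, 20.3, 19.9, 55.0, 20.2, 20.0, 20.4, 20.1],
    'rh': [41.0, 40.5, 40.8, 40.9, 41.2, 12.0, 40.7, 41.1],
})
clean = grubbs_out(df, ['temp', 'rh'])
print(len(clean))  # expect 6: the rows holding 55.0 and 12.0 should be dropped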
Code example #3
File: readfile.py  Project: dry-lab/SuperClass
def extract_wave_tracer_features(df):

    data_df = pd.DataFrame()
    # NOTE: the original bound samples, vec and dfRes to one shared list
    # (samples = vec = dfRes = []); they are kept separate here and the
    # unused names are dropped.
    samples = []
    total_length = 0

    if BINNING_TYPE == "freedman_std":
        for ROI, data in df.groupby('ImageNumber'):
            # Trajectory length (number of points) per WaveTracerID.
            dataTab = [len(data2['WaveTracerID'])
                       for ROI2, data2 in data.groupby('WaveTracerID')]
            print(ROI)
            data4grubbs = pd.Series(dataTab)
            total_length += len(data4grubbs.index)
            grubbsResult = grubbs.test(data4grubbs, alpha=0.05)
            vec = len(grubbsResult) * [ROI]
            dataTab_noOutliers = pd.DataFrame({
                'ImageNumber': vec,
                'total_movement': grubbsResult.tolist()
            })
            # DataFrame.append was removed in pandas 2.0; use pd.concat.
            data_df = pd.concat([data_df, dataTab_noOutliers],
                                ignore_index=True)

        print("SIZE BEFORE GRUBBS : " + str(total_length))
        print("SIZE AFTER GRUBBS : " + str(len(data_df.index)))
        print("TRAJECTORIES REMOVED : " + str(total_length - len(data_df.index)))
        print("TRAJECTORIES KEPT : " +
              str((len(data_df.index) * 100) / total_length) + "%")
        data_df['total_movement'].plot(kind='line')

        da_global, bins_global = freedman_bin_width(data_df['total_movement'],
                                                    True)
        WTRACER_MIN = bins_global[0]
        WTRACER_MAX = bins_global[-1]
        WTRACER_HISTOGRAM_BINS = bins_global
        WTRACER_HISTOGRAM_LABELS = [
            "HIST_WTRACER_%f" % _ for _ in WTRACER_HISTOGRAM_BINS[:-1]
        ]
        print(WTRACER_HISTOGRAM_LABELS)

        for ROI, data in data_df.groupby('ImageNumber'):
            wtracer_features = generate_feature_vector(
                data['total_movement'], WTRACER_MIN, WTRACER_MAX,
                WTRACER_HISTOGRAM_BINS, WTRACER_HISTOGRAM_LABELS)
            wtracer_features['index'] = ROI
            wtracer_features = wtracer_features.set_index('index')
            samples.append(wtracer_features)
        pd.concat(samples).to_csv(os.path.join(OUTPUT_DIR,
                                               "samplesWaveTracer.csv"),
                                  sep=",")

    return (pd.concat(samples), WTRACER_HISTOGRAM_LABELS)
Code example #4
def comparisonPlot(logname, dirs, folder, warm_start, mean_size=0):
    """
    Creates a comparative plot between multiple runs, to find optimizing hyperparameter sets.
    @param logname: string, the name of the logfile to be compared
    @param dirs: string list, contains the names of the folders containing the logfiles for each run
    @param folder: string, name of the folder to contain the comparative results
    @param warm_start: int, drops the first warm_start data points to allow useful visual presentation
    @param mean_size: int, builds a mean point over this many data points
    """
    fig = go.Figure()
    xstring = "Epoch x100"
    ystring = "Value"
    fig.update_layout(
        title=go.layout.Title(text=logname, xref="paper", x=0),
        xaxis=go.layout.XAxis(title=go.layout.xaxis.Title(text=xstring)),
        yaxis=go.layout.YAxis(title=go.layout.yaxis.Title(text=ystring)),
        font=dict(family="Arial", size=16, color="#505050"))

    for i in range(len(dirs)):
        dataframe = pd.read_csv("results/" + dirs[i] + "/logs/" + logname +
                                "_log.txt",
                                header=None,
                                index_col=False)
        data = dataframe.to_numpy().flatten()
        data = data[warm_start:]  # ignore the first warm_start epochs
        data = grubbs.test(data, alpha=1.0)  # removes outliers (alpha=0.999 was also tried)
        if mean_size != 0:
            # Pad with the last value until the length divides by mean_size.
            remainder = len(data) % mean_size
            while remainder != 0 and remainder != mean_size:
                data = np.append(data, data[-1])
                remainder += 1

            spline_data = np.mean(data.reshape(-1, mean_size), axis=1)
            fig.add_trace(
                go.Scatter(x=list(range(0, len(data))[::mean_size]),
                           y=spline_data,
                           name=dirs[i],
                           line_shape='linear'))
        else:
            avg = data.mean()
            # Replace extreme spikes with the average of their neighbours;
            # endpoints are skipped to avoid indexing out of range.
            for idx in range(1, len(data) - 1):
                if data[idx] > avg * 100:
                    data[idx] = (data[idx - 1] + data[idx + 1]) / 2
            fig.add_trace(
                go.Scatter(x=list(range(0, len(data))),
                           y=data,
                           mode="lines",
                           name=dirs[i]))

    plotly.offline.plot(fig,
                        filename="results/" + folder + "/comparison_" +
                        logname + ".html")  # includes fig.show()
Code example #5
def filter_numeric_data(data_frame, parameter):
    """Removes NaN values from a pandas DataFrame and performs
    Grubbs' test to remove outliers. By default it uses
    alpha=0.05 for the removal of outliers.

    Arguments
    ---

    data_frame : pd.DataFrame
        Data frame to be parsed

    parameter : str
        Name of the column in data_frame to be checked

    Returns
    ---

    grubbs.test : pd.Series
        Result of Grubbs' test, with the outliers removed"""

    if parameter not in data_frame.columns:
        raise AttributeError(f"Could not find {parameter} in data frame.")

    filter_nan = data_frame[pd.notnull(
        data_frame[parameter])]  # filtering NaN data
    data_series = pd.Series(filter_nan[parameter])
    data_series.reset_index(drop=True,
                            inplace=True)  # index must be reset to work

    try:
        data = pd.to_numeric(
            data_series)  # conversion necessary to use Grubbs' test

    except ValueError:
        raise ValueError(
            "Grubbs' test cannot be performed due to non-numeric data. Please check your data."
        )

    # There seems to be an issue in smirnov_grubbs.py when handling certain
    # large outliers, and I could not figure out why. This error handling
    # copes with that issue.
    try:
        return grubbs.test(data, alpha=0.05)

    except KeyError:
        raise KeyError(
            """Grubbs' test cannot be performed due to input data. Try
                       removing manually any outlier that is largely deviating
                       from the distribution""")
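A quick usage sketch with invented data (the column name 'pressure' is arbitrary):

import pandas as pd

df = pd.DataFrame({'pressure': [1.01, 1.02, None, 0.99, 1.00, 9.99]})
clean = filter_numeric_data(df, 'pressure')
print(clean.tolist())  # the NaN row is dropped and 9.99 should be removed as an outlier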
Code example #6
def abnormal_x6_x8(l_data, data):
    # Absolute differences between consecutive samples.
    D_value = []
    for item_x6 in range(len(data) - 1):
        D_value.append(
            abs(
                float(data.iloc[item_x6].values) -
                float(data.iloc[item_x6 + 1].values)))
    # grubbs.test returns the inliers; whatever it drops is an outlier.
    data_x6 = set(grubbs.test(D_value, alpha=0.01))
    data_x6_remove = list(set(D_value).difference(data_x6))
    if len(data_x6_remove) > 0:
        return 8
    # But I suspect this only holds for a limited range, not the whole
    # window; a single outlier alone should not be judged as jitter.
    return 6
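The trick used by abnormal_x6_x8 (running Grubbs' test on consecutive differences so that a sudden jump shows up as an outlying difference) can be seen on a toy series; the values below are invented:

import pandas as pd
from outliers import smirnov_grubbs as grubbs

s = pd.Series([10.0, 10.1, 10.0, 9.9, 10.0, 10.1, 10.0, 9.9, 10.0, 10.1, 25.0])
diffs = [abs(s.iloc[i] - s.iloc[i + 1]) for i in range(len(s) - 1)]
inliers = set(grubbs.test(diffs, alpha=0.01))
print(sorted(set(diffs) - inliers))  # the jump-sized difference (~14.9) should stand out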
Code example #7
def func_jump_error(value):
    # value is a one-dimensional data frame with a column 'b'.
    data_copy = value.copy()
    data_series = pd.Series(value['b'].values, index=value['b'].index)
    # grubbs.test returns the inliers.
    temp = set(grubbs.test(data_series, alpha=0.01))
    all_right_data = list(temp)
    # Drop every inlier row; what remains are the outliers.
    for item_right in all_right_data:
        data_copy = data_copy[~data_copy['b'].isin([item_right])]
    # data_copy now holds only the outlier rows.
    return data_copy
Code example #8
File: model_one.py  Project: usernamezcn/Nanjing
def abnormal_x6_x8(data, data_list):
    # What both cases have in common.
    if sigma_(data):
        return 0
    # If short bursts of increase/decrease stay below some threshold, the
    # x6/x8 decision could be skipped (disabled in the original):
    # if data.std().values[0] < data.mean().values[0] * 0.2:
    #     return 0
    # What distinguishes the two cases.
    D_value = []
    for item_x6 in range(len(data_list) - 1):
        if (data_list[item_x6] is None) or (data_list[item_x6 + 1] is None):
            continue
        D_value.append(
            abs(float(data_list[item_x6]) - float(data_list[item_x6 + 1])))
    if len(D_value) == 0:
        return 0
    data_x6 = set(grubbs.test(D_value, alpha=0.01))
    data_x6_remove = list(set(D_value).difference(data_x6))
    if len(data_x6_remove) > 0:
        return 8
    # Implicitly returns None when Grubbs' test removes nothing.
Code example #9
File: readfile.py  Project: cbib/SuperClass
def extract_wave_tracer_features(df):

    data_df = pd.DataFrame()
    # The original bound samples, vec and dfRes to one shared list; they are
    # kept separate here and the unused names are dropped.
    samples = []
    total_length = 0

    if BINNING_TYPE == "freedman_std":
        for ROI, data in df.groupby('ImageNumber'):
            dataTab = [len(data2['WaveTracerID']) for ROI2, data2 in data.groupby('WaveTracerID')]
            print(ROI)
            data4grubbs = pd.Series(dataTab)
            total_length += len(data4grubbs.index)
            grubbsResult = grubbs.test(data4grubbs, alpha=0.05)
            vec = len(grubbsResult) * [ROI]
            dataTab_noOutliers = pd.DataFrame({'ImageNumber': vec, 'total_movement': grubbsResult.tolist()})
            data_df = pd.concat([data_df, dataTab_noOutliers], ignore_index=True)  # DataFrame.append was removed in pandas 2.0

        print("SIZE BEFORE GRUBBS : " + str(total_length))
        print("SIZE AFTER GRUBBS : " + str(len(data_df.index)))
        print("TRAJECTORIES REMOVED : " + str(total_length - len(data_df.index)))
        print("TRAJECTORIES KEPT : " + str((len(data_df.index) * 100) / total_length) + "%")
        data_df['total_movement'].plot(kind='line')

        da_global, bins_global = freedman_bin_width(data_df['total_movement'], True)
        WTRACER_MIN = bins_global[0]
        WTRACER_MAX = bins_global[-1]
        WTRACER_HISTOGRAM_BINS = bins_global
        WTRACER_HISTOGRAM_LABELS = ["HIST_WTRACER_%f" % _ for _ in WTRACER_HISTOGRAM_BINS[:-1]]
        print(WTRACER_HISTOGRAM_LABELS)

        for ROI, data in data_df.groupby('ImageNumber'):
            wtracer_features = generate_feature_vector(data['total_movement'], WTRACER_MIN, WTRACER_MAX, WTRACER_HISTOGRAM_BINS, WTRACER_HISTOGRAM_LABELS)
            wtracer_features['index'] = ROI
            wtracer_features = wtracer_features.set_index('index')
            samples.append(wtracer_features)
        pd.concat(samples).to_csv(os.path.join(OUTPUT_DIR, "samplesWaveTracer.csv"), sep=",")

    return (pd.concat(samples), WTRACER_HISTOGRAM_LABELS)
Code example #10
import numpy as np
import pandas as pd
from outliers import smirnov_grubbs as grubbs


def grubbsfunc(ratiodff, columns):
    """Take a dataframe of RLU values for replicates, flag any outliers with
    Grubbs' test and return the modified dataframe."""
    ratiodict = {}
    for i in range(len(ratiodff)):  # xrange is Python 2 only
        dfempty = pd.DataFrame.transpose(
            pd.DataFrame(np.nan, index=[i + 1], columns=columns))
        data = ratiodff.loc[int(i + 1)]

        if len(list(data)) > 2:  # outliers cannot be detected in 2 or fewer values
            # Grubbs' inliers overwrite the NaN placeholders; removed
            # outliers stay NaN.
            dfempty.update(grubbs.test(data, alpha=0.05))
            z = dfempty.to_dict()
            ratiodict.update(z)
        else:
            dfempty.update(data)
            z = dfempty.to_dict()
            ratiodict.update(z)
    ratiodf = pd.DataFrame.from_dict(ratiodict).transpose()

    return ratiodf
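A usage sketch with invented replicate data; note that grubbsfunc expects a 1-based index:

import pandas as pd

cols = ['rep1', 'rep2', 'rep3', 'rep4']
ratiodff = pd.DataFrame(
    [[0.98, 1.02, 1.00, 5.00],   # 5.00 should be flagged and become NaN
     [0.50, 0.52, 0.49, 0.51]],
    index=[1, 2], columns=cols)  # 1-based index, as the .loc[i + 1] lookups assume
print(grubbsfunc(ratiodff, cols))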
Code example #11
def X8_X9(data):
    # data is two-dimensional; column 1 holds the measurements.
    # grubbs.test returns the inliers.
    temp = set(grubbs.test(data.values[:, 1], alpha=0.01))
    count = 0
    flag8, flag9 = 0, 0
    all_right_data = list(temp)
    for item_right in all_right_data:
        if item_right not in data.values[:, 1]:
            count += 1
    # dict_rate is assumed to be a module-level dict defined elsewhere.
    if count < 10:
        flag9 = 1
        dict_rate['X9'] = count / len(data)
    if count >= 10:
        flag8 = 1
        dict_rate['X8'] = count / len(data)
    if flag9 == 1:
        dict_rate['X8'] = 0
    if flag8 == 1:
        dict_rate['X9'] = 0
    return dict_rate
Code example #12
def reject_outliers(self, val_array):
    """Run Grubbs' test (alpha=0.05) and return the data with outliers removed."""
    results = grubbs.test(val_array, alpha=0.05)
    return results
Code example #13
import json
import plotly.graph_objects as go
from outliers import smirnov_grubbs as grubbs

with open('./boxplotNumberComments.json') as f:
    data = json.load(f)
fig = go.Figure()
fig.add_trace(go.Box(y=grubbs.test(data, alpha=0.20), name='comments'))
fig.show()
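The examples above all use the two-sided grubbs.test, which returns the data with outliers removed; the same module also provides one-sided variants and index/outlier accessors. A short sketch with invented values:

from outliers import smirnov_grubbs as grubbs

data = [3.1, 3.0, 2.9, 3.2, 3.0, 8.5]
print(grubbs.test(data, alpha=0.05))               # two-sided: data with outliers removed
print(grubbs.max_test_outliers(data, alpha=0.05))  # values flagged as high-side outliers
print(grubbs.min_test_indices(data, alpha=0.05))   # indices of low-side outliers (none here)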