def summarize_res(sname, datasize):
    print(sname)
    res = []
    times = []
    for i in range(100):
        PATH = ROOT_PATH + "/MMR_IVs/results/zoo/" + sname + "/"
        filename = os.path.join(PATH, str(date.today()),
                                'LMO_errs_{}_nystr_prodkern_{}.npy'.format(i, datasize))
        if os.path.exists(filename):
            tmp_res = np.load(filename, allow_pickle=True)
            if tmp_res[-1] is not None:
                res += [tmp_res[-1]]
        # no leading '/' on the last component: os.path.join would otherwise
        # treat it as an absolute path and discard PATH and the date directory
        time_path = os.path.join(PATH, str(date.today()),
                                 'LMO_errs_{}_nystr_prodkern_{}_time.npy'.format(i, datasize))
        if os.path.exists(time_path):
            t = np.load(time_path)
            times += [t]
    res = np.array(res)
    times = np.array(times)
    res = remove_outliers(res)
    times = np.sort(times)[:80]  # keep the 80 fastest runs
    print(times)
    print('mean, std: ', np.mean(res), np.std(res))
    print('time: ', np.mean(times), np.std(times))
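remove_outliers is not defined in this snippet; a minimal sketch, assuming the common convention of dropping values outside 1.5x the interquartile range:

import numpy as np

# Hypothetical stand-in for the undefined remove_outliers helper:
# keep only values within [Q1 - k*IQR, Q3 + k*IQR].
def remove_outliers(arr, k=1.5):
    arr = np.asarray(arr, dtype=float)
    q1, q3 = np.percentile(arr, [25, 75])
    iqr = q3 - q1
    return arr[(arr >= q1 - k * iqr) & (arr <= q3 + k * iqr)]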
def compute_features_from_endpoint(model, class_property_uris, min_num_of_objects, update_func):
    """
    :param model:
    :param class_property_uris:
    :param min_num_of_objects:
    :param update_func:
    :return: list of (cluster, feature) pairs
    """
    update_model_state(model=model, new_progress=0,
                       new_notes="extracting values from gathered class/property")
    percentage_of_numerical = 0.5
    idx = 0
    total_num_of_queries = 0
    # calculate the required num of queries
    for class_uri in class_property_uris.keys():
        total_num_of_queries += len(class_property_uris[class_uri])
    for class_uri in class_property_uris.keys():
        logger.info("\nproperties for %s are %d" % (class_uri, len(class_property_uris[class_uri])))
        for property_uri in class_property_uris[class_uri]:
            logger.debug("getting objects for: %s" % property_uri)
            raw_col = easysparql.get_objects(endpoint=model.knowledge_graph,
                                             class_uri=class_uri,
                                             property_uri=property_uri)
            if len(raw_col) > min_num_of_objects:
                col = get_numericals(column=raw_col)
                if len(col) > percentage_of_numerical * len(raw_col):
                    col = remove_outliers(col)
                    if len(col) > min_num_of_objects:
                        logger.debug("success: %s" % property_uri)
                        # features_vector = features.compute_features(col, [features.mean, features.std,
                        #                                                    features.q1, features.q3])
                        features_vector = features.compute_curr_features(col)
                        cluster_name = "%s %s" % (class_uri, property_uri)
                        logger.debug("cluster: %s" % cluster_name)
                        modeling.add_cluster_to_model(model=model, name=cluster_name,
                                                      features=features_vector)
                    else:
                        logger.debug("\n***%s only has %d values" % (property_uri, len(col)))
                else:
                    # report counts as numerical/total, matching the message
                    logger.debug("%s only has %d/%d numerical values" % (property_uri, len(col), len(raw_col)))
            update_func(int(idx * 1.0 / total_num_of_queries * 100))
            idx += 1
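get_numericals is used but not shown; a plausible sketch, assuming it simply keeps the entries of the raw column that parse as floats:

# Hypothetical sketch of get_numericals: filter a raw SPARQL column
# down to the values that can be interpreted as numbers.
def get_numericals(column):
    numericals = []
    for v in column:
        try:
            numericals.append(float(v))
        except (TypeError, ValueError):
            continue
    return numericals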
def summarize_res(sname, datasize):
    print(sname)
    res = []
    times = []
    for i in range(100):
        PATH = ROOT_PATH + "/our_methods/results/mendelian/" + sname + "/"
        filename = PATH + 'LMO_errs_{}_nystr_{}.npy'.format(i, datasize)
        if os.path.exists(filename):
            tmp_res = np.load(filename, allow_pickle=True)
            if tmp_res[-1] is not None:
                res += [tmp_res[-1]]
        time_path = PATH + 'LMO_errs_{}_nystr_{}_time.npy'.format(i, datasize)
        if os.path.exists(time_path):
            t = np.load(time_path)
            times += [t]
    res = np.array(res)
    res = remove_outliers(res)
    print('mean, std: ', np.mean(res), np.std(res))
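A possible driver for this summary; the scenario name and data sizes below are placeholders, not values taken from the source:

if __name__ == '__main__':
    for datasize in [200, 2000]:              # hypothetical data sizes
        summarize_res('mendelian', datasize)  # hypothetical scenario name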
gamma = 0.95
batch_size = 20
eval_batch_size = 100
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device", device)

path = sys.argv[1]
print("dataset", path)
train_data = np.load(path)[:, :NUM_FEAT].astype(np.float32)

# normalize
train_data -= train_data.mean(axis=0, keepdims=True)
train_data /= train_data.std(axis=0, keepdims=True)
train_data = remove_outliers(train_data)

trivial_loss = np.mean((train_data[1:] - train_data[:-1])**2)
print(f"Trivial loss (predicting no changes): {trivial_loss}")

all_data = torch.tensor(train_data).float().to(device)
n_split = len(all_data) // 2
train_data = all_data[:n_split]
val_data = all_data[-n_split:]
test_data = train_data


def batchify(data, bsz):
    # Divide the dataset into bsz parts.
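The definition of batchify is cut off above. A sketch of how it typically continues, following the PyTorch language-model tutorial pattern (an assumption, not the script's actual body):

def batchify(data, bsz):
    # Divide the dataset into bsz parts (sketch of the truncated body).
    nbatch = data.size(0) // bsz
    data = data[:nbatch * bsz]  # trim off any remainder
    # reshape to (seq_len, bsz, features) so each batch column is one
    # contiguous part of the original sequence
    return data.view(bsz, nbatch, -1).transpose(0, 1).contiguous()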
import numpy as np
from datetime import datetime  # needed for datetime.today() below
# util, dbdao and modelutil are project-local modules (imports not shown here)

today = datetime.today()
print("Today", today)
dates_list = util.create_date_list(today)
strql = util.convert_date_list_to_string(dates_list)
interval = 2
timelist = util.create_timelist(interval, today)
df = dbdao.retrieve_database(strql)
dataarray = util.create_dataarray_from_dataframe(df, timelist, today)
mydf = util.remove_outliers(dataarray)
training_set = mydf.values
training_set_scaled = modelutil.scale_training_set(training_set)
X_train, y_train = modelutil.create_timesteps(training_set_scaled)
X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))
regressor = modelutil.buid_model(X_train)
regressor.fit(X_train, y_train, epochs=100, batch_size=32)
dbdao.save_model_in_database(regressor)
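modelutil.create_timesteps is not shown; a plausible sketch, assuming the standard sliding-window preparation for an LSTM, with a hypothetical window of 60 steps:

import numpy as np

# Hypothetical sketch of modelutil.create_timesteps: each sample is the
# previous `lookback` scaled values and the label is the value that follows.
def create_timesteps(training_set_scaled, lookback=60):
    X_train, y_train = [], []
    for i in range(lookback, len(training_set_scaled)):
        X_train.append(training_set_scaled[i - lookback:i, 0])
        y_train.append(training_set_scaled[i, 0])
    return np.array(X_train), np.array(y_train)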
                   jac=True,
                   options={'maxiter': 5000, 'disp': True, 'ftol': 0},
                   callback=callback0)
    PATH = ROOT_PATH + "/MMR_IVs/results/" + sname + "/"
    os.makedirs(PATH, exist_ok=True)
    np.save(PATH + 'LMO_errs_{}_nystr.npy'.format(seed),
            [opt_params, prev_norm, opt_test_err])


if __name__ == '__main__':
    snames = ['mnist_z', 'mnist_x', 'mnist_xz']
    for sname in snames:
        for seed in range(100):
            experiment(sname, seed)

        PATH = ROOT_PATH + "/MMR_IVs/results/" + sname + "/"
        ress = []
        for seed in range(100):
            filename = PATH + 'LMO_errs_{}_nystr.npy'.format(seed)
            if os.path.exists(filename):
                res = np.load(filename, allow_pickle=True)
                if res[-1] is not None:
                    ress += [res[-1]]
        ress = np.array(ress)
        ress = remove_outliers(ress)
        print(np.nanmean(ress), np.nanstd(ress))
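The np.nanmean/np.nanstd calls suggest that remove_outliers here may mask outliers as NaN rather than dropping them; a sketch under that assumption:

import numpy as np

# Hypothetical NaN-masking variant of remove_outliers, consistent with the
# nan-aware aggregation above: outliers are replaced by NaN, not removed.
def remove_outliers(arr, k=1.5):
    arr = np.asarray(arr, dtype=float)
    q1, q3 = np.nanpercentile(arr, [25, 75])
    iqr = q3 - q1
    out = arr.copy()
    out[(out < q1 - k * iqr) | (out > q3 + k * iqr)] = np.nan
    return out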
g = 1


# Get the running times t_i for m_i^x for the messages m_i in M
def measure_times(M, x, n, number_of_runs_per_message=1):
    return np.array([rt3x_average(powmod, m, x, n, number_of_runs_per_message)
                     for m in M])


TT = measure_times(M, d, n, averages)   # These are the T_i's
tt0 = measure_times(M, g, n, averages)  # These are the t_i's when g = 0b1
g = set_bit_n(g, 1)                     # set g to 0b11
tt1 = measure_times(M, g, n, averages)  # these are the t_i's when g is 0b11

D0 = remove_outliers(TT - tt0)
D1 = remove_outliers(TT - tt1)

# Compute the standard deviations for the time differences
(sd0, sd1) = map(std, (D0, D1))
print("\nStandard deviation of time differences: ", sd0, sd1)
print("\nIn binary, d = ", bin(d))

num_bins = 32
nn, bins, patches = plt.hist(D0, num_bins, facecolor='blue', alpha=0.5,
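set_bit_n and powmod are used but not defined here; minimal sketches consistent with their usage above (both are assumptions):

# Hypothetical helpers matching how they are called in the snippet.
def set_bit_n(x, k):
    return x | (1 << k)   # set bit k of x (bit 0 is the least significant)

def powmod(m, x, n):
    return pow(m, x, n)   # modular exponentiation m**x mod n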
                  c=dataset['target'], cmap=plt.cm.Set1, edgecolor='k')
axs[3, 1].scatter(dataset.iloc[:, 3], dataset.iloc[:, 1],
                  c=dataset['target'], cmap=plt.cm.Set1, edgecolor='k')
axs[3, 2].scatter(dataset.iloc[:, 3], dataset.iloc[:, 2],
                  c=dataset['target'], cmap=plt.cm.Set1, edgecolor='k')

# remove outliers
new_dataset = util.remove_outliers(dataset)

# look at skew and mean of new dataset without outliers
f.write("\nAfter removing outliers\n")
util.get_data_stats(new_dataset, f)

plt.figure(2)
scatter = plt.scatter(new_dataset.iloc[:, 0], new_dataset.iloc[:, 1],
                      c=new_dataset['target'], cmap=plt.cm.Set1, edgecolor='k')
plt.legend(*scatter.legend_elements(), loc="upper right", title="Class")
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')
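util.remove_outliers is not shown; a plausible sketch for a labeled DataFrame like this one, keeping rows whose feature z-scores all stay below a hypothetical threshold of 3:

import numpy as np

# Hypothetical sketch of util.remove_outliers for a DataFrame with a
# 'target' column: drop rows where any feature lies more than z_thresh
# standard deviations from its column mean.
def remove_outliers(dataset, z_thresh=3.0):
    feats = dataset.drop(columns=['target'])
    z = (feats - feats.mean()) / feats.std()
    return dataset[(np.abs(z) < z_thresh).all(axis=1)]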