Example #1
import os

import numpy as np
from sklearn.cluster import KMeans

# `read_sachs_all` and the `utils` helpers are project-local and assumed
# to be in scope.


def preprocess_sachs(folder_path,
                     save_path,
                     num_envs,
                     normalization='standard'):
  """Preprocesses Sachs.

  Args:
    folder_path: Read sachs from the path
    save_path: Save data in this path
    num_envs: The number of environments to cluster the data into
    normalization: Normalization option

  Returns:
    The sachs dataset with different envs [num_envs, number of sample in
    each envs, num of features]

  """

  np.set_printoptions(precision=3)
  X = read_sachs_all(folder_path)
  kmeans = KMeans(n_clusters=num_envs, max_iter=1000).fit(X)
  labeled_X = kmeans.labels_  # cluster assignment for each sample
  X, Y = utils.preprocess_labeled_data(X, labeled_X, normalization)
  X_res, Y_res = utils.over_sampling(X, Y)
  X_envs = utils.classify_x(X_res, Y_res)  # dict mapping env label -> samples

  os.makedirs(save_path, exist_ok=True)
  for i, x_env in X_envs.items():
    exp = save_path + f'sachs_env_{i+1}_{num_envs}.csv'
    if x_env.shape[0] > 1:
      np.savetxt(exp, x_env, delimiter=',')

  return utils.distribute_to_envs(X_envs)
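
A minimal usage sketch (the paths below are hypothetical; `read_sachs_all` and the `utils` helpers must come from the surrounding project):

X_envs = preprocess_sachs('./data/sachs_data/', './data/cluster/',
                          num_envs=3, normalization='standard')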
Example #2
import os

import numpy as np
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split

# `utils` is a project-local helper module assumed to be in scope.


def preprocess_BH(save_path, cluster):
  """Clusters the BH data and preprocesses it.

  Args:
    save_path: the path to save the preprocessed data
    cluster: the number of clusters to split the dataset into

  Returns:
    A tensor of shape [num_envs, number of samples in each env, num of
    features] holding the training environments, plus the path of the saved
    test-set CSV.
  """
  np.set_printoptions(precision=3)

  x_raw = utils.load_BH()
  label = np.ones(len(x_raw))

  # only train and test
  x, x_test, _, _ = train_test_split(
      x_raw, label, test_size=0.1, random_state=42)

  # train val and test
  # X_train_val, X_test, _, _ = train_test_split(
  #  X_raw, label, test_size=0.1, random_state=42)
  # X, X_val, _, _ = train_test_split(
  # X_train_val, np.ones(len(X_train_val)), test_size=0.1, random_state=1)

  kmeans = KMeans(n_clusters=cluster, max_iter=1000).fit(x)
  labeled_x = kmeans.labels_  # cluster assignment for each sample

  os.makedirs(save_path, exist_ok=True)

  x_upsampled, y_upsampled = utils.over_sampling(x, labeled_x)
  x_envs = utils.classify_x(x_upsampled, y_upsampled)
  standard_scaler = preprocessing.StandardScaler()

  x_train = list()

  # Save the data for the different envs
  for i, (_, x_env) in enumerate(x_envs.items(), start=1):
    standardx = standard_scaler.fit_transform(x_env)
    exp = save_path + f'standard_BH_env_{i}_{cluster}.csv'
    np.savetxt(exp, standardx, fmt='%.3f', delimiter=',')
    x_train.append(standardx)

  # Standardize the train/test data using the mean and std of the training set
  standard_scaler.fit(x)
  standardxtrain = standard_scaler.transform(x)
  x_trainexp = save_path + 'standard_BH_train.csv'
  np.savetxt(x_trainexp, standardxtrain, fmt='%.3f', delimiter=',')

  standardxtest = standard_scaler.transform(x_test)
  x_testexp = save_path + 'standard_BH_test.csv'
  np.savetxt(x_testexp, standardxtest, fmt='%.3f', delimiter=',')

  return np.stack(x_train), x_testexp
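
A minimal call sketch (hypothetical path; the function writes the per-environment CSVs as a side effect and returns the stacked training environments together with the test-CSV path):

x_train_envs, test_csv = preprocess_BH('./data/BH/', cluster=4)
print(x_train_envs.shape)  # (num_envs, samples per env, num features)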
Example #3
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold

# `DataLoader`, `MedlatModel`, `config` and `over_sampling` are
# project-local helpers assumed to be in scope.


def main():
    loader = DataLoader(config.pos_path, config.nag_path)  # positive/negative sample generator
    train_data, labels = loader.load_data()
    skf = StratifiedKFold(n_splits=config.kfold,
                          shuffle=True,
                          random_state=2017)

    for i, (train_idx, vali_idx) in enumerate(
            skf.split(train_data, np.zeros((len(labels), ))), 1):

        train_x, train_y = train_data[train_idx], labels[
            train_idx]  # training split for this CV fold
        train_x, train_y = over_sampling(
            train_x, train_y, config.pos_nag_rato)  # oversample to the target pos/neg ratio

        vali_x, vali_y = train_data[vali_idx], labels[vali_idx]  # validation split for this CV fold
        # vali_x, vali_y = over_sampling(vali_x, vali_y, config.pos_nag_rato)  # comment this line out if the validation set should not be oversampled

        model = MedlatModel("./cache/models/cnn_model_%d.h5" % (i))

        train_x = loader.get_inputs_set(train_x)
        vali_x = loader.get_inputs_set(vali_x)
        train_y = np.array([[0., 1.] if train_y[x] == 1.0 else [1., 0.]
                            for x in range(len(train_y))])
        vali_y = np.array([[0., 1.] if vali_y[x] == 1.0 else [1., 0.]
                           for x in range(len(vali_y))])

        model.train_model(train_x, train_y, vali_x, vali_y)
        data_y, pred = model.evaluate(vali_x, vali_y)
        if i == 1:
            all_pred_y = pred
            all_real_y = data_y
        else:
            all_pred_y = np.concatenate((all_pred_y, pred), axis=0)
            all_real_y = np.concatenate((all_real_y, data_y), axis=0)
        print "*" * 20 + "  %d fold end  " % (i) + "*" * 20
        # if i >= 1: break #交叉验证时请注释此行

    ######################### Overall evaluation #########################
    roc_df = pd.DataFrame(all_pred_y, columns=["prob_0", "prob_1"])
    roc_df["real"] = all_real_y
    roc_df.to_csv(config.evaluate_file, index=False, index_label=False)
    print("%s had saved. Please download it." % (config.evaluate_file))
    all_pred_y = np.argmax(all_pred_y, axis=1)
    confuse_matrix = metrics.confusion_matrix(all_real_y, all_pred_y)
    TN, FP, FN, TP = confuse_matrix.ravel()
    print "confusion matrix: \n", confuse_matrix
    print "accuracy score: ", metrics.accuracy_score(all_real_y, all_pred_y)
    print "classification report:\n", metrics.classification_report(
        all_real_y, all_pred_y)
    print "F1 score: ", metrics.f1_score(all_real_y, all_pred_y)
    print "recall: ", 1. * TP / (TP + FN)
    print "False positive rate: ", 1. * FP / (FP + TN)
Example #4
from sklearn.cluster import KMeans

# `utils` is a project-local helper module assumed to be in scope.


def cluster_Insurance(x, num_cluster):
  """Clusters and returns the data split into different environments.

  Args:
    x: the data to be clustered
    num_cluster: the number of clusters

  Returns:
    A dict mapping each environment label to its (oversampled) samples.
  """
  kmeans = KMeans(n_clusters=num_cluster, max_iter=1000).fit(x)
  labeled_x = kmeans.labels_  # cluster assignment for each sample
  x_upsampled, y_upsampled = utils.over_sampling(x, labeled_x)
  x_envs = utils.classify_x(x_upsampled, y_upsampled)  # dict: env label -> samples

  return x_envs
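
A usage sketch with synthetic data (shapes chosen arbitrarily; assumes `utils.classify_x` returns a dict keyed by cluster label, as the comments in the other examples suggest):

import numpy as np

x = np.random.randn(500, 27)  # synthetic stand-in for the Insurance data
x_envs = cluster_Insurance(x, num_cluster=5)
for label, samples in x_envs.items():
  print(label, samples.shape)  # oversampled samples per environment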
Example #5
import numpy as np

# `phi_star_gen`, `over_sampling`, `euclideanDistanceTransform`,
# `optimTrajectory` and `bsplineUpsample` are project-local helpers
# assumed to be in scope.


def main():
    path, grid_obs, start, goal = phi_star_gen()

    # Densify the path so that no segment between waypoints exceeds max_length.
    path = np.array(over_sampling([p.pos for p in path], max_length=1))
    # path[5, :] = np.array([2, 5])
    # path[13, :] = np.array([5, 17])
    # path[20, :] = np.array([15, 2])

    # path = np.array([[start[0], start[1]], [goal[0], goal[1]]])
    # path = over_sampling(path, max_length=1)

    distObs = euclideanDistanceTransform(grid_obs)

    pathOptimized = optimTrajectory(path, distObs, grid_obs, trajDuration=10)

    print(path, pathOptimized)

    smoothed = bsplineUpsample(pathOptimized)
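
Note that `over_sampling` here densifies a geometric path (splitting any segment longer than `max_length`) rather than rebalancing class labels as in the other examples. A purely illustrative sketch of such a helper, assuming the points are array-like coordinates:

import numpy as np

def over_sampling(points, max_length=1):
    # Illustrative only: insert evenly spaced intermediate points until no
    # segment between consecutive points is longer than max_length.
    points = [np.asarray(p, dtype=float) for p in points]
    out = [points[0]]
    for p in points[1:]:
        prev = out[-1]
        n = max(int(np.ceil(np.linalg.norm(p - prev) / max_length)), 1)
        out.extend(prev + (p - prev) * k / n for k in range(1, n + 1))
    return out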
Example #6
import glob
import os

import numpy as np
import pandas as pd

# `utils` and `preprocess_sachs` (Example #1) are project-local helpers
# assumed to be in scope.


def read_sachs_to_envs(folder_path, num_envs, normalization):
  """Loads the Sachs data and returns it divided into different environments.

  Args:
    folder_path: path to the folder containing the Sachs .xls files
    num_envs: the number of environments to divide the data into
    normalization: normalization type

  Returns:
    A tensor with shape [num_envs, number of samples in each env, num of
    features]
  """
  sachs_data = list()

  if num_envs == 14:

    y_label = []
    for i, file in enumerate(glob.glob(f'{folder_path}*.xls')):
      sachs_df = pd.read_excel(file)
      sachs_array = sachs_df.to_numpy()
      sachs_array = utils.preprocess(sachs_array, normalization)
      sachs_data.append(sachs_array)
      y_label.append(np.ones(sachs_array.shape[0]) * i)

    sachs_data_envs = np.vstack(sachs_data)
    sachs_data_labels = np.hstack(y_label)
    X_res, Y_res = utils.over_sampling(sachs_data_envs, sachs_data_labels)
    X_cluster = utils.classify_x(X_res, Y_res)
    X_envs = utils.distribute_to_envs(X_cluster)

  elif num_envs == 2:

    X_envs = [None] * num_envs
    y_label = [None] * num_envs
    for i, file in enumerate(glob.glob(f'{folder_path}*.xls')):
      start_index = file.index('sachs_data/') + 11
      end_index = file.index(' ') - 1
      file_index = int(file[start_index:end_index])
      label = 0 if file_index <= 9 else 1

      sachs_df = pd.read_excel(file)
      sachs_array = sachs_df.to_numpy()
      if X_envs[label] is None:
        X_envs[label] = sachs_array
        y_label[label] = np.ones(sachs_array.shape[0]) * label
      else:
        X_envs[label] = np.concatenate((X_envs[label], sachs_array), axis=0)
        y_label[label] = np.concatenate(
            (y_label[label], (np.ones(sachs_array.shape[0]) * label)), axis=0)

    for i in range(num_envs):
      X_envs[i], y_label[i] = utils.preprocess_labeled_data(
          X_envs[i], y_label[i], normalization)

    X = np.vstack(X_envs)
    Y = np.hstack(y_label)
    X_res, Y_res = utils.over_sampling(X, Y)
    X_cluster = utils.classify_x(X_res, Y_res)
    X_envs = utils.distribute_to_envs(X_cluster)

  elif num_envs == 3:

    # os.path.exists does not expand glob patterns, so test with glob.glob.
    if not glob.glob('./data/cluster/*3.csv'):
      X_envs = preprocess_sachs(folder_path, './data/cluster/', 3,
                                normalization)
    else:
      for file in glob.glob('./data/cluster/*3.csv'):
        sachs_array = np.loadtxt(file, delimiter=',')
        # sachs_array = preprocess(sachs_array, args=args)
        sachs_data.append(sachs_array)
      X_envs = np.stack(sachs_data)

  elif num_envs == 6:

    if not glob.glob('./data/cluster/*6.csv'):
      X_envs = preprocess_sachs(folder_path, './data/cluster/', 6,
                                normalization)
    else:
      for file in glob.glob('./data/cluster/*6.csv'):
        sachs_array = np.loadtxt(file, delimiter=',')
        sachs_array = utils.preprocess(sachs_array, normalization)
        sachs_data.append(sachs_array)
      X_envs = np.stack(sachs_data)

  elif num_envs == 7:

    X_envs = [None] * num_envs
    y_label = [None] * num_envs
    for i, file in enumerate(glob.glob(f'{folder_path}*.xls')):
      start_index = file.index('sachs_data/') + 11
      end_index = file.index(' ') - 1
      file_index = int(file[start_index:end_index])
      label = file_index % num_envs

      sachs_df = pd.read_excel(file)
      sachs_array = sachs_df.to_numpy()

      if X_envs[label] is None:
        X_envs[label] = sachs_array
        y_label[label] = np.ones(sachs_array.shape[0]) * label
      else:
        X_envs[label] = np.concatenate((X_envs[label], sachs_array), axis=0)
        y_label[label] = np.concatenate(
            (y_label[label], (np.ones(sachs_array.shape[0]) * label)), axis=0)

    for i in range(num_envs):
      X_envs[i], y_label[i] = utils.preprocess_labeled_data(
          X_envs[i], y_label[i], normalization)

    X = np.vstack(X_envs)
    Y = np.hstack(y_label)
    X_res, Y_res = utils.over_sampling(X, Y)
    X_cluster = utils.classify_x(X_res, Y_res)
    X_envs = utils.distribute_to_envs(X_cluster)

  else:
    raise ValueError(f'Unsupported num_envs: {num_envs}')

  return X_envs
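
A usage sketch (hypothetical folder; only the `num_envs` values handled above, i.e. 2, 3, 6, 7 and 14, are supported):

X_envs = read_sachs_to_envs('./data/sachs_data/', num_envs=2,
                            normalization='standard')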