def densratio_gridsearch(self, X_ref, X_test):
    # Search grids for the regularizer and kernel width.
    lambda_range = 10 ** np.linspace(-3, 3, 7)
    sigma_range = 10 ** np.linspace(-3, 3, 25)
    estimator_1 = densratio(X_ref, X_test, self.alpha, sigma_range,
                            lambda_range, self.kernel_num, verbose=False)
    estimator_2 = densratio(X_test, X_ref, self.alpha, sigma_range,
                            lambda_range, self.kernel_num, verbose=False)
    w1_ref = estimator_1.compute_density_ratio(X_ref)
    w2_test = estimator_2.compute_density_ratio(X_test)
    # Symmetric score: each term is 0.5 * E[w] - 0.5, close to 0 when the
    # two samples come from the same distribution.
    score_max = (0.5 * np.mean(w1_ref) - 0.5) + (0.5 * np.mean(w2_test) - 0.5)
    return score_max
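# Hedged usage sketch (not part of the original source). The host class is
# not shown, so `alpha=0.1` and `kernel_num=100` below are assumptions, not
# the original settings; this only illustrates one term of `score_max`.
import numpy as np
from densratio import densratio

rng = np.random.RandomState(0)
X_ref = rng.normal(0.0, 1.0, size=(200, 2))
X_test = rng.normal(0.5, 1.0, size=(200, 2))
est = densratio(X_ref, X_test, alpha=0.1, kernel_num=100, verbose=False)
w_ref = est.compute_density_ratio(X_ref)
print(0.5 * np.mean(w_ref) - 0.5)  # one of the two terms in score_max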
def estimate_hyperparameters(packed_sequence, window_size=50, alpha=0.1,
                             sigma_range=None, lambda_range=None,
                             num_samples=50, num_rank=2):
    sigmas_forward = np.zeros(num_samples)
    sigmas_backward = np.zeros(num_samples)
    lambdas_forward = np.zeros(num_samples)
    lambdas_backward = np.zeros(num_samples)
    print('Sampling for hyperparameter estimation...')
    for iteration in range(num_samples):
        # Pick a random split point and fit ratios in both directions.
        i = np.random.randint(low=window_size,
                              high=packed_sequence.shape[0] - window_size)
        backward_window = packed_sequence[i - window_size:i]
        forward_window = packed_sequence[i:i + window_size]
        ratio_forward_obj = densratio(backward_window, forward_window,
                                      alpha=alpha, sigma_range=sigma_range,
                                      lambda_range=lambda_range, verbose=False)
        ratio_backward_obj = densratio(forward_window, backward_window,
                                       alpha=alpha, sigma_range=sigma_range,
                                       lambda_range=lambda_range, verbose=False)
        sigmas_forward[iteration] = ratio_forward_obj.kernel_info.sigma
        lambdas_forward[iteration] = ratio_forward_obj.lambda_
        sigmas_backward[iteration] = ratio_backward_obj.kernel_info.sigma
        lambdas_backward[iteration] = ratio_backward_obj.lambda_
        print('Iteration', iteration, 'complete.')
    print('Sampling for hyperparameter estimation complete.')
    return (get_top_counts(sigmas_forward, num_rank),
            get_top_counts(sigmas_backward, num_rank),
            get_top_counts(lambdas_forward, num_rank),
            get_top_counts(lambdas_backward, num_rank))
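# `get_top_counts` is used above but not defined in this listing. A minimal
# sketch of a plausible implementation (an assumption, not the original code):
# return the `num_rank` most frequently selected hyperparameter values.
from collections import Counter

def get_top_counts(values, num_rank):
    counts = Counter(np.asarray(values).tolist())
    return np.array([value for value, _ in counts.most_common(num_rank)])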
def rulsif(ts, window_size=50, threshold=.1):
    from densratio import densratio

    def make_window(arr, win, jump):
        return np.array([arr[i:i + win]
                         for i in range(0, len(arr) - win + 1, jump)])

    # Min-max normalize the series to [0, 1].
    ts = (ts - ts.min(axis=0)) / (ts.max(axis=0) - ts.min(axis=0))
    all_ratios = []
    rolling_window = make_window(ts, window_size, window_size)
    for win1, win2 in zip(rolling_window[:-1], rolling_window[1:]):
        # Kernel widths centered on the median distance of the joined windows.
        concat = np.concatenate([win1, win2])
        med = np.nanmedian(concat)
        sigma_list = med * np.array([.6, .8, 1.0, 1.2, 1.4])
        lambda_list = [1e-3, 1e-2, 1e-1, 1, 10]
        ratio = densratio(win1, win2, alpha=0.01,
                          lambda_range=lambda_list, sigma_range=sigma_list,
                          verbose=False)
        all_ratios.append(ratio.alpha_PE)
    preds = _find_bp_rulsif(ts, all_ratios, threshold, window_size)
    return (preds.tolist(), None)
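# `_find_bp_rulsif` is used above but not defined in this listing. A minimal
# sketch under the assumption that it flags a break point at each window
# boundary whose PE-divergence score exceeds `threshold` (hypothetical):
def _find_bp_rulsif(ts, all_ratios, threshold, window_size):
    scores = np.asarray(all_ratios)
    # Score i compares window i against window i + 1, so map it to the
    # boundary index they share in the original series.
    boundaries = np.arange(1, len(scores) + 1) * window_size
    return boundaries[scores > threshold]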
def __init__(self, args):
    super().__init__(1, "MSSynthetic1D")
    np.random.seed(args.seed)
    self.num_sources = args.num_sources
    self.num_data = args.num_data
    source_mus = (np.random.rand(self.num_sources) *
                  (args.source_mu_bound * 2.0)) - args.source_mu_bound
    target_mu = (np.random.rand() * args.target_mu_bound * 2.0) - args.target_mu_bound
    self.mus = np.append(source_mus, target_mu)
    assert len(self.mus) == self.num_sources + 1
    self.sigma = args.sigma
    self.estimator_type = args.estimator_type
    self.source_ind = np.random.randint(0, self.num_sources)  # used by the naive estimator
    self.sources = [
        create_one_task(self.mus[s], args.coef_a, args.coef_b,
                        self.sigma, args.num_data)
        for s in range(self.num_sources)
    ]
    self.target = create_one_task(
        self.mus[self.num_sources], args.coef_a, args.coef_b,
        args.sigma, args.num_data,
    )
    print(f"target_mu:{target_mu}, source_mus:{source_mus}")

    # Density ratios are only needed for the unbiased and vr estimators.
    self.density_ratios = None
    if self.estimator_type in ("unbiased", "vr"):
        hp_search_range = [0.1] if args.debug else [0.001, 0.01, 0.1, 1.0]
        self.density_ratios = [
            densratio(
                self.target["X"],
                self.sources[s]["X"],
                alpha=0.0,
                sigma_range=hp_search_range,
                lambda_range=hp_search_range,
            )
            for s in range(self.num_sources)
        ]

    # Check consistency for mu and lambda (vr estimator only).
    if self.estimator_type == "vr":
        self.log = dict()
        self.log["target_mu"] = target_mu
        self.log["source1_mu"] = self.mus[0]
        self.log["source2_mu"] = self.mus[1]
        self.log["abs_source1_mu"] = abs(target_mu - self.mus[0])
        self.log["abs_source2_mu"] = abs(target_mu - self.mus[1])
        self.log["lambda1"] = []
        self.log["lambda2"] = []
    else:
        self.log = None
def main(filename, dataset_filename, alpha, seed):
    (X_train, y_train), (X_test, y_test) = load_hdf5(dataset_filename)
    # Estimate p_test / p_train and evaluate it at the training points.
    densratio_obj = densratio(X_test, X_train, alpha=alpha)
    importance_weights = densratio_obj.compute_density_ratio(X_train)
    with h5py.File(filename, 'w') as f:
        f.create_dataset("importance_weights", data=importance_weights)
    return 0
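# Hedged companion sketch (not part of the original source): read the weights
# back from the HDF5 file written by `main` above.
import h5py

def load_importance_weights(filename):
    with h5py.File(filename, 'r') as f:
        return f["importance_weights"][:]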
def density_ratio_estimation(train_data, test_data):
    # densratio(x, y) estimates p_x / p_y, so this evaluates
    # p_train / p_test at the training points.
    result = densratio(train_data, test_data)
    sample_weight = result.compute_density_ratio(train_data)
    return sample_weight
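# Hedged usage sketch (not part of the original source): pass the estimated
# weights to any scikit-learn estimator that accepts `sample_weight`. The
# arrays `train_data`, `test_data`, and `y_train` are assumed to exist.
from sklearn.linear_model import LogisticRegression

weights = density_ratio_estimation(train_data, test_data)
clf = LogisticRegression()
clf.fit(train_data, y_train, sample_weight=weights)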
def experiment():
    percent_p = 3
    path = r'C:\Users\yyveggie\Desktop\UCI\Conversion\mushroom.csv'
    seed = 2019
    est_error_upu = []
    est_binary_upu = []
    est_binary_pusb = []
    est_error_pusb = []
    est_error_drsb = []
    est_binary_drsb = []
    for k in range(10):
        np.random.seed(seed)
        # Class prior: the fraction of positives in U. An open question noted
        # in the original comment: should this be computed from the split
        # below instead of being fixed in advance?
        pi = 0.6
        classifier = LogisticRegression(C=0.01, penalty='l2', solver='liblinear')
        texts_1, texts_0 = CSV(path)
        texts_1 = np.array_split(texts_1, 10)  # split the positive class into ten folds
        texts_0 = np.array_split(texts_0, 10)  # split the negative class into ten folds
        # Test set: fold k, one fold per round; positives labelled 1, negatives 0.
        x_test = np.array(list(texts_1[k]) + list(texts_0[k]))
        t_test = np.array(list(len(texts_1[k]) * [1]) + list(len(texts_0[k]) * [0]))
        index_rest = sorted(set(range(10)) - set([k]))  # folds not used for testing
        texts_1 = np.array(texts_1)
        texts_0 = np.array(texts_0)
        texts_1 = np.array([j for i in texts_1[index_rest] for j in i])  # positives
        texts_0 = np.array([j for i in texts_0[index_rest] for j in i])  # negatives
        x = np.vstack((texts_1, texts_0))  # training set: positives then negatives
        one = np.ones((len(x), 1))
        x_pn = np.concatenate([x, one], axis=1)
        t = pd.Series([1] * len(texts_1) + [0] * len(texts_0))
        classifier.fit(x_pn, t)
        x_train = x
        t_train = t
        xp = x_train[t_train == 1]
        one = np.ones((len(xp), 1))
        xp_temp = np.concatenate([xp, one], axis=1)
        xp_prob = classifier.predict_proba(xp_temp)[:, 1]
        # xp_prob /= np.mean(xp_prob)
        xp_prob = xp_prob ** 20
        xp_prob /= np.max(xp_prob)
        rand = np.random.uniform(size=len(xp))
        temp = xp[xp_prob > rand]
        pdata = int(percent_p / 10 * len(x))  # number of P samples: 3/10 of the total
        while len(temp) < pdata:
            rand = np.random.uniform(size=len(xp))
            temp = np.concatenate([temp, xp[xp_prob > rand]], axis=0)
        xp = temp
        perm = np.random.permutation(len(xp))
        xp = xp[perm[:pdata]]
        u = int(6 / 10 * len(x))  # number of U samples: 6/10 of the total
        updata = int(u * pi)      # positives in U = |U| * class prior
        undata = u - updata       # negatives in U = |U| - positives in U
        xp_temp = x_train[t_train == 1]
        xn_temp = x_train[t_train == 0]
        perm = np.random.permutation(len(xp_temp))
        xp_temp = xp_temp[perm[:updata]]
        perm = np.random.permutation(len(xn_temp))
        xn_temp = xn_temp[perm[:undata]]
        xu = np.concatenate([xp_temp, xn_temp], axis=0)
        x = np.concatenate([xp, xu], axis=0)
        tp = np.ones(len(xp))
        tu = np.zeros(len(xu))
        t = np.concatenate([tp, tu], axis=0)
        updata = int(1000 * pi)
        undata = 1000 - updata
        xp_test = x_test[t_test == 1]
        perm = np.random.permutation(len(xp_test))
        xp_test = xp_test[perm[:updata]]
        xn_test = x_test[t_test == 0]
        perm = np.random.permutation(len(xn_test))
        xn_test = xn_test[perm[:undata]]
        x_test = np.concatenate([xp_test, xn_test], axis=0)
        tp = np.ones(len(xp_test))
        tu = np.zeros(len(xn_test))
        t_test = np.concatenate([tp, tu], axis=0)
        pu = PU(pi=pi)
        x_train = x
        res, x_test_kernel = pu.optimize(x, t, x_test)
        acc1, f1_binary1 = pu.test(x_test_kernel, res, t_test, quant=False)
        acc2, f1_binary2 = pu.test(x_test_kernel, res, t_test, quant=True, pi=pi)
        # Density-ratio approach: threshold the ratio at the (1 - pi) quantile.
        result = densratio(x_train[t == 1], x_train[t == 0])
        r = result.compute_density_ratio(x_test)
        temp = np.sort(np.copy(r))
        theta = temp[int(np.floor(len(x_test) * (1 - pi)))]
        pred = np.zeros(len(x_test))
        pred[r > theta] = 1
        acc3 = np.mean(pred == t_test)
        f1_binary3 = f1_score(t_test, pred, average='binary')
        est_error_upu.append(acc1)
        est_binary_upu.append(f1_binary1)
        est_error_pusb.append(acc2)
        est_binary_pusb.append(f1_binary2)
        est_error_drsb.append(acc3)
        est_binary_drsb.append(f1_binary3)
        seed += 1
        print("Iter:", k)
        print("upu_accuracy ", acc1)
        print("upu_f1_binary ", f1_binary1)
        print("pusb_accuracy ", acc2)
        print("pusb_f1_binary ", f1_binary2)
        print("drsb_accuracy ", acc3)
        print("drsb_f1_binary ", f1_binary3)
    print("Accuracy for uPU:", np.mean(est_error_upu))
    print("F1-Score for uPU:", np.mean(est_binary_upu))
    print("Accuracy for PUSB:", np.mean(est_error_pusb))
    print("F1-Score for PUSB:", np.mean(est_binary_pusb))
    print("Accuracy for DRSB:", np.mean(est_error_drsb))
    print("F1-Score for DRSB:", np.mean(est_binary_drsb))
    dim_image = 1
else:
    dataset = np.load('../input_data/cifar_dataset.npz')
    size_image = 32
    dim_image = 3
    # size_image = 28
    # dim_image = 1

Xtr = dataset['Xtr'].astype('float64')
Str = dataset['Str'].ravel()

# Logit-transform pixel intensities, clamping away from 0 and 1 first.
eps = np.finfo(np.float64).eps
Xtr2 = Xtr / 255
Xtr2[Xtr2 == 0] = eps
Xtr2[Xtr2 == 1] = 1 - eps
Xtr = np.log(Xtr2 / (1 - Xtr2))

# Class priors from the observed labels.
PY1 = np.sum(Str) / Str.shape[0]
PY0 = 1 - PY1
XY1 = Xtr[Str == 1, :]
XY0 = Xtr[Str == 0, :]

# Minimum posterior p(y|x) over the sample: density ratio times class prior.
XY1oX = densratio(XY1, Xtr)
XY1oXV = min(XY1oX.compute_density_ratio(Xtr))
Y1X = XY1oXV * PY1
print(Y1X)

XY0oX = densratio(XY0, Xtr)
XY0oXV = min(XY0oX.compute_density_ratio(Xtr))
Y0X = XY0oXV * PY0
print(Y0X)
def __init__(self, args):
    super().__init__(dim=args.dim, name="Parkinson")
    # Check requirements.
    assert args.target_name is not None, "target_name is None."
    if is_our_estimator(args.estimator_type):
        assert args.ratio_dre is not None, "DRE ratio is None."
        assert args.is_separate_source_dens is not None, "is_separate_source_dens is None."
    np.random.seed(args.seed)
    self.space = None
    self.estimator_type = args.estimator_type
    (
        self.sources_density,
        self.sources_train,
        self.sources_val,
        self.target_opt,
        self.target_val,
    ) = load_dataset(
        self.estimator_type,
        args.data_dir,
        args.target_name,
        args.ratio_validation,
        args.ratio_dre,
        args.is_separate_source_dens,
    )
    self.source_num = len(self.sources_train)
    if self.estimator_type == "naive":
        self.source_naive_ind = np.random.randint(0, len(self.sources_train))
        self.source_name_naive = self.sources_train[self.source_naive_ind]["filename"]
        self.is_source_concat_for_naive = args.is_source_concat_for_naive
    else:
        self.source_naive_ind = None
        self.source_name_naive = None
        self.is_source_concat_for_naive = None
    self.density_ratios = None
    if is_our_estimator(args.estimator_type):
        if args.debug:
            # Single-point grids keep debug runs fast.
            self.density_ratios = [
                densratio(
                    self.target_opt["X"],
                    self.sources_density[s]["X"],
                    alpha=args.alpha,
                    sigma_range=[1.0],
                    lambda_range=[0.001],
                )
                for s in range(len(self.sources_density))
            ]
        else:
            hp_search_range = [1e-3, 1e-2, 1e-1, 1e-0]
            self.density_ratios = [
                densratio(
                    self.target_opt["X"],
                    self.sources_density[s]["X"],
                    alpha=args.alpha,
                    sigma_range=hp_search_range,
                    lambda_range=hp_search_range,
                )
                for s in range(len(self.sources_density))
            ]
    self.all_source_train_data = self.concat_all_sources_train()
    self.all_source_val_data = self.concat_all_sources_val()
    self.model = get_model(args.model)
def experiment(datatype, udata):
    priors = [0.2, 0.4, 0.6, 0.8]
    ite = 100
    pdata = 400
    num_basis = 300
    seed = 2018
    est_error_pu = np.zeros((len(udata), len(priors), ite))
    est_error_pubp = np.zeros((len(udata), len(priors), ite))
    est_error_dr = np.zeros((len(udata), len(priors), ite))
    for i in range(len(udata)):
        u = udata[i]
        for j in range(len(priors)):
            pi = priors[j]
            for k in range(ite):
                np.random.seed(seed)
                # PN classification.
                x, t = make_data(datatype=datatype)
                x = x / np.max(x, axis=0)
                one = np.ones((len(x), 1))
                x_pn = np.concatenate([x, one], axis=1)
                classifier = LogisticRegression(C=0.01, penalty='l2')
                classifier.fit(x_pn, t)
                perm = np.random.permutation(len(x))
                x_train = x[perm[:-3000]]
                t_train = t[perm[:-3000]]
                x_test = x[perm[-3000:]]
                t_test = t[perm[-3000:]]
                xp = x_train[t_train == 1]
                one = np.ones((len(xp), 1))
                xp_temp = np.concatenate([xp, one], axis=1)
                xp_prob = classifier.predict_proba(xp_temp)[:, 1]
                # xp_prob /= np.mean(xp_prob)
                xp_prob = xp_prob ** 20
                xp_prob /= np.max(xp_prob)
                rand = np.random.uniform(size=len(xp))
                temp = xp[xp_prob > rand]
                while len(temp) < pdata:
                    rand = np.random.uniform(size=len(xp))
                    temp = np.concatenate([temp, xp[xp_prob > rand]], axis=0)
                xp = temp
                perm = np.random.permutation(len(xp))
                xp = xp[perm[:pdata]]
                updata = int(u * pi)
                undata = u - updata
                xp_temp = x_train[t_train == 1]
                xn_temp = x_train[t_train == 0]
                perm = np.random.permutation(len(xp_temp))
                xp_temp = xp_temp[perm[:updata]]
                perm = np.random.permutation(len(xn_temp))
                xn_temp = xn_temp[perm[:undata]]
                xu = np.concatenate([xp_temp, xn_temp], axis=0)
                x = np.concatenate([xp, xu], axis=0)
                tp = np.ones(len(xp))
                tu = np.zeros(len(xu))
                t = np.concatenate([tp, tu], axis=0)
                updata = int(1000 * pi)
                undata = 1000 - updata
                xp_test = x_test[t_test == 1]
                perm = np.random.permutation(len(xp_test))
                xp_test = xp_test[perm[:updata]]
                xn_test = x_test[t_test == 0]
                perm = np.random.permutation(len(xn_test))
                xn_test = xn_test[perm[:undata]]
                x_test = np.concatenate([xp_test, xn_test], axis=0)
                tp = np.ones(len(xp_test))
                tu = np.zeros(len(xn_test))
                t_test = np.concatenate([tp, tu], axis=0)
                pu = PU(pi=pi)
                x_train = x
                res, x_test_kernel = pu.optimize(x, t, x_test)
                acc1 = pu.test(x_test_kernel, res, t_test, quant=False)
                acc2 = pu.test(x_test_kernel, res, t_test, quant=True, pi=pi)
                # Density-ratio approach: threshold at the (1 - pi) quantile.
                result = densratio(x_train[t == 1], x_train[t == 0])
                r = result.compute_density_ratio(x_test)
                temp = np.sort(np.copy(r))
                theta = temp[int(np.floor(len(x_test) * (1 - pi)))]
                pred = np.zeros(len(x_test))
                pred[r > theta] = 1
                acc3 = np.mean(pred == t_test)
                est_error_pu[i, j, k] = acc1
                est_error_pubp[i, j, k] = acc2
                est_error_dr[i, j, k] = acc3
                seed += 1
                print(acc1)
                print(acc2)
                print(acc3)
    est_error_pu_mean = np.mean(est_error_pu, axis=2)
    est_error_pubp_mean = np.mean(est_error_pubp, axis=2)
    est_error_dr_mean = np.mean(est_error_dr, axis=2)
    est_error_pu_std = np.std(est_error_pu, axis=2)
    est_error_pubp_std = np.std(est_error_pubp, axis=2)
    est_error_dr_std = np.std(est_error_dr, axis=2)
    return (est_error_pu_mean, est_error_pubp_mean, est_error_pu_std,
            est_error_pubp_std, est_error_dr_mean, est_error_dr_std)
def fit(self, X_top, X_bot, *args, **kwargs):
    self.densratio_obj = densratio(X_top, X_bot, alpha=self.alpha)
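# The class this `fit` belongs to is not shown. A plausible companion method
# (an assumption, not the original code) simply evaluates the fitted ratio:
def predict(self, X):
    return self.densratio_obj.compute_density_ratio(X)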
def main():
    ite = 10
    num_train_data = 2000
    num_test_data = 2000
    Net = NN
    model_num = 3
    learning_rate = 1e-4
    epoch = 200
    batchsize = 256
    seed = 2020
    for f_name_idx0 in range(len(file_names)):
        for f_name_idx1 in range(f_name_idx0 + 1, len(file_names)):
            train_loss_normal = np.zeros((ite, model_num))
            test_loss_normal = np.zeros((ite, model_num))
            auc_normal = np.zeros((ite, model_num))
            train_loss_kerulsif = np.zeros((ite, model_num))
            test_loss_kerulsif = np.zeros((ite, model_num))
            auc_kerulsif = np.zeros((ite, model_num))
            train_loss_kerkleip = np.zeros((ite, model_num))
            test_loss_kerkleip = np.zeros((ite, model_num))
            auc_kerkleip = np.zeros((ite, model_num))
            train_loss_pu = np.zeros((ite, model_num))
            test_loss_pu = np.zeros((ite, model_num))
            auc_pu = np.zeros((ite, model_num))
            train_loss_ulsif = np.zeros((ite, model_num))
            test_loss_ulsif = np.zeros((ite, model_num))
            auc_ulsif = np.zeros((ite, model_num))
            train_loss_nnpu = np.zeros((ite, model_num))
            test_loss_nnpu = np.zeros((ite, model_num))
            auc_nnpu = np.zeros((ite, model_num))
            train_loss_nnulsif = np.zeros((ite, model_num))
            test_loss_nnulsif = np.zeros((ite, model_num))
            auc_nnulsif = np.zeros((ite, model_num))
            f_name0 = file_names[f_name_idx0]
            f_name1 = file_names[f_name_idx1]
            for i in range(ite):
                np.random.seed(seed)
                if f_name0 != f_name1:
                    data0 = pd.read_csv('dataset/%s.csv' % f_name0).dropna()
                    data1 = pd.read_csv('dataset/%s.csv' % f_name1).dropna()
                    perm0 = np.random.permutation(len(data0))
                    perm1 = np.random.permutation(len(data1))
                    choice0 = np.zeros(len(data0))
                    choice0[perm0[:num_train_data]] = 1
                    data0['choice'] = choice0
                    choice1 = np.zeros(len(data1))
                    choice1[perm1[:num_test_data]] = 1
                    data1['choice'] = choice1
                    data0 = data0.get(['rating', 'text', 'item', 'choice'])
                    data1 = data1.get(['rating', 'text', 'item', 'choice'])
                    data = pd.concat([data0, data1])
                else:
                    data = pd.read_csv('dataset/%s.csv' % f_name0).dropna()
                    perm = np.random.permutation(len(data))
                    choice = np.zeros(len(data))
                    choice[perm[:num_train_data + num_test_data]] = 1
                    data['choice'] = choice
                print('N: ', len(data))
                text_data = data.text.values
                vectorizer = TfidfVectorizer(max_features=10000, min_df=0.0, max_df=0.8)
                # vectorizer = TfidfVectorizer(min_df=0.0, max_df=0.8)
                text_list_vec = vectorizer.fit_transform(text_data)
                X = text_list_vec[data['choice'].values == 1].toarray()
                print(X.shape)
                pca = PCA(n_components=100)
                pca.fit(X)
                X_pca = pca.transform(X)
                rating0 = data[data['choice'].values == 1].rating.values[:num_train_data]
                rating1 = data[data['choice'].values == 1].rating.values[num_train_data:]
                X0 = X[:num_train_data]
                X1 = X[num_train_data:]
                X_pca0 = X_pca[:num_train_data]
                X_pca1 = X_pca[num_train_data:]
                # Kernel uLSIF via densratio.
                result = densratio(
                    X_pca0, X_pca1,
                    sigma_range=[0.01, 0.05, 0.1, 0.5, 1],
                    lambda_range=[0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1])
                dr0 = result.compute_density_ratio(X_pca0)
                # KLIEP.
                kliep = DensityRatioEstimator()
                kliep.fit(X_pca0, X_pca1)
                # dr1 = np.ones(len(X_pca0))
                dr1 = kliep.predict(X_pca0)
                dim = X0.shape[1]
                device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
                # device = 'cpu'
                # Neural estimators: PU, uLSIF, nnPU, nnuLSIF.
                model = Net(dim).to(device)
                optimizer = optim.Adam(params=model.parameters(),
                                       lr=learning_rate, weight_decay=1e-5)
                model = train(X0, X1, epoch, model, optimizer, device,
                              batchsize=batchsize, method='PU')
                dr2 = test(X0, model, device, batchsize=100, method='PU')
                model = Net(dim).to(device)
                optimizer = optim.Adam(params=model.parameters(),
                                       lr=learning_rate, weight_decay=1e-5)
                model = train(X0, X1, epoch, model, optimizer, device,
                              batchsize=batchsize, method='uLSIF')
                dr3 = test(X0, model, device, batchsize=100, method='uLSIF')
                model = Net(dim).to(device)
                optimizer = optim.Adam(params=model.parameters(),
                                       lr=learning_rate, weight_decay=1e-5)
                model = train(X0, X1, epoch, model, optimizer, device,
                              batchsize=batchsize, method='nnPU')
                dr4 = test(X0, model, device, batchsize=100, method='PU')
                model = Net(dim).to(device)
                optimizer = optim.Adam(params=model.parameters(),
                                       lr=learning_rate, weight_decay=1e-5)
                model = train(X0, X1, epoch, model, optimizer, device,
                              batchsize=batchsize, method='nnuLSIF')
                dr5 = test(X0, model, device, batchsize=100, method='uLSIF')
                # Clip the estimated ratios into [0, 100].
                dr3[dr3 < 0] = 0.
                dr5[dr5 < 0] = 0.
                dr0[~((dr0 > 0) & (dr0 < 100))] = 100
                dr1[~((dr1 > 0) & (dr1 < 100))] = 100
                dr2[~((dr2 > 0) & (dr2 < 100))] = 100
                dr3[~((dr3 > 0) & (dr3 < 100))] = 100
                dr4[~((dr4 > 0) & (dr4 < 100))] = 100
                dr5[~((dr5 > 0) & (dr5 < 100))] = 100
                print(dr3)
                print(dr4)
                print(dr5)
                print('meandr4', np.mean(dr4))
                print('meandr5', np.mean(dr5))
                # Model 0: ridge regression.
                reg = Ridge()
                reg = GridSearchCV(reg, {'alpha': [0.0001, 0.001, 0.01, 0.1, 1]}, cv=5)
                idx_model = 0
                x_train = X_pca0
                x_test = X_pca1
                train_loss, test_loss, auc = calc_result(reg, x_train, rating0,
                                                         x_test, rating1, dr=None)
                train_loss_normal[i, idx_model] = train_loss
                test_loss_normal[i, idx_model] = test_loss
                auc_normal[i, idx_model] = auc
                train_loss, test_loss, auc = calc_result(reg, x_train, rating0,
                                                         x_test, rating1, dr=dr0)
                train_loss_kerulsif[i, idx_model] = train_loss
                test_loss_kerulsif[i, idx_model] = test_loss
                auc_kerulsif[i, idx_model] = auc
                train_loss, test_loss, auc = calc_result(reg, x_train, rating0,
                                                         x_test, rating1, dr=dr1)
                train_loss_kerkleip[i, idx_model] = train_loss
                test_loss_kerkleip[i, idx_model] = test_loss
                auc_kerkleip[i, idx_model] = auc
                train_loss, test_loss, auc = calc_result(reg, x_train, rating0,
                                                         x_test, rating1, dr=dr2)
                train_loss_pu[i, idx_model] = train_loss
                test_loss_pu[i, idx_model] = test_loss
                auc_pu[i, idx_model] = auc
                train_loss, test_loss, auc = calc_result(reg, x_train, rating0,
                                                         x_test, rating1, dr=dr3)
                train_loss_ulsif[i, idx_model] = train_loss
                test_loss_ulsif[i, idx_model] = test_loss
                auc_ulsif[i, idx_model] = auc
                train_loss, test_loss, auc = calc_result(reg, x_train, rating0,
                                                         x_test, rating1, dr=dr4)
                train_loss_nnpu[i, idx_model] = train_loss
                test_loss_nnpu[i, idx_model] = test_loss
                auc_nnpu[i, idx_model] = auc
                train_loss, test_loss, auc = calc_result(reg, x_train, rating0,
                                                         x_test, rating1, dr=dr5)
                train_loss_nnulsif[i, idx_model] = train_loss
                test_loss_nnulsif[i, idx_model] = test_loss
                auc_nnulsif[i, idx_model] = auc
                print('0:normal', test_loss_normal)
                print('0:nnulsif', test_loss_nnulsif)
                print('0:nnpu', test_loss_nnpu)
                print('0:normal', auc_normal)
                print('0:nnulsif', auc_nnulsif)
                print('0:nnpu', auc_nnpu)
                # Model 1: kernel ridge regression.
                # reg = KernelRidge(alpha=1, kernel='rbf', gamma=0.1)
                # reg = KernelRidge(alpha=0.1, kernel='rbf', gamma=1)
                reg = KernelRidge()
                reg = GridSearchCV(reg, {
                    'kernel': ['rbf'],
                    'alpha': [0.0001, 0.001, 0.01, 0.1, 1],
                    'gamma': [0.001, 0.01, 0.1, 1]
                }, cv=5)
                idx_model = 1
                x_train = X_pca0
                x_test = X_pca1
                train_loss, test_loss, auc = calc_result(reg, x_train, rating0,
                                                         x_test, rating1, dr=None)
                train_loss_normal[i, idx_model] = train_loss
                test_loss_normal[i, idx_model] = test_loss
                auc_normal[i, idx_model] = auc
                train_loss, test_loss, auc = calc_result(reg, x_train, rating0,
                                                         x_test, rating1, dr=dr0)
                train_loss_kerulsif[i, idx_model] = train_loss
                test_loss_kerulsif[i, idx_model] = test_loss
                auc_kerulsif[i, idx_model] = auc
                train_loss, test_loss, auc = calc_result(reg, x_train, rating0,
                                                         x_test, rating1, dr=dr1)
                train_loss_kerkleip[i, idx_model] = train_loss
                test_loss_kerkleip[i, idx_model] = test_loss
                auc_kerkleip[i, idx_model] = auc
                train_loss, test_loss, auc = calc_result(reg, x_train, rating0,
                                                         x_test, rating1, dr=dr2)
                train_loss_pu[i, idx_model] = train_loss
                test_loss_pu[i, idx_model] = test_loss
                auc_pu[i, idx_model] = auc
                train_loss, test_loss, auc = calc_result(reg, x_train, rating0,
                                                         x_test, rating1, dr=dr3)
                train_loss_ulsif[i, idx_model] = train_loss
                test_loss_ulsif[i, idx_model] = test_loss
                auc_ulsif[i, idx_model] = auc
                train_loss, test_loss, auc = calc_result(reg, x_train, rating0,
                                                         x_test, rating1, dr=dr4)
                train_loss_nnpu[i, idx_model] = train_loss
                test_loss_nnpu[i, idx_model] = test_loss
                auc_nnpu[i, idx_model] = auc
                train_loss, test_loss, auc = calc_result(reg, x_train, rating0,
                                                         x_test, rating1, dr=dr5)
                train_loss_nnulsif[i, idx_model] = train_loss
                test_loss_nnulsif[i, idx_model] = test_loss
                auc_nnulsif[i, idx_model] = auc
                print('1:normal', test_loss_normal)
                print('1:nnulsif', test_loss_nnulsif)
                '''
                # Model 2 (disabled): polynomial kernel ridge on the raw TF-IDF features.
                reg = KernelRidge()
                reg = GridSearchCV(reg, {'kernel': ['polynomial'],
                                         'alpha': [0.0001, 0.001, 0.01, 0.1, 1],
                                         'gamma': [2, 3, 4, 5]}, cv=5)
                idx_model = 2
                x_train = X0
                x_test = X1
                train_loss, test_loss, auc = calc_result(reg, x_train, rating0, x_test, rating1, dr=None)
                train_loss_normal[i, idx_model] = train_loss
                test_loss_normal[i, idx_model] = test_loss
                auc_normal[i, idx_model] = auc
                train_loss, test_loss, auc = calc_result(reg, x_train, rating0, x_test, rating1, dr=dr0)
                train_loss_kerulsif[i, idx_model] = train_loss
                test_loss_kerulsif[i, idx_model] = test_loss
                auc_kerulsif[i, idx_model] = auc
                train_loss, test_loss, auc = calc_result(reg, x_train, rating0, x_test, rating1, dr=dr1)
                train_loss_kerkleip[i, idx_model] = train_loss
                test_loss_kerkleip[i, idx_model] = test_loss
                auc_kerkleip[i, idx_model] = auc
                train_loss, test_loss, auc = calc_result(reg, x_train, rating0, x_test, rating1, dr=dr2)
                train_loss_pu[i, idx_model] = train_loss
                test_loss_pu[i, idx_model] = test_loss
                auc_pu[i, idx_model] = auc
                train_loss, test_loss, auc = calc_result(reg, x_train, rating0, x_test, rating1, dr=dr3)
                train_loss_ulsif[i, idx_model] = train_loss
                test_loss_ulsif[i, idx_model] = test_loss
                auc_ulsif[i, idx_model] = auc
                train_loss, test_loss, auc = calc_result(reg, x_train, rating0, x_test, rating1, dr=dr4)
                train_loss_nnpu[i, idx_model] = train_loss
                test_loss_nnpu[i, idx_model] = test_loss
                auc_nnpu[i, idx_model] = auc
                train_loss, test_loss, auc = calc_result(reg, x_train, rating0, x_test, rating1, dr=dr5)
                train_loss_nnulsif[i, idx_model] = train_loss
                test_loss_nnulsif[i, idx_model] = test_loss
                auc_nnulsif[i, idx_model] = auc
                '''
                seed += 1
            np.savetxt('results/train_loss_normal_%s_%s.csv' % (f_name0, f_name1),
                       train_loss_normal, delimiter=',')
            np.savetxt('results/test_loss_normal_%s_%s.csv' % (f_name0, f_name1),
                       test_loss_normal, delimiter=',')
            np.savetxt('results/auc_normal_%s_%s.csv' % (f_name0, f_name1),
                       auc_normal, delimiter=',')
            np.savetxt('results/train_loss_kerulsif_%s_%s.csv' % (f_name0, f_name1),
                       train_loss_kerulsif, delimiter=',')
            np.savetxt('results/test_loss_kerulsif_%s_%s.csv' % (f_name0, f_name1),
                       test_loss_kerulsif, delimiter=',')
            np.savetxt('results/auc_kerulsif_%s_%s.csv' % (f_name0, f_name1),
                       auc_kerulsif, delimiter=',')
            np.savetxt('results/train_loss_kerkleip_%s_%s.csv' % (f_name0, f_name1),
                       train_loss_kerkleip, delimiter=',')
            np.savetxt('results/test_loss_kerkleip_%s_%s.csv' % (f_name0, f_name1),
                       test_loss_kerkleip, delimiter=',')
            np.savetxt('results/auc_kerkleip_%s_%s.csv' % (f_name0, f_name1),
                       auc_kerkleip, delimiter=',')
            np.savetxt('results/train_loss_pu_%s_%s.csv' % (f_name0, f_name1),
                       train_loss_pu, delimiter=',')
            np.savetxt('results/test_loss_pu_%s_%s.csv' % (f_name0, f_name1),
                       test_loss_pu, delimiter=',')
            np.savetxt('results/auc_pu_%s_%s.csv' % (f_name0, f_name1),
                       auc_pu, delimiter=',')
            np.savetxt('results/train_loss_ulsif_%s_%s.csv' % (f_name0, f_name1),
                       train_loss_ulsif, delimiter=',')
            np.savetxt('results/test_loss_ulsif_%s_%s.csv' % (f_name0, f_name1),
                       test_loss_ulsif, delimiter=',')
            np.savetxt('results/auc_ulsif_%s_%s.csv' % (f_name0, f_name1),
                       auc_ulsif, delimiter=',')
            np.savetxt('results/train_loss_nnpu_%s_%s.csv' % (f_name0, f_name1),
                       train_loss_nnpu, delimiter=',')
            np.savetxt('results/test_loss_nnpu_%s_%s.csv' % (f_name0, f_name1),
                       test_loss_nnpu, delimiter=',')
            np.savetxt('results/auc_nnpu_%s_%s.csv' % (f_name0, f_name1),
                       auc_nnpu, delimiter=',')
            np.savetxt('results/train_loss_nnulsif_%s_%s.csv' % (f_name0, f_name1),
                       train_loss_nnulsif, delimiter=',')
            np.savetxt('results/test_loss_nnulsif_%s_%s.csv' % (f_name0, f_name1),
                       test_loss_nnulsif, delimiter=',')
            np.savetxt('results/auc_nnulsif_%s_%s.csv' % (f_name0, f_name1),
                       auc_nnulsif, delimiter=',')
def rulsif_analysis(input_data, k, n, perform_hyperparameter_estimation,
                    alpha, anomaly_type='change_points', transform='no_transform'):
    # Unpack input.
    counts, energy_range, times = input_data

    # Transform counts if required.
    counts = Transformation(transform).transform(counts)

    # Pack sequence into blocks.
    counts_packed = pack(counts, k)

    # Median distance between subsequences.
    dmed = get_median_pairwise_distance(counts_packed)

    # Hyperparameter ranges suggested by the reference.
    sigma_range = np.array([0.6 * dmed, 0.8 * dmed, 1.0 * dmed,
                            1.2 * dmed, 1.4 * dmed])
    sigma_forward_range = sigma_backward_range = sigma_range
    lambda_range = np.array([1e-3, 1e-2, 1e-1, 1e0, 1e1])
    lambda_forward_range = lambda_backward_range = lambda_range

    # Restrict the ranges further by taking the most common hyperparameters
    # selected when fitting random samples.
    if perform_hyperparameter_estimation:
        sigma_forward_range, sigma_backward_range, lambda_forward_range, lambda_backward_range = \
            estimate_hyperparameters(counts_packed, window_size=n,
                                     sigma_range=sigma_range,
                                     lambda_range=lambda_range,
                                     alpha=alpha, num_rank=2)

    # Change-point scores.
    packed_sequence_size = counts_packed.shape[0]
    original_sequence_size = counts.shape[0]
    scores = np.zeros(original_sequence_size)

    # Sliding window over the packed sequence.
    for i in range(n, packed_sequence_size - n + 1):
        forward_window = counts_packed[i:i + n]
        backward_window = counts_packed[i - n:i]
        forward_density_obj = densratio(backward_window, forward_window,
                                        alpha=alpha,
                                        sigma_range=sigma_forward_range,
                                        lambda_range=lambda_forward_range,
                                        verbose=False)
        forward_divergence = forward_density_obj.alpha_PE
        backward_density_obj = densratio(forward_window, backward_window,
                                         alpha=alpha,
                                         sigma_range=sigma_backward_range,
                                         lambda_range=lambda_backward_range,
                                         verbose=False)
        backward_divergence = backward_density_obj.alpha_PE
        change_point_score = forward_divergence + backward_divergence

        # Retry with a larger hyperparameter range if the fit was poor.
        if change_point_score < 0:
            sigma_range = np.array([0.7 * dmed, 0.8 * dmed, 0.9 * dmed, dmed,
                                    1.1 * dmed, 1.2 * dmed, 1.3 * dmed])
            forward_density_obj = densratio(backward_window, forward_window,
                                            alpha=alpha, sigma_range=sigma_range,
                                            verbose=False)
            forward_divergence = forward_density_obj.alpha_PE
            backward_density_obj = densratio(forward_window, backward_window,
                                             alpha=alpha, sigma_range=sigma_range,
                                             verbose=False)
            backward_divergence = backward_density_obj.alpha_PE
            change_point_score = forward_divergence + backward_divergence

        # Shift the score ahead to compensate for packing.
        scores[i + k // 2] = change_point_score

    # Clamp scores at 0; no negative values.
    scores[scores < 0] = 0

    # For change-points, return times and scores directly.
    # For other anomaly types, convert the scores to intervals.
    if anomaly_type == 'change_points':
        return times, scores
    elif anomaly_type in ['bimodality', 'negative_ions']:
        intervals = []
        interval_scores = []

        # Locate the current maximum.
        max_index = np.argmax(scores)
        max_score = np.max(scores)
        while max_score > 0:
            # Idea: full width at quarter maximum.
            # Walk left from the peak.
            for start_index, score in reversed(list(enumerate(scores[:max_index]))):
                if score < max_score / 4:
                    start_index += 1
                    break
            # Walk right from the peak.
            for end_index, score in enumerate(scores[max_index:], start=max_index):
                if score < max_score / 4:
                    break
            # Record the interval, scored by the total score of all points within.
            if start_index != end_index:
                intervals.append((times[start_index], times[end_index]))
                interval_scores.append(np.sum(scores[start_index:end_index]))
            # Mask these indices.
            scores[start_index:end_index] = -np.inf
            # Locate the next maximum.
            max_index = np.argmax(scores)
            max_score = np.max(scores)

        # Aggregate zero-scored timesteps into intervals.
        start_index = 0
        while start_index < len(scores):
            if scores[start_index] == 0:
                for end_index, score in enumerate(scores[start_index:],
                                                  start=start_index):
                    if score == -np.inf:
                        break
                intervals.append((times[start_index], times[end_index]))
                interval_scores.append(np.mean(scores[start_index:end_index]))
                start_index = end_index
            start_index += 1
        return intervals, interval_scores
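# Hedged usage sketch (not part of the original source): run the analysis on a
# synthetic 1-D sequence with a mean shift. `Transformation`, `pack`, and
# `get_median_pairwise_distance` from the same module are assumed importable;
# the k and n values below are illustrative choices, not the original settings.
rng = np.random.RandomState(0)
counts = np.concatenate([rng.normal(0, 1, 500), rng.normal(3, 1, 500)])
times = np.arange(len(counts), dtype=float)
times_out, scores = rulsif_analysis((counts, None, times), k=5, n=50,
                                    perform_hyperparameter_estimation=False,
                                    alpha=0.1, anomaly_type='change_points')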
def main(els_data_file, output_file, perform_hyperparameter_estimation,
         load_from_file, save_to_file, quantity, start_time, end_time,
         run_tests, plot_processed_sequence, k, n):
    # Check input arguments: start and end times must parse, else a
    # ValueError propagates to the caller.
    if start_time is not None:
        start_time = datetime.strptime(start_time, '%d-%m-%Y/%H:%M')
    else:
        start_time = datetime.min
    if end_time is not None:
        end_time = datetime.strptime(end_time, '%d-%m-%Y/%H:%M').replace(
            second=59, microsecond=999999)
    else:
        end_time = datetime.max

    # Run doctests.
    if run_tests:
        import doctest
        import data_utils
        doctest.testmod(data_utils, verbose=True,
                        optionflags=doctest.NORMALIZE_WHITESPACE)
        doctest.testmod(verbose=True, optionflags=doctest.NORMALIZE_WHITESPACE)

    # RuLSIF parameter.
    alpha = 0.1

    # Set random seed for reproducibility.
    random_seed = 7
    np.random.seed(random_seed)

    # Load the processed sequence, if a sequence file is found.
    els_sequence_file = os.path.splitext(els_data_file)[0] + '_RuLSIF_sequence'
    if load_from_file and os.path.exists(els_sequence_file + '.npz'):
        print('Loading processed sequence from sequence file...')
        filedata = np.load(els_sequence_file + '.npz')
        counts_packed = filedata['counts_packed']
        energy_range = filedata['energy_range']
        times = filedata['times']
        dmed = filedata['dmed']
    else:
        print('Sequence file not found. Extracting data from original ELS file and processing...')
        counts, energy_range, times = get_ELS_data(els_data_file, quantity,
                                                   start_time, end_time)

        # Optional preprocessing steps, disabled by default:
        # counts = gaussian_blur(counts, sigma=0.5)
        # counts = np.ma.log(counts)

        # Plot the sequence (line plot for 1-D data, colour plot for 2-D data).
        if plot_processed_sequence:
            print('Plotting processed sequence...')
            fig, ax = plt.subplots(1, 1)
            ax.set_title('Processed Sequence')
            if len(counts.shape) == 1:
                ax.xaxis_date()
                ax.xaxis.set_major_formatter(mdates.DateFormatter('%d-%m-%Y/%H:%M'))
                fig.autofmt_xdate()
                ax.plot(times, counts)
            elif len(counts.shape) == 2:
                plt.imshow(counts.T, origin='lower', interpolation='none')
                ax.set_aspect('auto')
                plt.colorbar(ax=ax, orientation='vertical')
            plt.show()

        # Pack sequence into blocks.
        print('Packing sequence into blocks...')
        counts_packed = pack(counts, k)
        print('Sequence packed into shape %s.' % (counts_packed.shape,))

        # Median distance between subsequences.
        print('Computing median distance between packed samples...')
        dmed = get_median_pairwise_distance(counts_packed)
        print('Median distance between packed samples, dmed =', dmed)

        # Save values to file.
        if save_to_file:
            arrays_with_names = {
                'counts_packed': counts_packed,
                'energy_range': energy_range,
                'times': times,
                'dmed': np.array(dmed)
            }
            np.savez(els_sequence_file, **arrays_with_names)

    # Hyperparameter ranges suggested by the reference.
    sigma_range = np.array([dmed])
    sigma_forward_range = sigma_backward_range = sigma_range
    lambda_range = np.array([1e-3, 1e-2, 1e-1, 1e0, 1e1])
    lambda_forward_range = lambda_backward_range = lambda_range

    # Restrict the ranges further by taking the most common hyperparameters
    # selected when fitting random samples.
    if perform_hyperparameter_estimation:
        els_hyperparameters_file = os.path.splitext(els_data_file)[0] + '_RuLSIF_hyperparameters'
        if load_from_file and os.path.exists(els_hyperparameters_file + '.npz'):
            print('Hyperparameters file found. Loading from file...')
            filedata = np.load(els_hyperparameters_file + '.npz')
            sigma_forward_range = filedata['sigma_forward_range']
            sigma_backward_range = filedata['sigma_backward_range']
            lambda_forward_range = filedata['lambda_forward_range']
            lambda_backward_range = filedata['lambda_backward_range']
        else:
            print('Hyperparameters file not found. Performing estimation...')
            sigma_forward_range, sigma_backward_range, \
                lambda_forward_range, lambda_backward_range = \
                estimate_hyperparameters(counts_packed, window_size=n,
                                         sigma_range=sigma_range,
                                         lambda_range=lambda_range,
                                         alpha=alpha, num_rank=2)
            if save_to_file:
                arrays_with_names = {
                    'sigma_forward_range': sigma_forward_range,
                    'sigma_backward_range': sigma_backward_range,
                    'lambda_forward_range': lambda_forward_range,
                    'lambda_backward_range': lambda_backward_range
                }
                np.savez(els_hyperparameters_file, **arrays_with_names)
    print('Hyperparameters will be selected from the ranges:')
    print('sigma_forward_range =', sigma_forward_range)
    print('sigma_backward_range =', sigma_backward_range)
    print('lambda_forward_range =', lambda_forward_range)
    print('lambda_backward_range =', lambda_backward_range)

    # Change-point scores.
    packed_sequence_size = counts_packed.shape[0]
    original_sequence_size = counts.shape[0]
    scores = np.ma.masked_all(original_sequence_size)

    # Start timing here.
    timing_start = datetime.now()

    # Sliding window over the packed sequence.
    for i in range(n, packed_sequence_size - n + 1):
        forward_window = counts_packed[i:i + n]
        backward_window = counts_packed[i - n:i]
        forward_density_obj = densratio(backward_window, forward_window,
                                        alpha=alpha,
                                        sigma_range=sigma_forward_range,
                                        lambda_range=lambda_forward_range,
                                        verbose=False)
        forward_divergence = forward_density_obj.alpha_PE
        backward_density_obj = densratio(forward_window, backward_window,
                                         alpha=alpha,
                                         sigma_range=sigma_backward_range,
                                         lambda_range=lambda_backward_range,
                                         verbose=False)
        backward_divergence = backward_density_obj.alpha_PE
        change_point_score = forward_divergence + backward_divergence

        # Retry with a larger hyperparameter range if the fit was poor.
        if change_point_score < 0:
            print('Bad fit with forward sigma = %0.2f, backward sigma = %0.2f.' % (
                forward_density_obj.kernel_info.sigma,
                backward_density_obj.kernel_info.sigma))
            sigma_range = np.array([0.7 * dmed, 0.8 * dmed, 0.9 * dmed, dmed,
                                    1.1 * dmed, 1.2 * dmed, 1.3 * dmed])
            forward_density_obj = densratio(backward_window, forward_window,
                                            alpha=alpha, sigma_range=sigma_range,
                                            verbose=False)
            forward_divergence = forward_density_obj.alpha_PE
            backward_density_obj = densratio(forward_window, backward_window,
                                             alpha=alpha, sigma_range=sigma_range,
                                             verbose=False)
            backward_divergence = backward_density_obj.alpha_PE
            change_point_score = forward_divergence + backward_divergence
            print('Tried again with forward sigma = %0.2f, backward sigma = %0.2f.' % (
                forward_density_obj.kernel_info.sigma,
                backward_density_obj.kernel_info.sigma))

        # Shift the score ahead to compensate for packing.
        scores[i + k // 2] = change_point_score
        print('Change-point score at time %s computed as %0.4f.' % (
            datetime.strftime(mdates.num2date(times[i + k // 2]), '%d-%m-%Y/%H:%M'),
            scores[i + k // 2]))

    # End time, and average time taken.
    timing_end = datetime.now()
    total_time = (timing_end - timing_start).total_seconds()
    num_evals = packed_sequence_size - 2 * n + 1
    print('%0.2f seconds taken for %d change-point score evaluations. '
          'Average is %0.2f evals/sec, with k = %d, and n = %d.' % (
              total_time, num_evals, num_evals / total_time, k, n))

    # Mask negative change-point scores.
    scores = np.ma.masked_less(scores, 0)

    # Plot change-point scores over windows alongside the original data.
    print('Plotting...')
    fig, (ax0, ax1) = plt.subplots(nrows=2, sharex=True)
    plot_raw_ELS_data(fig, ax0, els_data_file, quantity, start_time, end_time,
                      colorbar_range='subset', colorbar_orientation='horizontal')
    ax1.plot(times, scores)
    ax1.set_ylabel('Change-point Score')
    ax1.xaxis.set_tick_params(labelsize=8)

    # Place the title below.
    fig.text(s='Change-point Scores for ELS Data \n k = %d, n = %d' % (k, n),
             x=0.5, y=0.03, horizontalalignment='center', fontsize=13)
    plt.subplots_adjust(bottom=0.3, left=0.2)

    # Save or show the plot.
    if output_file is None:
        plt.show()
    else:
        plt.savefig(output_file, bbox_inches='tight')

    # Save scores.
    if save_to_file:
        rulsif_output_file = os.path.splitext(els_data_file)[0] + '_RuLSIF_output'
        arrays_with_names = {'scores': scores, 'times': times}
        np.savez(rulsif_output_file, **arrays_with_names)