import os
import time

import numpy as np
import ot
import pandas as pd
import torch
from sklearn.metrics import accuracy_score

import utils
from utils import draw_p_u_dataset_scar
# Repo-local helpers (compute_cost_matrices, pu_gw_emd, pu_w_emd,
# initialisation_gw, euclid_dist, BatchLowerBoundSolver) and the module-level
# `path` used by the UGW functions below are assumed to be defined or
# imported elsewhere in this file.


def compute_perf_init(dataset_p, dataset_u, n_pos, n_unl, prior, nb_reps,
                      name_path, nb_dummies=1):
    """Compute and save the EMD initialisation plans used by partial-GW for
    a PU learning task on a given dataset, over several runs.

    Parameters
    ----------
    dataset_p: name of the dataset from which the positives are drawn
    dataset_u: name of the dataset from which the unlabeled are drawn
    n_pos: number of points in the positive dataset
    n_unl: number of points in the unlabeled dataset
    prior: proportion of positives among the unlabeled points
    nb_reps: number of runs
    name_path: name of the subdirectory of ./saved_plans where the plans
        are stored
    nb_dummies: number of dummy points (default 1); dummies mitigate the
        numerical instabilities of POT

    Returns
    -------
    None. One EMD initialisation plan per repetition is saved as a .npy file
    under ./saved_plans/name_path.
    """
    # Init paths
    path = os.getcwd() + "/saved_plans"
    if not os.path.isdir(path):
        os.mkdir(path)
    path = path + "/" + name_path
    if not os.path.isdir(path):
        os.mkdir(path)

    for i in range(nb_reps):
        # Preprocess data
        P, U, y_u = utils.draw_p_u_dataset_scar(dataset_p, dataset_u, n_pos,
                                                n_unl, prior, i)  # seed=i
        Ctot, C1, C2, mu, nu = compute_cost_matrices(P, U, prior, nb_dummies)

        # Compute the EMD initialisation and save the plan
        T = ot.emd(mu, nu, Ctot)
        np.save(path + f'/partial_gw_init_{dataset_p}_{n_pos}_{dataset_u}_'
                       f'{n_unl}_prior{prior}_reps{i}.npy', T)
    return
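# Usage sketch for compute_perf_init. The dataset names, sizes and
# name_path below are assumptions; any pair accepted by
# utils.draw_p_u_dataset_scar works the same way.
def _example_compute_perf_init():
    compute_perf_init('mnist', 'mnist', n_pos=500, n_unl=800, prior=0.5,
                      nb_reps=10, name_path='mnist_example', nb_dummies=10)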
def prepare_initialisation(dataset_p, dataset_u, n_pos, n_unl, prior, nb_try):
    """Compute the EMD initialisation plan for every seed and stack them as
    a batch of shape [nb_try, n_pos, n_unl] (dummy rows are dropped)."""
    init_plan = torch.zeros([nb_try, n_pos, n_unl])
    for i in range(nb_try):
        # Draw dataset
        P, U, _ = utils.draw_p_u_dataset_scar(dataset_p, dataset_u, n_pos,
                                              n_unl, prior, seed_nb=i)
        Ctot, C1, C2, mu, nu = compute_cost_matrices(P, U, prior,
                                                     nb_dummies=10)
        # Compute init
        init_plan[i] = torch.tensor(ot.emd(mu, nu, Ctot)[:n_pos, :])
    return init_plan
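# Usage sketch for prepare_initialisation: the result stacks one EMD plan
# per seed. Dataset names and sizes are assumptions.
def _example_prepare_initialisation():
    init_plan = prepare_initialisation('mnist', 'mnist', n_pos=500,
                                       n_unl=800, prior=0.5, nb_try=10)
    print(init_plan.shape)  # torch.Size([10, 500, 800])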
def compute_plan_pgw(dataset_p, dataset_u, n_pos, n_unl, prior, nb_reps,
                     name_path, nb_dummies=1):
    """Compute and save the best partial-GW plan for a PU learning task on a
    given dataset, over several runs.

    Parameters
    ----------
    dataset_p: name of the dataset from which the positives are drawn
    dataset_u: name of the dataset from which the unlabeled are drawn
    n_pos: number of points in the positive dataset
    n_unl: number of points in the unlabeled dataset
    prior: proportion of positives among the unlabeled points
    nb_reps: number of runs
    name_path: name of the subdirectory of ./saved_plans where the plans
        are stored
    nb_dummies: number of dummy points (default 1); dummies mitigate the
        numerical instabilities of POT

    Returns
    -------
    None. For each repetition, the plan achieving the lowest final loss among
    the tested initialisations is saved as a .npy file under
    ./saved_plans/name_path.
    """
    # Init paths
    path = os.getcwd() + "/saved_plans"
    if not os.path.isdir(path):
        os.mkdir(path)
    path = path + "/" + name_path
    if not os.path.isdir(path):
        os.mkdir(path)

    for i in range(nb_reps):
        # Preprocess data
        P, U, y_u = utils.draw_p_u_dataset_scar(dataset_p, dataset_u, n_pos,
                                                n_unl, prior, i)  # seed=i
        Ctot, C1, C2, mu, nu = compute_cost_matrices(P, U, prior, nb_dummies)

        # Store list of initialisations
        Ginit = []
        if Ctot is not None:
            Ginit.append(ot.emd(mu, nu, Ctot))  # We can init. with the EMD
        else:
            Ginit.append(None)
        _, Cs, _, ps, pt = compute_cost_matrices(P, U, prior, 0)
        Ginit = Ginit + initialisation_gw(ps, pt, Cs, U, prior, 10,
                                          nb_dummies)

        # We test several inits (EMD if possible, outer product, barycenter)
        # and keep the one that provides the best (lowest) loss
        best_loss = 1e6
        transp_emd_best = None
        for init in Ginit:
            # Compute plan for the given init
            transp_emd, t_loss = pu_gw_emd(C1, C2, mu, nu, nb_dummies,
                                           G0=init, log=True)
            # Keep the plan if it decreases the loss
            if t_loss[-1] < best_loss:
                best_loss = t_loss[-1]
                transp_emd_best = transp_emd.copy()

        # Save the best plan; storing only the marginal would be lighter in
        # memory, e.g.:
        # marginal = np.sum(transp_emd_best[:n_pos, :], axis=0)
        np.save(path + f'/partial_gw_plan_{dataset_p}_{n_pos}_{dataset_u}_'
                       f'{n_unl}_prior{prior}_reps{i}.npy', transp_emd_best)
    return
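# Loading a plan saved by compute_plan_pgw (the name_path, dataset names and
# parameter values in the filename below are assumptions):
def _example_load_pgw_plan():
    fname = (os.getcwd() + '/saved_plans/mnist_example/'
             'partial_gw_plan_mnist_500_mnist_800_prior0.5_reps0.npy')
    transp = np.load(fname)
    # The last nb_dummies rows carry the mass sent to the dummy points
    print(transp.shape, transp.sum())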
def compute_perf_pgw(dataset_p, dataset_u, n_pos, n_unl, prior, nb_reps,
                     nb_dummies=1):
    """Compute the performance of running partial-GW for a PU learning task
    on a given dataset, averaged over several runs.

    Parameters
    ----------
    dataset_p: name of the dataset from which the positives are drawn
    dataset_u: name of the dataset from which the unlabeled are drawn
    n_pos: number of points in the positive dataset
    n_unl: number of points in the unlabeled dataset
    prior: proportion of positives among the unlabeled points
    nb_reps: number of runs
    nb_dummies: number of dummy points (default 1); dummies mitigate the
        numerical instabilities of POT

    Returns
    -------
    perfs: dict with the class prior, the performance of p-GW (averaged over
        the repetitions), the performance of p-GW with group constraints
        (averaged), and the total running time
    perfs_list: dict with the list of all nb_reps performances of p-GW and
        of p-GW with group constraints
    """
    perfs = {}
    perfs['class_prior'] = prior
    perfs['pgw'] = 0
    perfs['pgw_groups'] = 0
    perfs_list = {}
    perfs_list['pgw'] = []
    perfs_list['pgw_groups'] = []

    start_time = time.time()
    for i in range(nb_reps):
        P, U, y_u = utils.draw_p_u_dataset_scar(dataset_p, dataset_u, n_pos,
                                                n_unl, prior, i)  # seed=i
        Ctot, C1, C2, mu, nu = compute_cost_matrices(P, U, prior, nb_dummies)
        nb_unl_pos = int(np.sum(y_u))

        # Store list of initialisations
        Ginit = []
        if Ctot is not None:
            Ginit.append(ot.emd(mu, nu, Ctot))  # We can init. with the EMD
        Ginit.append(None)
        _, Cs, _, ps, pt = compute_cost_matrices(P, U, prior, 0)
        Ginit = Ginit + initialisation_gw(ps, pt, Cs, U, prior, 10,
                                          nb_dummies)

        # We test several inits (EMD if possible, outer product, barycenter)
        # and keep the one that yields the best accuracy
        best_loss = 0
        transp_emd_best = None
        for init in Ginit:
            transp_emd, t_loss = pu_gw_emd(C1, C2, mu, nu, nb_dummies,
                                           group_constraints=False, G0=init,
                                           log=True)
            # Points receiving the least mass from the dummy rows are
            # predicted positive
            y_hat = np.ones(len(y_u))
            sum_dummies = np.sum(transp_emd[-nb_dummies:], axis=0)
            y_hat[np.argsort(sum_dummies)[nb_unl_pos:]] = 0
            t_loss[-1] = np.mean(y_u == y_hat)
            if t_loss[-1] >= best_loss:
                best_loss = t_loss[-1]
                transp_emd_best = transp_emd.copy()

        y_hat = np.ones(len(y_u))
        sum_dummies = np.sum(transp_emd_best[-nb_dummies:], axis=0)
        y_hat[np.argsort(sum_dummies)[nb_unl_pos:]] = 0
        perfs_list['pgw'].append(np.mean(y_u == y_hat))
        perfs['pgw'] += np.mean(y_u == y_hat)

        # Same prediction rule with the group-constrained plan
        transp_emd_group = pu_gw_emd(C1, C2, mu, nu, nb_dummies,
                                     group_constraints=True, G0=None)
        y_hat = np.ones(len(y_u))
        sum_dummies = np.sum(transp_emd_group[-nb_dummies:], axis=0)
        y_hat[np.argsort(sum_dummies)[nb_unl_pos:]] = 0
        perfs_list['pgw_groups'].append(np.mean(y_u == y_hat))
        perfs['pgw_groups'] += np.mean(y_u == y_hat)

    perfs['pgw'] = perfs['pgw'] / nb_reps
    perfs['pgw_groups'] = perfs['pgw_groups'] / nb_reps
    perfs['time'] = time.time() - start_time
    return perfs, perfs_list
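# Minimal sketch of the dummy-point decision rule shared by the performance
# functions above: unlabeled points receiving the least mass from the dummy
# rows are predicted positive, and exactly nb_unl_pos of them are kept. All
# numbers below are made up.
def _example_dummy_rule():
    rng = np.random.RandomState(0)
    transp = rng.rand(501, 800)      # plan with 1 dummy row appended
    nb_dummies, nb_unl_pos = 1, 400
    sum_dummies = np.sum(transp[-nb_dummies:], axis=0)
    y_hat = np.ones(800)
    # The nb_unl_pos points with the least dummy mass stay positive
    y_hat[np.argsort(sum_dummies)[nb_unl_pos:]] = 0
    print(int(y_hat.sum()))          # 400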
def compute_perf_emd(dataset_p, dataset_u, n_pos, n_unl, prior, nb_reps,
                     nb_dummies=1):
    """Compute the performance of running partial-W for a PU learning task
    on a given dataset, averaged over several runs.

    Parameters
    ----------
    dataset_p: name of the dataset from which the positives are drawn
    dataset_u: name of the dataset from which the unlabeled are drawn
    n_pos: number of points in the positive dataset
    n_unl: number of points in the unlabeled dataset
    prior: proportion of positives among the unlabeled points
    nb_reps: number of runs
    nb_dummies: number of dummy points (default 1); dummies mitigate the
        numerical instabilities of POT

    Returns
    -------
    perfs: dict with the class prior, the performance of p-W (averaged over
        the repetitions), the performance of p-W with group constraints
        (averaged), and the total running time
    perfs_list: dict with the list of all nb_reps performances of p-W and
        of p-W with group constraints
    """
    perfs = {}
    perfs['class_prior'] = prior
    perfs['emd'] = 0
    perfs['emd_groups'] = 0
    perfs_list = {}
    perfs_list['emd'] = []
    perfs_list['emd_groups'] = []

    start_time = time.time()
    for i in range(nb_reps):
        P, U, y_u = utils.draw_p_u_dataset_scar(dataset_p, dataset_u, n_pos,
                                                n_unl, prior, i)  # seed=i
        Ctot, _, _, mu, nu = compute_cost_matrices(P, U, prior, nb_dummies)
        nb_unl_pos = int(np.sum(y_u))

        # Partial-W computed as an EMD with dummy points
        transp_emd = ot.emd(mu, nu, Ctot)
        y_hat = np.ones(len(y_u))
        sum_dummies = np.sum(transp_emd[-nb_dummies:], axis=0)
        y_hat[np.argsort(sum_dummies)[nb_unl_pos:]] = 0
        perfs_list['emd'].append(np.mean(y_u == y_hat))
        perfs['emd'] += np.mean(y_u == y_hat)

        # Partial-W with group constraints
        transp_emd_group = pu_w_emd(mu, nu, Ctot, nb_dummies)
        y_hat = np.ones(len(y_u))
        sum_dummies = np.sum(transp_emd_group[-nb_dummies:], axis=0)
        y_hat[np.argsort(sum_dummies)[nb_unl_pos:]] = 0
        perfs_list['emd_groups'].append(np.mean(y_u == y_hat))
        perfs['emd_groups'] += np.mean(y_u == y_hat)

    perfs['emd'] = perfs['emd'] / nb_reps
    perfs['emd_groups'] = perfs['emd_groups'] / nb_reps
    perfs['time'] = time.time() - start_time
    return perfs, perfs_list
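# Usage sketch for compute_perf_emd (dataset names and sizes are
# assumptions; compute_perf_pgw is called the same way):
def _example_compute_perf_emd():
    perfs, perfs_list = compute_perf_emd('mnist', 'mnist', n_pos=500,
                                         n_unl=800, prior=0.5, nb_reps=10,
                                         nb_dummies=10)
    print(f"p-W: {perfs['emd']:.3f}, with groups: {perfs['emd_groups']:.3f} "
          f"({perfs['time']:.1f}s, prior {perfs['class_prior']})")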
# Evaluate the saved UGW marginals: for each parameter setting, load the
# marginal, threshold it into a PU prediction and store the accuracy of
# every repetition in a dataframe, one csv per (data_pos, data_unl, prior)
# triple. The enclosing grid loops and the dataframe construction are
# assumptions reconstructed from context; only the loop body is original.
for data_pos, data_unl in grid_datasets:
    for prior in grid_prior:
        df = pd.DataFrame(columns=['data_pos', 'data_unl', 'prior', 'eps',
                                   'rho', 'rho2']
                                  + [f'acc{i}' for i in range(nb_try)])
        for eps in grid_eps:
            for rho in grid_rho:
                for rho2 in grid_rho2:
                    fname = (f'/ugw_plan_{data_pos}_{n_pos}_{data_unl}_'
                             f'{n_unl}_prior{prior}_eps{eps}_rho{rho}_'
                             f'rho{rho2}_reps{nb_try}.npy')
                    if not os.path.isfile(path + fname):
                        print('skipped')
                        continue
                    print(f'treat {data_pos, data_unl, prior, eps, rho, rho2}')
                    pi = np.load(path + fname)

                    row = [data_pos, data_unl, prior, eps, rho, rho2]
                    for i in range(nb_try):
                        _, _, y_u = draw_p_u_dataset_scar(data_pos, data_unl,
                                                          n_pos, n_unl,
                                                          prior, i)
                        # Build prediction: points whose transported mass
                        # exceeds the (1 - prior) quantile are predicted
                        # positive
                        nu = pi[i]
                        q = np.quantile(nu, 1 - prior)
                        y_hat = nu > q
                        row.append(accuracy_score(y_u, y_hat))
                    df.loc[len(df)] = row

        # Save dataframe once processed
        df.to_csv(path + f'/perf_{data_pos}_{n_pos}_{data_unl}_{n_unl}_'
                         f'prior{prior}.csv')
        del df
print('end')
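# Minimal sketch of the quantile rule used above for the UGW marginals:
# each unlabeled point is scored by its transported mass and thresholded at
# the (1 - prior) quantile, so a fraction `prior` of the points is predicted
# positive. All numbers below are made up.
def _example_quantile_rule():
    rng = np.random.RandomState(0)
    nu_mass = rng.rand(1000)              # marginal mass per unlabeled point
    prior = 0.2
    q = np.quantile(nu_mass, 1 - prior)   # keep the top 20% of the mass
    y_hat = nu_mass > q
    print(y_hat.mean())                   # approximately equal to prior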
def compute_plan_ugw(dataset_p, dataset_u, n_pos, n_unl, prior, eps, rho,
                     rho2, nb_try, solver, device=0):
    """Compute the UGW plans for all seeds as a batch and save the marginals
    of the initialisation and of the final plan as .npy files."""
    # Set default type and GPU device
    torch.cuda.set_device(device)
    torch.set_default_tensor_type('torch.cuda.FloatTensor')

    # Draw cost matrices for all seeds as a batch
    Cx = torch.zeros([nb_try, n_pos, n_pos])
    Cy = torch.zeros([nb_try, n_unl, n_unl])
    for i in range(nb_try):
        P, U, y_u = utils.draw_p_u_dataset_scar(dataset_p, dataset_u, n_pos,
                                                n_unl, prior, seed_nb=i)
        P = torch.tensor(P.values, dtype=torch.float)
        U = torch.tensor(U.values, dtype=torch.float)
        cx, cy = euclid_dist(P, P), euclid_dist(U, U)
        Cx[i], Cy[i] = cx, cy
        del cx, cy

    # Compute weights and initialisation
    mu = (torch.ones([n_pos]) / n_pos).expand(nb_try, -1)
    nu = (torch.ones([n_unl]) / n_unl).expand(nb_try, -1)
    if P.shape[1] == U.shape[1]:
        # Same feature space: initialise with the batched EMD plans
        init_plan = prepare_initialisation(dataset_p, dataset_u, n_pos,
                                           n_unl, prior, nb_try)
    else:
        # Different feature spaces: initialise with the lower-bound solver
        solv_init = BatchLowerBoundSolver(nits_sinkhorn=50000,
                                          tol_sinkhorn=1e-5, eps=eps,
                                          rho=rho, rho2=rho2)
        _, _, init_plan = solv_init.compute_plan(mu, Cx, nu, Cy,
                                                 exp_form=False)

    # Compute the marginal of the init and save it as a file
    pi_numpy = init_plan.sum(dim=1).cpu().data.numpy()
    fname = (f'/ugw_init_{dataset_p}_{n_pos}_{dataset_u}_{n_unl}_'
             f'prior{prior}_eps{eps}_rho{rho}_rho{rho2}_reps{nb_try}.npy')
    np.save(path + fname, pi_numpy)

    # Set the params and solve UGW for the given entropic param eps
    solver.set_rho(rho, rho2)
    solver.set_eps(eps)
    pi = solver.ugw_sinkhorn(mu, Cx, nu, Cy, init=init_plan)
    if torch.any(torch.isnan(pi)):
        raise Exception(
            f"Solver got NaN plan with params (eps, rho) = "
            f"{dataset_p, dataset_u, nb_try, solver.get_eps(), solver.get_rho()}"
        )

    # Compute the marginal of the plan and save it as a file
    pi_numpy = pi.sum(dim=1).cpu().data.numpy()
    fname = (f'/ugw_plan_{dataset_p}_{n_pos}_{dataset_u}_{n_unl}_'
             f'prior{prior}_eps{eps}_rho{rho}_rho{rho2}_reps{nb_try}.npy')
    np.save(path + fname, pi_numpy)
    print(f"DONE = Dataset {dataset_p, dataset_u}, eps = {eps}, "
          f"rho = {rho, rho2}, reps = {nb_try}")
    return
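# Usage sketch for compute_plan_ugw. The solver is any object exposing
# set_eps / set_rho / get_eps / get_rho / ugw_sinkhorn as used above; the
# dataset names and parameter values are assumptions.
def _example_compute_plan_ugw(solver):
    compute_plan_ugw('mnist', 'mnist', n_pos=500, n_unl=800, prior=0.5,
                     eps=2.0**-4, rho=2.0**-2, rho2=2.0**-2, nb_try=10,
                     solver=solver, device=0)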