Example #1
def compute_perf_init(dataset_p, dataset_u, n_pos, n_unl, prior, nb_reps, name_path, nb_dummies=1):
    """Compute the performances of running the partial-GW for a PU learning
    task on a given dataset several times

    Parameters
    ----------
    dataset_p: name of the dataset among which the positives are drawn

    dataset_u: name of the dataset among which the unlabeled are drawn

    n_pos: number of points in the positive dataset

    n_unl: number of points in the unlabeled dataset

    prior: percentage of positives on the dataset (s)

    nb_resp: number of runs

    nb_dummies: number of dummy points, default: no dummies
        (to avoid numerical instabilities of POT)

    Returns
    -------
    dict with:
        - the class prior
        - the performances of the p-gw (avg among the repetitions)
        - the performances of the p-gw with group constraints (avg)
        - the list of all the nb_reps performances of the p-gw
        - the list of all the nb_reps performances of the p-gw with groups
    """

    # Init paths
    path = os.getcwd() + "/saved_plans"
    if not os.path.isdir(path):
        os.mkdir(path)
    path = path + "/" + name_path
    if not os.path.isdir(path):
        os.mkdir(path)

    for i in range(nb_reps):
        # Preprocess data
        P, U, y_u = utils.draw_p_u_dataset_scar(dataset_p, dataset_u, n_pos,
                                                n_unl, prior, seed_nb=i)
        Ctot, C1, C2, mu, nu = compute_cost_matrices(P, U, prior, nb_dummies)

        # Compute the EMD plan used as initialisation
        T = ot.emd(mu, nu, Ctot)

        # Save the plan for this repetition (one file per run)
        np.save(path + f'/partial_gw_init_{dataset_p}_{n_pos}_{dataset_u}_{n_unl}_prior{prior}_reps{i}.npy',
                T)
    return


def prepare_initialisation(dataset_p, dataset_u, n_pos, n_unl, prior, nb_try):
    """Draw nb_try PU datasets and stack the corresponding EMD plans
    (restricted to the rows of the true positive points, i.e. without the
    dummy rows) to be used as initialisations of the GW solver."""
    init_plan = torch.zeros([nb_try, n_pos, n_unl])
    for i in range(nb_try):
        # Draw dataset
        P, U, _ = utils.draw_p_u_dataset_scar(dataset_p,
                                              dataset_u,
                                              n_pos,
                                              n_unl,
                                              prior,
                                              seed_nb=i)
        Ctot, C1, C2, mu, nu = compute_cost_matrices(P,
                                                     U,
                                                     prior,
                                                     nb_dummies=10)
        # Compute init
        init_plan[i] = torch.tensor(ot.emd(mu, nu, Ctot)[:n_pos, :])
    return init_plan
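A minimal usage sketch for the two helpers above. The dataset names ('mnist'), sizes and the name_path directory are placeholders, not values prescribed by this code:

if __name__ == "__main__":
    # Save one EMD initialisation per repetition under ./saved_plans/demo_run/
    compute_perf_init('mnist', 'mnist', n_pos=100, n_unl=200, prior=0.5,
                      nb_reps=5, name_path='demo_run', nb_dummies=10)

    # Or keep the stacked initialisations in memory as a torch tensor
    G0 = prepare_initialisation('mnist', 'mnist', n_pos=100, n_unl=200,
                                prior=0.5, nb_try=5)
    print(G0.shape)  # expected: torch.Size([5, 100, 200])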
Example #3
def compute_perf_pgw(dataset_p, dataset_u, n_pos, n_unl, prior, nb_reps, name_path, nb_dummies=1):
    """Compute the performances of running the partial-GW for a PU learning
    task on a given dataset several times

    Parameters
    ----------
    dataset_p: name of the dataset among which the positives are drawn

    dataset_u: name of the dataset among which the unlabeled are drawn

    n_pos: number of points in the positive dataset

    n_unl: number of points in the unlabeled dataset

    prior: percentage of positives on the dataset (s)

    nb_resp: number of runs

    nb_dummies: number of dummy points, default: no dummies
        (to avoid numerical instabilities of POT)

    Returns
    -------
    dict with:
        - the class prior
        - the performances of the p-gw (avg among the repetitions)
        - the performances of the p-gw with group constraints (avg)
        - the list of all the nb_reps performances of the p-gw
        - the list of all the nb_reps performances of the p-gw with groups
    """

    # Init paths
    path = os.getcwd() + "/saved_plans"
    if not os.path.isdir(path):
        os.mkdir(path)
    path = path + "/" + name_path
    if not os.path.isdir(path):
        os.mkdir(path)

    for i in range(nb_reps):
        # Preprocess data
        P, U, y_u = utils.draw_p_u_dataset_scar(dataset_p, dataset_u, n_pos,
                                                n_unl, prior, seed_nb=i)
        Ctot, C1, C2, mu, nu = compute_cost_matrices(P, U, prior, nb_dummies)
        nb_unl_pos = int(np.sum(y_u))

        # Store list of initialisation
        Ginit = []
        if Ctot is not None:
            T = ot.emd(mu, nu, Ctot)
            Ginit.append(T)  # We can init. with the EMD
        else:
            Ginit.append(None)
            _, Cs, _, ps, pt = compute_cost_matrices(P, U, prior, 0)
            Ginit = Ginit + initialisation_gw(ps, pt, Cs, U, prior, 10, nb_dummies)

        best_loss = np.inf
        # We test several inits (EMD if possible, outer product, barycenter)
        # and keep the one that yields the lowest loss
        transp_emd_best = None
        for init in Ginit:
            # Compute plan for given init
            transp_emd, t_loss = pu_gw_emd(C1, C2, mu, nu, nb_dummies, G0=init, log=True)

            # Keep the plan if it diminishes the loss
            if t_loss[-1] < best_loss:
                best_loss = t_loss[-1]
                transp_emd_best = transp_emd.copy()

        # Compute the marginal (light in memory) and save the best plan
        # marginal = np.sum(transp_emd_best[:n_pos,:], axis=0)
        np.save(path + f'/partial_gw_plan_{dataset_p}_{n_pos}_{dataset_u}_{n_unl}_prior{prior}_reps{i}.npy',
                transp_emd_best)
    return
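The plans saved by compute_perf_pgw are later turned into PU predictions with a dummy-point rule (used in the examples below): the unlabeled points receiving the least mass from the dummy rows are predicted positive. A self-contained sketch of that rule on synthetic numbers, not project data:

import numpy as np

rng = np.random.default_rng(0)
nb_dummies, n_unl, nb_unl_pos = 3, 10, 4          # toy sizes
plan = rng.random((8 + nb_dummies, n_unl))        # stand-in for a saved transport plan
sum_dummies = plan[-nb_dummies:].sum(axis=0)      # mass sent by the dummy points
y_hat = np.ones(n_unl)
y_hat[np.argsort(sum_dummies)[nb_unl_pos:]] = 0   # nb_unl_pos smallest -> predicted positive
print(y_hat)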
Example #4
def compute_perf_pgw(dataset_p,
                     dataset_u,
                     n_pos,
                     n_unl,
                     prior,
                     nb_reps,
                     nb_dummies=1):
    """Compute the performances of running the partial-GW for a PU learning
    task on a given dataset several times

    Parameters
    ----------
    dataset_p: name of the dataset among which the positives are drawn

    dataset_u: name of the dataset among which the unlabeled are drawn

    n_pos: number of points in the positive dataset

    n_unl: number of points in the unlabeled dataset

    prior: percentage of positives on the dataset (s)

    nb_resp: number of runs

    nb_dummies: number of dummy points, default: no dummies
        (to avoid numerical instabilities of POT)

    Returns
    -------
    dict with:
        - the class prior
        - the performances of the p-gw (avg among the repetitions)
        - the performances of the p-gw with group constraints (avg)
        - the list of all the nb_reps performances of the p-gw
        - the list of all the nb_reps performances of the p-gw with groups
    """

    perfs = {'class_prior': prior, 'pgw': 0, 'pgw_groups': 0}
    perfs_list = {'pgw': [], 'pgw_groups': []}
    start_time = time.time()
    for i in range(nb_reps):
        P, U, y_u = utils.draw_p_u_dataset_scar(dataset_p, dataset_u, n_pos,
                                                n_unl, prior, seed_nb=i)
        Ctot, C1, C2, mu, nu = compute_cost_matrices(P, U, prior, nb_dummies)
        nb_unl_pos = int(np.sum(y_u))

        Ginit = []
        if Ctot is not None:
            Ginit.append(ot.emd(mu, nu, Ctot))  # We can init. with the EMD
        Ginit.append(None)
        _, Cs, _, ps, pt = compute_cost_matrices(P, U, prior, 0)
        Ginit = Ginit + initialisation_gw(ps, pt, Cs, U, prior, 10, nb_dummies)

        best_acc = -1.
        # We test several inits (EMD if possible, outer product, barycenter)
        # and keep the one that yields the best accuracy on the unlabeled set
        for init in Ginit:
            transp_emd, t_loss = pu_gw_emd(C1,
                                           C2,
                                           mu,
                                           nu,
                                           nb_dummies,
                                           group_constraints=False,
                                           G0=init,
                                           log=True)
            # Predict: the nb_unl_pos unlabeled points receiving the least
            # mass from the dummy rows are labeled positive
            y_hat = np.ones(len(y_u))
            sum_dummies = np.sum(transp_emd[-nb_dummies:], axis=0)
            y_hat[np.argsort(sum_dummies)[nb_unl_pos:]] = 0
            acc = np.mean(y_u == y_hat)
            if acc > best_acc:
                best_acc = acc
                transp_emd_best = transp_emd.copy()

        y_hat = np.ones(len(y_u))
        sum_dummies = np.sum(transp_emd_best[-nb_dummies:], axis=0)
        y_hat[np.argsort(sum_dummies)[nb_unl_pos:]] = 0
        perfs_list['pgw'].append(np.mean(y_u == y_hat))
        perfs['pgw'] += (np.mean(y_u == y_hat))

        transp_emd_group = pu_gw_emd(C1,
                                     C2,
                                     mu,
                                     nu,
                                     nb_dummies,
                                     group_constraints=True,
                                     G0=None)
        y_hat = np.ones(len(y_u))
        sum_dummies = np.sum(transp_emd_group[-nb_dummies:], axis=0)
        y_hat[np.argsort(sum_dummies)[nb_unl_pos:]] = 0
        perfs_list['pgw_groups'].append(np.mean(y_u == y_hat))
        perfs['pgw_groups'] += (np.mean(y_u == y_hat))

    perfs['pgw'] = perfs['pgw'] / nb_reps
    perfs['pgw_groups'] = perfs['pgw_groups'] / nb_reps
    perfs['time'] = time.time() - start_time
    return perfs, perfs_list
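A hedged usage sketch for compute_perf_pgw; the dataset names and sizes are placeholders:

perfs, perfs_list = compute_perf_pgw('mnist', 'mnist', n_pos=100, n_unl=200,
                                     prior=0.5, nb_reps=10, nb_dummies=10)
print(f"partial-GW accuracy: {perfs['pgw']:.3f}, "
      f"with group constraints: {perfs['pgw_groups']:.3f}, "
      f"time: {perfs['time']:.1f}s")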
Example #5
def compute_perf_emd(dataset_p,
                     dataset_u,
                     n_pos,
                     n_unl,
                     prior,
                     nb_reps,
                     nb_dummies=1):
    """Compute the performances of running the partial-W for a PU learning
    task on a given dataset several times

    Parameters
    ----------
    dataset_p: name of the dataset among which the positives are drawn

    dataset_u: name of the dataset among which the unlabeled are drawn

    n_pos: number of points in the positive dataset

    n_unl: number of points in the unlabeled dataset

    prior: percentage of positives on the dataset (s)

    nb_resp: number of runs

    nb_dummies: number of dummy points, default: no dummies
        (to avoid numerical instabilities of POT)

    Returns
    -------
    dict with:
        - the class prior
        - the performances of the p-w (avg among the repetitions)
        - the performances of the p-w with group constraints (avg)
        - the list of all the nb_reps performances of the p-w
        - the list of all the nb_reps performances of the p-w with groups
    """
    perfs = {'class_prior': prior, 'emd': 0, 'emd_groups': 0}
    perfs_list = {'emd': [], 'emd_groups': []}
    start_time = time.time()
    for i in range(nb_reps):
        P, U, y_u = utils.draw_p_u_dataset_scar(dataset_p, dataset_u, n_pos,
                                                n_unl, prior, seed_nb=i)
        Ctot, _, _, mu, nu = compute_cost_matrices(P, U, prior, nb_dummies)
        nb_unl_pos = int(np.sum(y_u))

        transp_emd = ot.emd(mu, nu, Ctot)
        y_hat = np.ones(len(y_u))
        sum_dummies = np.sum(transp_emd[-nb_dummies:], axis=0)
        y_hat[np.argsort(sum_dummies)[nb_unl_pos:]] = 0
        perfs_list['emd'].append(np.mean(y_u == y_hat))
        perfs['emd'] += (np.mean(y_u == y_hat))

        transp_emd_group = pu_w_emd(mu, nu, Ctot, nb_dummies)
        y_hat = np.ones(len(y_u))
        sum_dummies = np.sum(transp_emd_group[-nb_dummies:], axis=0)
        y_hat[np.argsort(sum_dummies)[nb_unl_pos:]] = 0
        perfs_list['emd_groups'].append(np.mean(y_u == y_hat))
        perfs['emd_groups'] += (np.mean(y_u == y_hat))

    perfs['emd'] = perfs['emd'] / nb_reps
    perfs['emd_groups'] = perfs['emd_groups'] / nb_reps
    perfs['time'] = time.time() - start_time
    return perfs, perfs_list
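The same kind of experiment can be swept over the class prior with compute_perf_emd; every value below (dataset names, sizes, priors) is a placeholder:

for prior in [0.3, 0.5, 0.7]:
    perfs, perfs_list = compute_perf_emd('mnist', 'mnist', n_pos=100,
                                         n_unl=200, prior=prior, nb_reps=10,
                                         nb_dummies=10)
    print(prior, perfs['emd'], perfs['emd_groups'],
          np.std(perfs_list['emd']))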
Example #6
            if not os.path.isfile(path + fname):
                print('skipped')
                continue
            print(f'processing {data_pos, data_unl, prior, eps, rho, rho2}')
            pi = np.load(path + fname)

            row = []
            row.append(data_pos)
            row.append(data_unl)
            row.append(prior)
            row.append(eps)
            row.append(rho)
            row.append(rho2)

            for i in range(nb_try):
                _, _, y_u = draw_p_u_dataset_scar(data_pos, data_unl, n_pos,
                                                  n_unl, prior, i)
                # Build prediction
                nu = pi[i]
                q = np.quantile(nu, 1 - prior)
                y_hat = nu > q
                row.append(accuracy_score(y_u, y_hat))

            df.loc[len(df)] = row

        # Save dataframe once processed
        df.to_csv(
            path +
            f'/perf_{data_pos}_{n_pos}_{data_unl}_{n_unl}_prior{prior}.csv')
        del df
    print('end')
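# Self-contained sketch (synthetic numbers, not project data) of the quantile
# rule used in the loop above: the unlabeled points whose transported mass
# exceeds the (1 - prior) quantile of the marginal are predicted positive.
import numpy as np
rng = np.random.default_rng(0)
prior, n_unl = 0.3, 10
nu_marginal = rng.random(n_unl)            # stand-in for pi[i], the marginal over U
q = np.quantile(nu_marginal, 1 - prior)
y_hat = (nu_marginal > q).astype(int)
print(y_hat, y_hat.sum())                  # roughly prior * n_unl points flagged positive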
def compute_plan_ugw(dataset_p,
                     dataset_u,
                     n_pos,
                     n_unl,
                     prior,
                     eps,
                     rho,
                     rho2,
                     nb_try,
                     solver,
                     device=0):
    """Compute unbalanced-GW plans on GPU for nb_try random draws of the PU
    dataset and save the marginals over the unlabeled points (for the
    initialisation and for the final plan) as .npy files. Assumes a
    module-level `path` pointing to the output directory."""
    # Set default type and GPU device
    torch.cuda.set_device(device)
    torch.set_default_tensor_type('torch.cuda.FloatTensor')

    # Draw cost for all seeds as batch
    Cx = torch.zeros([nb_try, n_pos, n_pos])
    Cy = torch.zeros([nb_try, n_unl, n_unl])
    for i in range(nb_try):
        P, U, y_u = utils.draw_p_u_dataset_scar(dataset_p,
                                                dataset_u,
                                                n_pos,
                                                n_unl,
                                                prior,
                                                seed_nb=i)
        P = torch.tensor(P.values, dtype=torch.float)
        U = torch.tensor(U.values, dtype=torch.float)
        cx, cy = euclid_dist(P, P), euclid_dist(U, U)
        Cx[i], Cy[i] = cx, cy
    del cx, cy

    # Compute init and weights
    mu = (torch.ones([n_pos]) / n_pos).expand(nb_try, -1)
    nu = (torch.ones([n_unl]) / n_unl).expand(nb_try, -1)
    if P.shape[1] == U.shape[1]:
        init_plan = prepare_initialisation(dataset_p, dataset_u, n_pos, n_unl,
                                           prior, nb_try)
    else:
        solv_init = BatchLowerBoundSolver(nits_sinkhorn=50000,
                                          tol_sinkhorn=1e-5,
                                          eps=eps,
                                          rho=rho,
                                          rho2=rho2)
        _, _, init_plan = solv_init.compute_plan(mu,
                                                 Cx,
                                                 nu,
                                                 Cy,
                                                 exp_form=False)

    # Compute the marginal and save as file
    pi_numpy = init_plan.sum(dim=1).cpu().data.numpy()
    fname = f'/ugw_init_{dataset_p}_{n_pos}_{dataset_u}_{n_unl}_prior{prior}_eps{eps}_rho{rho}_rho{rho2}_reps{nb_try}.npy'
    np.save(path + fname, pi_numpy)

    # Set params and start the grid wrt entropic param eps
    solver.set_rho(rho, rho2)
    solver.set_eps(eps)
    pi = solver.ugw_sinkhorn(mu, Cx, nu, Cy, init=init_plan)
    if torch.any(torch.isnan(pi)):
        raise Exception(
            f"Solver got NaN plan on datasets {dataset_p, dataset_u} with "
            f"nb_try = {nb_try}, eps = {solver.get_eps()}, rho = {solver.get_rho()}"
        )

    # Compute the marginal and save as file
    pi_numpy = pi.sum(dim=1).cpu().data.numpy()
    fname = f'/ugw_plan_{dataset_p}_{n_pos}_{dataset_u}_{n_unl}_prior{prior}_eps{eps}_rho{rho}_rho{rho2}_reps{nb_try}.npy'
    np.save(path + fname, pi_numpy)

    print(
        f"DONE = Dataset {dataset_p, dataset_u}, eps = {eps}, rho = {rho, rho2} , reps = {nb_try}"
    )
    return
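compute_plan_ugw only relies on a few methods of its solver argument. Below is a hypothetical stub (an assumption, not part of the project) illustrating that interface; the real solver is an unbalanced-GW Sinkhorn solver, whereas this mock simply returns the independent coupling mu ⊗ nu:

import torch

class MockUGWSolver:
    """Hypothetical stand-in exposing the methods used by compute_plan_ugw."""
    def set_rho(self, rho, rho2):
        self.rho, self.rho2 = rho, rho2

    def set_eps(self, eps):
        self.eps = eps

    def get_rho(self):
        return self.rho

    def get_eps(self):
        return self.eps

    def ugw_sinkhorn(self, mu, Cx, nu, Cy, init=None):
        # Trivial batched plan: the outer product mu ⊗ nu for each repetition
        return torch.einsum('bi,bj->bij', mu, nu)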