Example #1
0
    def run_simulations(self, num_sims: int = 5) -> None:
        """Run `num_sims` training simulations of MF-IPS and save the results.

        For each seed the datasets are re-split, a fresh TF graph/session is
        created, and either plain MF-IPS or the asymmetric tri-training
        variant (model name containing '-at') is trained.  Per-run MSE, MAE,
        and nDCG@3 are aggregated into
        `../logs/{data}/{model_name}/results.csv`.

        Args:
            num_sims: number of independent simulations (seeds 0..num_sims-1).
        """
        results_mse = []
        results_mae = []
        results_ndcg = []

        # start running simulations
        start = time.time()
        for seed in np.arange(num_sims):
            train, val, test, num_users, num_items =\
                preprocess_datasets(data=self.data, seed=seed)

            # fresh graph + fixed seed per run for reproducibility
            ops.reset_default_graph()
            tf.set_random_seed(seed)
            sess = tf.Session()
            if '-at' not in self.model_name:
                model = MFIPS(num_users=num_users, num_items=num_items,
                              dim=self.dim, lam=self.lam, eta=self.eta)
                _, mse, mae, u_emb, i_emb, i_bias = train_mfips(
                    sess, model=model, data=self.data, train=train, val=val, test=test,
                    max_iters=self.max_iters, batch_size=self.batch_size,
                    model_name=self.model_name, seed=seed)
            else:
                # asymmetric tri-training uses three MF-IPS learners
                model = MFIPS(num_users=num_users, num_items=num_items,
                              dim=self.dim, lam=self.lam, eta=self.eta, num=0)
                model1 = MFIPS(num_users=num_users, num_items=num_items,
                               dim=self.dim, lam=self.lam, eta=self.eta, num=1)
                model2 = MFIPS(num_users=num_users, num_items=num_items,
                               dim=self.dim, lam=self.lam, eta=self.eta, num=2)
                _, mse, mae, u_emb, i_emb, i_bias = train_mfips_with_at(
                    sess, model=model, mfips1=model1, mfips2=model2, data=self.data,
                    train=train, val=val, test=test, epsilon=self.epsilon,
                    pre_iters=self.pre_iters, post_iters=self.post_iters, post_steps=self.post_steps,
                    batch_size=self.batch_size, model_name=self.model_name, seed=seed)
            results_mae.append(mae)
            results_mse.append(mse)
            # ranking quality on the test split from the learned embeddings
            ndcg = aoa_evaluator(user_embed=u_emb, item_embed=i_emb, item_bias=i_bias, test=test)
            results_ndcg.append(ndcg)
            print(f'#{seed+1} {self.model_name}: {np.round((time.time() - start) / 60, 2)} min')
        # aggregate and save the final results
        result_path = Path(f'../logs/{self.data}/{self.model_name}')
        result_path.mkdir(parents=True, exist_ok=True)
        # axis must be passed by keyword: positional axis in pd.concat was
        # deprecated and removed in pandas 2.0
        pd.concat([pd.DataFrame(results_mae, columns=['MAE']),
                   pd.DataFrame(results_mse, columns=['MSE']),
                   pd.DataFrame(results_ndcg, columns=['nDCG@3'])], axis=1)\
            .to_csv(str(result_path / 'results.csv'))
Example #2
0
def summarize_data_statistics() -> None:
    """Save dataset statistics with Tex Table Format.

    For each dataset, records #users, #items, #ratings, sparsity, the mean
    training/test rating, and the train-test KL divergence, and writes one
    '&'-separated row per dataset to ../paper_results/data_stat.csv.
    """
    stat_data_list = []
    Path('../paper_results').mkdir(exist_ok=True)
    for data in datasets:
        train, _, test, num_users, num_items = preprocess_datasets(data=data)
        num_data = train.shape[0]
        # builtin round() works for both NumPy scalars and plain Python
        # floats; the original `.round(4)` method call (and the misspelled
        # `spasity`) broke when the ratio was a native float.
        sparsity = f'{100 * round(num_data / (num_users * num_items), 4)}%'
        # column 2 holds the rating value — TODO confirm against preprocessor
        avg_train = train[:, 2].mean().round(3)
        avg_test = test[:, 2].mean().round(3)
        kl = calc_kl_div(train, test)
        stat_data = DataFrame(
            data=[num_users, num_items, num_data, sparsity, avg_train, avg_test, kl],
            index=stats_idx,
            columns=[data]).T
        stat_data_list.append(stat_data)
    pd.concat(stat_data_list).to_csv('../paper_results/data_stat.csv', sep='&')
Example #3
0
    def __call__(self, trial: Trial) -> float:
        """Calculate an objective value for one Optuna trial.

        Splits the data with a fixed seed, samples `dim`/`lam` (and `epsilon`
        for '-at' models) from the trial, trains the corresponding MF-IPS
        model, and returns its validation score.

        Args:
            trial: Optuna trial used to sample hyperparameters.

        Returns:
            The score reported by the training routine (to be minimized or
            maximized by the study).
        """
        train, val, test, num_users, num_items =\
            preprocess_datasets(data=self.data, seed=12345)

        # sample a set of hyperparameters.
        # `with` closes the config file deterministically (the original
        # leaked the handle returned by open()).
        with open('../config.yaml', 'r') as f:
            config = yaml.safe_load(f)
        eta = config['eta']
        max_iters = config['max_iters']
        batch_size = config['batch_size']
        pre_iters = config['pre_iters']
        post_iters = config['post_iters']
        post_steps = config['post_steps']
        dim = trial.suggest_discrete_uniform('dim', 5, 50, 5)
        lam = trial.suggest_loguniform('lam', 1e-6, 1)
        # epsilon is only defined (and only used) for '-at' models
        if '-at' in self.model_name:
            epsilon = trial.suggest_loguniform('epsilon', 1e-3, 1)

        # fresh graph + fixed seed so trials are comparable
        ops.reset_default_graph()
        tf.set_random_seed(12345)
        sess = tf.Session()
        if '-at' not in self.model_name:
            model = MFIPS(num_users=num_users, num_items=num_items, dim=dim, lam=lam, eta=eta)
            score, _, _, _, _, _ = train_mfips(
                sess, model=model, data=self.data, train=train, val=val, test=test,
                max_iters=max_iters, batch_size=batch_size, model_name=self.model_name)
        else:
            # asymmetric tri-training uses three MF-IPS learners
            model = MFIPS(num_users=num_users, num_items=num_items, dim=dim, lam=lam, eta=eta, num=0)
            model1 = MFIPS(num_users=num_users, num_items=num_items, dim=dim, lam=lam, eta=eta, num=1)
            model2 = MFIPS(num_users=num_users, num_items=num_items, dim=dim, lam=lam, eta=eta, num=2)
            score, _, _, _, _, _ = train_mfips_with_at(
                sess, model=model, mfips1=model1, mfips2=model2, data=self.data,
                train=train, val=val, test=test, epsilon=epsilon,
                pre_iters=pre_iters, post_iters=post_iters, post_steps=post_steps,
                batch_size=batch_size, model_name=self.model_name)

        return score
    def run_simulations(self, num_sims: int = 5) -> None:
        """Run `num_sims` training simulations and save all metrics.

        For each seed the datasets are re-split, a fresh TF graph/session is
        created, and one of three training routines is chosen from the model
        name: '-without_ipw', plain IPW, or IPW + asymmetric tri-training
        ('-at').  MSE/MAE and ranking metrics (nDCG/Recall/MAP @1/3/5) are
        aggregated into `../logs/{data}/{model_name}/results.csv`.

        Args:
            num_sims: number of independent simulations (seeds 0..num_sims-1).
        """
        # Create the output directory up front: logging.FileHandler below
        # raises FileNotFoundError if the directory does not yet exist (the
        # original only called mkdir at the very end, after logging started).
        result_path = Path(f'../logs/{self.data}/{self.model_name}')
        result_path.mkdir(parents=True, exist_ok=True)

        logger = logging.getLogger(__name__)  # Create a custom logger
        # Create logging handlers
        # NOTE(review): handlers accumulate if this method runs twice in one
        # process — consider clearing logger.handlers upstream.
        c_handler = logging.StreamHandler()
        f_handler = logging.FileHandler(result_path / 'simulations.log',
                                        mode='w')

        # Create logging formatters and add them to handlers
        c_format = logging.Formatter('%(message)s')
        f_format = logging.Formatter('%(message)s')
        c_handler.setFormatter(c_format)
        f_handler.setFormatter(f_format)

        logger.addHandler(c_handler)
        logger.addHandler(f_handler)

        results_mse = []
        results_mae = []
        results_ndcg_at1, results_ndcg_at3, results_ndcg_at5 = [], [], []
        results_recall_at1, results_recall_at3, results_recall_at5 = [], [], []
        results_map_at1, results_map_at3, results_map_at5 = [], [], []

        # start running simulations
        start = time.time()

        for seed in np.arange(num_sims):
            train, val, test, num_users, num_items =\
                preprocess_datasets(data=self.data, seed=seed)

            # fresh graph + fixed seed per run for reproducibility
            ops.reset_default_graph()
            tf.set_random_seed(seed)
            sess = tf.Session()

            if '-without_ipw' in self.model_name:
                logger.debug('*** Without IPW ***')

                # instantiating the model in the strategy scope creates the model on the TPU
                model = MFMODEL(num_users=num_users,
                                num_items=num_items,
                                dim=self.dim,
                                lam=self.lam,
                                eta=self.eta)
                _, mse, mae, u_emb, i_emb, u_bias, i_bias, g_bias = train_mfmodel_without_ipw(
                    sess,
                    model=model,
                    data=self.data,
                    train=train,
                    val=val,
                    test=test,
                    max_iters=self.max_iters,
                    batch_size=self.batch_size,
                    model_name=self.model_name,
                    seed=seed)

            elif '-at' not in self.model_name:
                logger.debug(
                    '*** With IPW and without Asymmetric Tri-training ***')

                # instantiating the model in the strategy scope creates the model on the TPU
                model = MFMODEL(num_users=num_users,
                                num_items=num_items,
                                dim=self.dim,
                                lam=self.lam,
                                eta=self.eta)
                _, mse, mae, u_emb, i_emb, u_bias, i_bias, g_bias = train_mfmodel(
                    sess,
                    model=model,
                    data=self.data,
                    train=train,
                    val=val,
                    test=test,
                    max_iters=self.max_iters,
                    batch_size=self.batch_size,
                    model_name=self.model_name,
                    seed=seed)
            else:
                logger.debug('*** With IPW and Asymmetric Tri-training ***')

                # asymmetric tri-training uses three learners
                model = MFMODEL(num_users=num_users,
                                num_items=num_items,
                                dim=self.dim,
                                lam=self.lam,
                                eta=self.eta,
                                num=0)
                model1 = MFMODEL(num_users=num_users,
                                 num_items=num_items,
                                 dim=self.dim,
                                 lam=self.lam,
                                 eta=self.eta,
                                 num=1)
                model2 = MFMODEL(num_users=num_users,
                                 num_items=num_items,
                                 dim=self.dim,
                                 lam=self.lam,
                                 eta=self.eta,
                                 num=2)

                _, mse, mae, u_emb, i_emb, u_bias, i_bias, g_bias = train_mfmodel_with_at(
                    sess,
                    model=model,
                    mfmodel1=model1,
                    mfmodel2=model2,
                    data=self.data,
                    train=train,
                    val=val,
                    test=test,
                    epsilon=self.epsilon,
                    pre_iters=self.pre_iters,
                    post_iters=self.post_iters,
                    post_steps=self.post_steps,
                    batch_size=self.batch_size,
                    model_name=self.model_name,
                    seed=seed)

            # After building a model, summarize metrics
            results_mae.append(mae)
            results_mse.append(mse)
            ranking_results_dic = aoa_evaluator_all_biases(
                user_embed=u_emb,
                item_embed=i_emb,
                user_bias=u_bias,
                item_bias=i_bias,
                global_bias=g_bias,
                test=test,
                model_name=self.model_name)

            results_ndcg_at1.append(ranking_results_dic['nDCG@1'])
            results_ndcg_at3.append(ranking_results_dic['nDCG@3'])
            results_ndcg_at5.append(ranking_results_dic['nDCG@5'])
            results_recall_at1.append(ranking_results_dic['Recall@1'])
            results_recall_at3.append(ranking_results_dic['Recall@3'])
            results_recall_at5.append(ranking_results_dic['Recall@5'])
            results_map_at1.append(ranking_results_dic['MAP@1'])
            results_map_at3.append(ranking_results_dic['MAP@3'])
            results_map_at5.append(ranking_results_dic['MAP@5'])
            logger.debug(
                f'#{seed+1} {self.model_name}: {np.round((time.time() - start) / 60, 2)} min'
            )

        # Aggregate and save the final results
        # (directory already created above; axis must be passed by keyword —
        # positional axis in pd.concat was deprecated and removed in pandas 2.0)
        pd.concat([pd.DataFrame(results_mae, columns=['MAE']),
                   pd.DataFrame(results_mse, columns=['MSE']),
                   pd.DataFrame(results_ndcg_at1, columns=['nDCG@1']),
                   pd.DataFrame(results_ndcg_at3, columns=['nDCG@3']),
                   pd.DataFrame(results_ndcg_at5, columns=['nDCG@5']),
                   pd.DataFrame(results_recall_at1, columns=['Recall@1']),
                   pd.DataFrame(results_recall_at3, columns=['Recall@3']),
                   pd.DataFrame(results_recall_at5, columns=['Recall@5']),
                   pd.DataFrame(results_map_at1, columns=['MAP@1']),
                   pd.DataFrame(results_map_at3, columns=['MAP@3']),
                   pd.DataFrame(results_map_at5, columns=['MAP@5']),
                   ], axis=1
                   )\
            .to_csv(str(result_path / 'results.csv'))
    def __call__(self, trial: Trial) -> float:
        """Calculate an objective value for one Optuna trial.

        Splits the data with a fixed seed, samples `dim`/`lam` (and `epsilon`
        for '-at' models) from the trial, trains the MF model selected by the
        model name, and returns its score.

        Args:
            trial: Optuna trial used to sample hyperparameters.

        Returns:
            The score reported by the training routine.
        """
        # Create the log directory up front: logging.FileHandler below raises
        # FileNotFoundError if the directory does not yet exist (the original
        # never created it here).
        log_path = Path(f'../logs/{self.data}/{self.model_name}')
        log_path.mkdir(parents=True, exist_ok=True)

        logger = logging.getLogger(__name__)  # Create a custom logger

        # Create logging handlers
        c_handler = logging.StreamHandler()
        f_handler = logging.FileHandler(log_path / 'simulations.log',
                                        mode='w')

        # Create logging formatters and add them to handlers
        c_format = logging.Formatter('%(message)s')
        f_format = logging.Formatter('%(message)s')
        c_handler.setFormatter(c_format)
        f_handler.setFormatter(f_format)

        logger.addHandler(c_handler)
        logger.addHandler(f_handler)

        # `rand_seed_val` is a module-level constant — TODO confirm it is set
        train, val, test, num_users, num_items =\
            preprocess_datasets(data=self.data, seed=rand_seed_val)

        # sample a set of hyperparameters.
        # `with` closes the config file deterministically (the original
        # leaked the handle returned by open()).
        with open('../config.yaml', 'r') as f:
            config = yaml.safe_load(f)
        eta = config['eta']
        max_iters = config['max_iters']
        batch_size = config['batch_size']
        pre_iters = config['pre_iters']
        post_iters = config['post_iters']
        post_steps = config['post_steps']
        dim = trial.suggest_discrete_uniform('dim', 5, 50, 5)
        lam = trial.suggest_loguniform('lam', 1e-6, 1)
        # epsilon is only defined (and only used) for '-at' models
        if '-at' in self.model_name:
            epsilon = trial.suggest_loguniform('epsilon', 1e-3, 1)

        # fresh graph + fixed seed so trials are comparable
        ops.reset_default_graph()
        tf.set_random_seed(rand_seed_val)
        sess = tf.Session()
        if '-without_ipw' in self.model_name:
            logger.debug('*** Without IPW ***')
            model = MFMODEL(num_users=num_users,
                            num_items=num_items,
                            dim=dim,
                            lam=lam,
                            eta=eta)
            score, _, _, _, _, _, _, _ = train_mfmodel_without_ipw(
                sess,
                model=model,
                data=self.data,
                train=train,
                val=val,
                test=test,
                max_iters=max_iters,
                batch_size=batch_size,
                model_name=self.model_name)
        elif '-at' not in self.model_name:
            logger.debug(
                '*** With IPW and without Asymmetric Tri-training ***')
            model = MFMODEL(num_users=num_users,
                            num_items=num_items,
                            dim=dim,
                            lam=lam,
                            eta=eta)
            score, _, _, _, _, _, _, _ = train_mfmodel(
                sess,
                model=model,
                data=self.data,
                train=train,
                val=val,
                test=test,
                max_iters=max_iters,
                batch_size=batch_size,
                model_name=self.model_name)
        else:
            logger.debug('*** With IPW and Asymmetric Tri-training ***')
            # asymmetric tri-training uses three learners
            model = MFMODEL(num_users=num_users,
                            num_items=num_items,
                            dim=dim,
                            lam=lam,
                            eta=eta,
                            num=0)
            model1 = MFMODEL(num_users=num_users,
                             num_items=num_items,
                             dim=dim,
                             lam=lam,
                             eta=eta,
                             num=1)
            model2 = MFMODEL(num_users=num_users,
                             num_items=num_items,
                             dim=dim,
                             lam=lam,
                             eta=eta,
                             num=2)
            score, _, _, _, _, _, _, _ = train_mfmodel_with_at(
                sess,
                model=model,
                mfmodel1=model1,
                mfmodel2=model2,
                data=self.data,
                train=train,
                val=val,
                test=test,
                epsilon=epsilon,
                pre_iters=pre_iters,
                post_iters=post_iters,
                post_steps=post_steps,
                batch_size=batch_size,
                model_name=self.model_name)

        return score
Example #6
0
"""
Code for summarizing experimental results for the paper
"Asymmetric Tri-training for Debiasing Missing-Not-At-Random Rating Feedback".
"""
import argparse
import yaml

from utils.preprocessor import preprocess_datasets
from utils.result_tools import (summarize_data_statistics,
                                summarize_experimental_results)

parser = argparse.ArgumentParser()
parser.add_argument('--datasets', '-d', type=str, nargs='*', required=True)

if __name__ == "__main__":
    args = parser.parse_args()
    # `with` closes the config file deterministically (the original leaked
    # the handle); num_sims is read for parity with the other scripts even
    # though this summarizer does not use it directly.
    with open('../config.yaml', 'rb') as f:
        num_sims = yaml.safe_load(f)['num_sims']

    summarize_data_statistics()
    for data in args.datasets:
        # run preprocessing for its side effects only; the returned splits
        # were unpacked into unused names in the original
        preprocess_datasets(data=data)
        summarize_experimental_results(data=data)