Ejemplo n.º 1
0
    def write_gp_model(cls,
                       gp_model,
                       method=SBO_METHOD,
                       n_samples_parameters=0,
                       name_model='gp_fitting_gaussian'):
        """
        Persist a GP model as JSON, e.g. after new points were added to it.

        The file is written to GP_DIR/<gp_model.problem_name>/ and its name is produced
        by cls._get_filename_modified.

        :param gp_model: gp model instance; its problem_name, type_kernel and
            training_name attributes and its serialize() method are used.
        :param method: (str) forwarded to the file name builder.
        :param n_samples_parameters: int, forwarded to the file name builder.
        :param name_model: (str) key into cls._model_map.
        """
        model_cls = cls._model_map[name_model]
        problem = gp_model.problem_name

        target_name = cls._get_filename_modified(
            model_cls, problem, gp_model.type_kernel, gp_model.training_name,
            method, n_samples_parameters)

        target_dir = path.join(GP_DIR, problem)
        if not os.path.exists(target_dir):
            # Only the problem directory itself is created here.
            os.mkdir(target_dir)

        destination = path.join(target_dir, target_name)
        JSONFile.write(gp_model.serialize(), destination)
    def add_point(self, point, model_objective_value):
        """
        Record a new point, evaluate the objective on it and persist the updated state.

        :param point: np.array(k)
        :param model_objective_value: float

        :return: float (optimal value), i.e. the first entry returned by
            self.evaluate_objective
        """
        self.evaluated_points.append(list(point))
        self.model_objective_values.append(model_objective_value)

        # A fresh list is passed so the stored point is not shared with the evaluator.
        outcome = self.evaluate_objective(
            self.module, list(point),
            n_samples=self.n_samples,
            objective_function=self.objective_function)

        objective_value = outcome[0]
        self.objective_values.append(objective_value)

        if self.noise:
            # The second entry is stored as the standard deviation of the evaluation.
            self.standard_deviation_evaluations.append(outcome[1])

        JSONFile.write(self.serialize(), self.file_path)

        return objective_value
Ejemplo n.º 3
0
def train_nn(model, n_epochs=20, name_model='a.json', random_seed=1):
    """
    Train `model` with SGD on one element of `train_dict` at a time, recording the
    accuracy on `train_test` before every update.

    The accuracies are stored per epoch and written as JSON at the end of each epoch
    to data/multi_start/neural_networks/training_results/<name_model>.
    The function relies on the module-level names `train_dict`, `train_test`,
    `args_opt` and `logger`.

    :param model: network; it is called on the inputs and must expose parameters().
    :param n_epochs: (int) number of passes over train_dict.
    :param name_model: (str) name of the results file.
    :param random_seed: (int) seed of the NumPy generator that shuffles the order in
        which the elements of train_dict are visited.
    """
    # Honour the caller's seed; it used to be hard-coded to 1, which made the
    # random_seed parameter a no-op.
    np.random.seed(random_seed)
    values = {}
    for epoch in range(1, n_epochs + 1):
        logger.info('epoch is %d' % epoch)
        values[epoch] = []
        # The learning rate decays as 0.1 / sqrt(epoch).
        optimizer = optim.SGD(model.parameters(), lr=(0.1 / np.sqrt(epoch)),
                              momentum=args_opt['momentum'])
        shuffled_order = np.arange(len(train_dict))
        np.random.shuffle(shuffled_order)
        for i in shuffled_order:
            # Percentage of correctly classified elements of train_test before this step.
            total = 0
            correct = 0
            for data in train_test:
                images, labels = data
                outputs = model(Variable(images))
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum()
            accuracy = 100. * correct / float(total)
            values[epoch].append(accuracy)

            # NOTE(review): the message says 'Error' but the logged value is the
            # percentage of correct predictions.
            logger.info('Error in epoch %d is:' % epoch)
            logger.info(accuracy)

            data, target = train_dict[i]
            data, target = Variable(data), Variable(target)
            optimizer.zero_grad()
            output = model(data)
            loss = F.nll_loss(output, target)
            loss.backward()
            optimizer.step()

        # Results are rewritten after every epoch so partial runs are kept.
        f_name = 'data/multi_start/neural_networks/training_results/'
        f_name += name_model
        JSONFile.write(values, f_name)
    def assign_categories(cls, list_papers, year, month):
        """
        Map every paper identifier to a category and write the mapping to the file
        named by cls._name_file_categories(year, month).

        Identifiers containing '/' carry their category before the slash; for the
        others the category is obtained from cls.get_cats. Everything from the last
        'v' on is dropped from the identifier before the lookup.

        :param list_papers: [str]
        :param year: forwarded to cls._name_file_categories
        :param month: forwarded to cls._name_file_categories
        :return: {paper_name (str):  category (str)}
        """
        papers = {}
        for paper in list_papers:
            prefix, slash, remainder = paper.partition('/')
            old_style = bool(slash)
            arxiv_id = remainder if old_style else paper

            version_at = arxiv_id.rfind('v')
            if version_at != -1:
                arxiv_id = arxiv_id[:version_at]

            if old_style:
                category = prefix
            else:
                category = cls.get_cats(arxiv_id, arxiv_id[0:2], arxiv_id[2:4])

            papers[paper] = category

        target = cls._name_file_categories(year=year, month=month)
        JSONFile.write(papers, target)
        return papers
    def save_data(self, sufix=None):
        """
        Write the chosen points, evaluations, parameters and chosen indexes as JSON
        under data/multi_start/<problem_name>/hutter_greedy_policy/, then ask every
        model in self.dict_stat_models to save itself.

        :param sufix: (str) base name of the file; defaults to self.name_model.
        """
        payload = {
            'chosen_points': self.chosen_points,
            'evaluations': self.evaluations_obj,
            'parameters': self.parameters,
            'chosen_index': self.chosen_index,
        }

        problem_dir = 'data/multi_start/' + self.problem_name + '/'
        if sufix is None:
            sufix = self.name_model
        if not os.path.exists(problem_dir):
            os.mkdir(problem_dir)

        policy_dir = problem_dir + 'hutter_greedy_policy/'
        if not os.path.exists(policy_dir):
            os.mkdir(policy_dir)

        # The separator is added again here, so the final path contains '//'.
        target = policy_dir + '/' + sufix
        if self.random_seed is not None:
            target += '_random_seed_' + str(self.random_seed)
        if self.n_restarts is not None:
            target += '_n_restarts_' + str(self.n_restarts)

        JSONFile.write(payload, target + '.json')

        for key in self.dict_stat_models:
            self.dict_stat_models[key].save_model(str(key))
Ejemplo n.º 6
0
    def save_data(self, sufix=None):
        """
        Write the chosen points, evaluations and chosen indexes as JSON under
        data/multi_start/<problem_name>/random_policy/.

        :param sufix: (str) base name of the file; defaults to self.name_model.
        """
        payload = {
            'chosen_points': self.chosen_points,
            'evaluations': self.evaluations_obj,
            'chosen_index': self.chosen_index,
        }

        problem_dir = 'data/multi_start/' + self.problem_name + '/'
        if sufix is None:
            sufix = self.name_model
        if not os.path.exists(problem_dir):
            os.mkdir(problem_dir)

        policy_dir = problem_dir + 'random_policy' + '/'
        if not os.path.exists(policy_dir):
            os.mkdir(policy_dir)

        target = policy_dir + sufix
        if self.random_seed is not None:
            target += '_random_seed_' + str(self.random_seed)
        if self.n_restarts is not None:
            target += '_n_restarts_' + str(self.n_restarts)

        JSONFile.write(payload, target + '.json')
Ejemplo n.º 7
0
    def top_users_papers_selecting_categories(cls,
                                              year,
                                              month,
                                              top_categories=10,
                                              different_papers=20):
        """
        Restrict the papers and users of a month to the top categories and write the
        result to the file named by cls._name_file_final_categ.

        A category is kept when it is among the last `top_categories` entries both of
        the per-category user counts and of the per-category paper counts (each sorted
        in increasing order). A user is kept when more than `different_papers` of its
        papers fall in the kept categories.

        :param year:
        :param month:
        :param top_categories: (int)
        :param different_papers: (int)
        :return: [ {'paper': (int) number of times seen},
            {'user': {'stats': ((int) # entries, (int) # different papers in the top_n papers),
                      'diff_papers': [str]
                }
            }
        ]
        """
        categories = JSONFile.read(
            cls._name_file_categories(year=year, month=month))
        paper_frame = pd.DataFrame.from_records([categories]).transpose()

        _, users_per_category = cls.assign_categories_to_users(year, month)
        user_frame = pd.DataFrame.from_records([users_per_category]).transpose()

        by_users = user_frame[0].sort_values().index.values[-top_categories:]
        by_papers = paper_frame[0].value_counts().sort_values().index.values[
            -top_categories:]
        top_cat = set(by_users).intersection(set(by_papers))

        full_data = JSONFile.read(cls._name_file_final(year=year, month=month))
        all_papers = full_data[0]
        all_users = full_data[1]

        papers_new = {
            paper: all_papers[paper]
            for paper in all_papers if categories[paper] in top_cat
        }

        users_new = {}
        for user in all_users:
            record = all_users[user]
            kept = [
                paper for paper in record['diff_papers']
                if categories[paper] in top_cat
            ]
            if len(kept) > different_papers:
                # The stored record is reused with its paper list narrowed down.
                record['diff_papers'] = kept
                users_new[user] = record

        JSONFile.write([papers_new, users_new],
                       cls._name_file_final_categ(year=year, month=month))

        logger.info('Number of papers is %d' % len(papers_new))
        logger.info('Number of users is %d' % len(users_new))

        return [papers_new, users_new]
    def accuracy(self,
                 gp_model,
                 start=3,
                 iterations=21,
                 sufix=None,
                 model=None):
        """
        Track the posterior mean and confidence interval of the model as observations
        are added, and write them (with the observed values) to
        data/multi_start/accuracy_results/<problem_name>/<sufix>.json.

        The results file is rewritten at the end of every iteration of the loop.

        :param gp_model: model whose raw_results are extended with new observations.
        :param start: (int) first iteration index; also the key of the initial entry.
        :param iterations: (int) the loop runs over range(start, iterations).
        :param sufix: (str) file name; defaults to self.specifications.
        :param model: not used in this function.
        :return: (means, cis, values_observed), dictionaries keyed by iteration.
        """
        #TODO: UPDATE THIS FUNCTION. NOW IT'S WRONG!!
        means = {}
        cis = {}
        values_observed = {}

        # Posterior before any new observation is added.
        mean, std, ci = self.compute_posterior_params_marginalize(gp_model)
        means[start] = mean
        cis[start] = ci
        values_observed[start] = gp_model.raw_results['values'][-1]

        for i in range(start, iterations):
            print(i)
            # NOTE(review): raw_results is indexed with 'values' above, so len() here
            # presumably counts its keys rather than the observations -- confirm.
            if len(gp_model.raw_results) < i + 1:
                data_new = self.get_value_next_iteration(i + 1, **self.kwargs)
                self.add_observations(gp_model, i + 1, data_new['value'],
                                      data_new['point'], data_new['gradient'])
            mean, std, ci = self.compute_posterior_params_marginalize(gp_model)
            means[i + 1] = mean
            cis[i + 1] = ci
            # data_new is only assigned inside the branch above: it is unbound if the
            # branch is skipped on the first pass, and stale if skipped later.
            values_observed[i + 1] = data_new['value']
            print mean, ci
            value_tmp = self.get_value_next_iteration(i + 1, **self.kwargs)
            print value_tmp

            accuracy_results = {}
            accuracy_results['means'] = means
            accuracy_results['ci'] = cis
            accuracy_results['values_observed'] = values_observed
            # This value is overwritten below before it is used.
            file_name = 'data/multi_start/accuracy_results/stat_model'

            if not os.path.exists('data/multi_start'):
                os.mkdir('data/multi_start')

            if not os.path.exists('data/multi_start/accuracy_results'):
                os.mkdir('data/multi_start/accuracy_results/')

            if not os.path.exists('data/multi_start/accuracy_results/' +
                                  self.problem_name):
                os.mkdir('data/multi_start/accuracy_results/' +
                         self.problem_name)

            if sufix is None:
                sufix = self.specifications
            file_name = 'data/multi_start/accuracy_results/' + self.problem_name + '/' + sufix

            JSONFile.write(accuracy_results, file_name + '.json')

        return means, cis, values_observed
Ejemplo n.º 9
0
    def get_training_data(cls, year, month, random_seed=1):
        """
        Creates a file with the training data:
            [[user_id, paper_id, rating]], where rating is 1 if the paper wasn't seen by the user,
            or 2 otherwise.

        For every user, all of its papers are added with rating 2, followed by a random
        sample of the remaining papers with rating 1. The size of that sample is drawn
        uniformly between 0.5 and 1.8 times the number of papers the user has seen,
        capped by the number of remaining papers. User and paper ids are 1-based
        positions in the data read from cls._name_file_final_categ.

        :param year: str
        :param month: str (e.g. '1', '12')
        :param random_seed: int, seeds both the shuffle and the draw of the sample size.

        """
        random.seed(random_seed)
        # The sample sizes come from NumPy, which random.seed does not cover; a
        # dedicated generator makes them reproducible without touching global state.
        rng = np.random.RandomState(random_seed)

        file_name = cls._name_file_final_categ(year=year, month=month)
        data = JSONFile.read(file_name)

        papers = list(data[0].keys())
        users_data = data[1]

        key_paper = {}
        for i, paper in enumerate(papers):
            key_paper[paper] = i + 1

        all_papers = set(papers)
        training_data = []

        for i, user in enumerate(users_data):
            user_id = i + 1
            seen = users_data[user]['diff_papers']
            for paper in seen:
                training_data.append([user_id, key_paper[paper], 2])

            seen_set = set(seen)
            # Sorted so that the sample does not depend on set iteration order.
            other_papers = sorted(all_papers - seen_set)
            # random.shuffle needs a mutable sequence (range() is not one on Python 3).
            index_papers = list(range(len(other_papers)))
            random.shuffle(index_papers)
            seen_papers = len(seen_set)

            low = int(0.5 * seen_papers)
            high = min(int(1.8 * seen_papers), len(index_papers))
            if low < high:
                n_dislike = int(rng.randint(low, high))
            else:
                # randint rejects an empty range; fall back to the cap itself.
                n_dislike = high

            for index in index_papers[0:n_dislike]:
                training_data.append(
                    [user_id, key_paper[other_papers[index]], 1])

        file_name = cls._name_training_data(year=year, month=month)

        logger.info('There are %d training points' % len(training_data))
        JSONFile.write(training_data, file_name)
Ejemplo n.º 10
0
    def accuracy(self,
                 gp_model,
                 start=3,
                 iterations=21,
                 sufix=None,
                 model=None):
        """
        Track the posterior mean and confidence interval of the model as observations
        are added, and write them to
        data/multi_start/accuracy_results/stat_model[_<problem_name>][_<sufix>].json.

        :param gp_model: model whose raw_results are extended with new observations.
        :param start: (int) first iteration index; also the key of the initial entry.
        :param iterations: (int) the loop runs over range(start, iterations).
        :param sufix: (str) optional suffix of the file name.
        :param model: not used in this function.
        :return: (means, cis), dictionaries keyed by iteration.
        """
        means = {}
        cis = {}

        # Posterior before any new observation is added.
        mean, std, ci = self.compute_posterior_params_marginalize(gp_model)
        means[start] = mean
        cis[start] = ci

        for i in range(start, iterations):
            print(i)
            if len(gp_model.raw_results) < i + 1:
                data_new = self.get_value_next_iteration(i + 1, **self.kwargs)
                self.add_observations(gp_model, i + 1, data_new['value'],
                                      data_new['point'], data_new['gradient'])
            mean, std, ci = self.compute_posterior_params_marginalize(gp_model)
            means[i + 1] = mean
            cis[i + 1] = ci

            # Single-argument print calls produce the same output on Python 2 and 3,
            # unlike the former `print a, b` statements.
            print('%s %s' % (mean, ci))
            value_tmp = self.get_value_next_iteration(i + 1, **self.kwargs)
            print(value_tmp)

        accuracy_results = {}
        accuracy_results['means'] = means
        accuracy_results['ci'] = cis
        file_name = 'data/multi_start/accuracy_results/stat_model'

        if not os.path.exists('data/multi_start'):
            os.mkdir('data/multi_start')

        if not os.path.exists('data/multi_start/accuracy_results'):
            os.mkdir('data/multi_start/accuracy_results')

        if self.problem_name is not None:
            file_name += '_' + self.problem_name

        if sufix is not None:
            file_name += '_' + sufix

        JSONFile.write(accuracy_results, file_name + '.json')

        return means, cis
    def get_points_domain(cls,
                          n_training,
                          bounds_domain,
                          random_seed,
                          training_name,
                          problem_name,
                          type_bounds=None,
                          simplex_domain=None):
        """
        Return random points in the domain, reading them from file when available.

        The points are looked up in PROBLEM_DIR/<problem_name>/data/; if the file does
        not yield any, they are generated with DomainService.get_points_domain, printed
        and written to that file.

        :param n_training: (int) Number of points
        :param bounds_domain: [([float, float] or [float])], the first case is when the bounds are
            lower or upper bound of the respective entry; in the second case, it's list of finite
            points representing the domain of that entry.
        :param random_seed: (int)
        :param training_name: (str), prefix used to save the training data.
        :param problem_name: str
        :param type_bounds: [0 or 1], 0 if the bounds are lower or upper bound of the respective
            entry, 1 if the bounds are all the finite options for that entry.
        :param simplex_domain: forwarded to DomainService.get_points_domain.
        :return: [[float]]
        """
        file_name = cls._filename_domain(problem_name=problem_name,
                                         training_name=training_name,
                                         n_points=n_training,
                                         random_seed=random_seed)
        training_path = path.join(PROBLEM_DIR, problem_name, 'data', file_name)

        stored = JSONFile.read(training_path)
        if stored is not None:
            return stored

        generated = DomainService.get_points_domain(
            n_training,
            bounds_domain,
            type_bounds=type_bounds,
            random_seed=random_seed,
            simplex_domain=simplex_domain)
        print(generated)
        JSONFile.write(generated, training_path)

        return generated
Ejemplo n.º 12
0
    def save_model(self, sufix=None):
        """
        Write the state of the statistical model as JSON to
        data/multi_start/stat_model[_<problem_name>][_<sufix>].json.

        :param sufix: (str) optional suffix of the file name.
        """
        snapshot = {
            'current_point': self.current_point,
            'starting_point': self.starting_point,
            'current_batch_index': self.current_batch_index,
            'best_result': self.gp_model.best_result,
            'current_iteration': self.gp_model.current_iteration,
            'raw_results': self.gp_model.raw_results,
        }

        target = 'data/multi_start/stat_model'
        for part in (self.problem_name, sufix):
            if part is not None:
                target += '_' + part

        JSONFile.write(snapshot, target + '.json')
Ejemplo n.º 13
0
    def load_discretization(cls, problem_name, bounds_domain_x,
                            number_points_each_dimension_x):
        """
        Try to load discretization for problem_name from file. If the file doesn't exist, will
        generate the discretization and store it.

        The file lives in PROBLEM_DIR/<problem_name>/DOMAIN_DIR/; missing directories
        on that path are created.

        :param problem_name: (str)
        :param bounds_domain_x: ([BoundsEntity])
        :param number_points_each_dimension_x: ([int])

        :return: [[float]]
        """

        bounds_str = BoundsEntity.get_bounds_as_lists(bounds_domain_x)

        filename = cls._disc_x_filename(
            name=problem_name,
            bounds=bounds_str,
            number_points_each_dimension=number_points_each_dimension_x)

        domain_dir = path.join(PROBLEM_DIR, problem_name, DOMAIN_DIR)

        # makedirs also creates missing parents (PROBLEM_DIR, the problem directory),
        # where chained os.mkdir calls would raise.
        if not os.path.exists(domain_dir):
            os.makedirs(domain_dir)

        domain_path = path.join(domain_dir, filename)

        discretization_data = JSONFile.read(domain_path)
        if discretization_data is not None:
            return discretization_data

        logger.info('Generating discretization of domain_x')
        discretization_data = DomainEntity.discretize_domain(
            bounds_domain_x, number_points_each_dimension_x)
        logger.info('Generated discretization of domain_x')

        JSONFile.write(discretization_data, domain_path)

        return discretization_data
Ejemplo n.º 14
0
    def write_debug_data(self, problem_name, model_type, training_name,
                         n_training, random_seed, n_samples_parameters,
                         **kwargs):
        """
        Write self.optimization_results as JSON to DEBUGGING_DIR/<problem_name>/.

        The file name is built by self._filename from the arguments and from the
        kernel names of self.gp joined with '_'.

        :param problem_name: (str)
        :param model_type: (str)
        :param training_name: (str)
        :param n_training: (int)
        :param random_seed: (int)
        :param n_samples_parameters: int
        """
        debug_dir = path.join(DEBUGGING_DIR, problem_name)
        for directory in (DEBUGGING_DIR, debug_dir):
            if not os.path.exists(directory):
                os.mkdir(directory)

        kernel_name = '_'.join(self.gp.type_kernel)

        f_name = self._filename(
            model_type=model_type,
            problem_name=problem_name,
            type_kernel=kernel_name,
            training_name=training_name,
            n_training=n_training,
            random_seed=random_seed,
            n_samples_parameters=n_samples_parameters)

        JSONFile.write(self.optimization_results, path.join(debug_dir, f_name))
    def accuracy(self, gp_model, start=3, iterations=21, sufix=None):
        """
        Track the posterior mean and confidence interval of the model as observations
        are added, and write them to
        data/multi_start/accuracy_results/parametric_model[_<problem_name>][_<sufix>].json.

        :param gp_model: model whose raw_results are extended with new observations.
        :param start: (int) first iteration index.
        :param iterations: (int) the loop runs over range(start, iterations).
        :param sufix: (str) optional suffix of the file name.
        :return: (means, cis), lists with one entry for the initial state followed by
            one entry per iteration.
        """
        initial_mean, _, initial_ci = self.compute_posterior_params_marginalize(
            gp_model)
        means = [initial_mean]
        cis = [initial_ci]

        for iteration in range(start, iterations):
            print(iteration)
            upcoming = iteration + 1
            if len(gp_model.raw_results) <= iteration:
                self.add_observations(gp_model, upcoming,
                                      self.get_value_next_iteration(upcoming))
            current_mean, _, current_ci = \
                self.compute_posterior_params_marginalize(gp_model)
            means.append(current_mean)
            cis.append(current_ci)

        results = {'means': means, 'ci': cis}

        for directory in ('data/multi_start',
                          'data/multi_start/accuracy_results'):
            if not os.path.exists(directory):
                os.mkdir(directory)

        target = 'data/multi_start/accuracy_results/parametric_model'
        if self.problem_name is not None:
            target += '_' + self.problem_name
        if sufix is not None:
            target += '_' + sufix

        JSONFile.write(results, target + '.json')

        return means, cis
Ejemplo n.º 16
0
    def assign_categories_to_users(cls, year, month):
        """
        Attach to every user the categories of its papers and count, per category,
        the users for which it accounts for at least 10% of their papers.

        Both results are written as JSON: the per-user categories to the file named by
        cls._name_file_categories_users and the per-category counts to the one named by
        cls._name_file_categories_users_hist.

        :param year: forwarded to the file name builders
        :param month: forwarded to the file name builders
        :return: ({user: [category]}, {category: (int) number of users})
        """
        users = JSONFile.read(cls._name_file_final(year=year, month=month))[1]
        paper_cat = JSONFile.read(
            cls._name_file_categories(year=year, month=month))

        users_cg = {}
        for user in users:
            users_cg[user] = [
                paper_cat[paper] for paper in users[user]['diff_papers']
            ]

        JSONFile.write(users_cg,
                       cls._name_file_categories_users(year=year, month=month))

        user_cat = {}
        for user in users_cg:
            user_categories = users_cg[user]

            # How often each category appears among the papers of this user.
            counts = {}
            for cat in user_categories:
                counts[cat] = counts.get(cat, 0) + 1

            threshold = 0.10 * len(user_categories)
            for cat in counts:
                if counts[cat] >= threshold:
                    user_cat[cat] = user_cat.get(cat, 0) + 1

        JSONFile.write(
            user_cat,
            cls._name_file_categories_users_hist(year=year, month=month))
        return users_cg, user_cat
from __future__ import absolute_import

from problems.cnn_cifar10.cnn import train_nn

import argparse
from stratified_bayesian_optimization.util.json_file import JSONFile

if __name__ == '__main__':
    # Trains the network for a given random seed and stores what train_nn returns.
    # Example usage:
    # python -m problems.cnn_cifar10.scripts.run_cnn 1 1

    cli = argparse.ArgumentParser()
    for argument in ('random_seed', 'n_epochs'):
        cli.add_argument(argument, help='e.g. 2')
    options = cli.parse_args()

    seed = int(options.random_seed)
    epochs = int(options.n_epochs)

    results = train_nn(seed, epochs)

    # One results file per random seed.
    output_file = 'problems/cnn_cifar10/runs_random_seeds/' + 'rs_%d' % seed + '.json'
    JSONFile.write(results, output_file)
 def test_write(self):
     """JSONFile.write runs against a mocked builtin open, so no file is created."""
     fake_open = mock_open()
     # '__builtin__' is the Python 2 name of the builtins module.
     with patch('__builtin__.open', fake_open):
         JSONFile.write([], self.filename)
Ejemplo n.º 19
0
    def get_gp(cls,
               name_model,
               problem_name,
               type_kernel,
               dimensions,
               bounds_domain,
               type_bounds=None,
               n_training=0,
               noise=False,
               training_data=None,
               points=None,
               training_name=None,
               mle=True,
               thinning=0,
               n_burning=0,
               max_steps_out=1,
               n_samples=None,
               random_seed=DEFAULT_RANDOM_SEED,
               kernel_values=None,
               mean_value=None,
               var_noise_value=None,
               cache=True,
               same_correlation=False,
               use_only_training_points=True,
               optimization_method=None,
               n_samples_parameters=0,
               parallel_training=True,
               simplex_domain=None,
               objective_function=None,
               define_samplers=True):
        """
        Fetch a GP model from file if it exists, otherwise train a new model and save it locally.

        NOTE(review): the data read from the cache file is currently discarded (see the
        body), so a new model is always trained -- confirm whether this is intended.

        :param name_model: str
        :param problem_name: str
        :param type_kernel: [(str)] Must be in possible_kernels. If it's a product of kernels it
            should be a list as: [PRODUCT_KERNELS_SEPARABLE, NAME_1_KERNEL, NAME_2_KERNEL]
        :param dimensions: [int]. It has only the n_tasks for the task_kernels, and for the
            PRODUCT_KERNELS_SEPARABLE contains the dimensions of every kernel in the product
        :param bounds_domain: [([float, float] or [float])], the first case is when the bounds are
            lower or upper bound of the respective entry; in the second case, it's list of finite
            points representing the domain of that entry.
        :param type_bounds: [0 or 1], 0 if the bounds are lower or upper bound of the respective
            entry, 1 if the bounds are all the finite options for that entry.
        :param n_training: int
        :param noise: (boolean) If true, we get noisy evaluations.
        :param training_data: {'points': [[float]], 'evaluations': [float],
            'var_noise': [float] or None}
        :param points: [[float]]. If training_data is None, we can evaluate the objective
            function in these points.
        :param training_name: (str), prefix used to save the training data.
        :param mle: (boolean) If true, fits the GP by MLE.
        :param thinning: (int)
        :param n_burning: (int) Number of burnings samples for the MCMC.
        :param max_steps_out: (int)  Maximum number of steps out for the stepping out  or
                doubling procedure in slice sampling.
        :param n_samples: (int) If the objective is noisy, we take n_samples of the function to
            estimate its value.
        :param random_seed: (int)
        :param kernel_values: [float], contains the default values of the parameters of the kernel
        :param mean_value: [float], It contains the value of the mean parameter.
        :param var_noise_value: [float], It contains the variance of the noise of the model
        :param cache: (boolean) Try to get model from cache
        :param same_correlation: (boolean) If true, it uses the same correlations for the task
            kernel.
        :param use_only_training_points (boolean) If the model is read, and the param is true,
            it uses only the training points in data. Otherwise, it also includes new points
            previously computed.
        :param optimization_method: (str)
        :param n_samples_parameters: (int)
        :param parallel_training: (boolean)
        :param simplex_domain: forwarded to the training data service and to the trainer.
        :param objective_function: forwarded to the training data service.
        :param define_samplers: (boolean) If False, samplers for the hyperparameters are not
            defined.

        :return: (GPFittingGaussian) - An instance of GPFittingGaussian
        """
        model_type = cls._model_map[name_model]

        if training_name is None:
            training_name = 'default_training_data_%d_points_rs_%d' % (
                n_training, random_seed)

        # The "modified" file name is needed in both modes: it is always handed to the
        # training data service as gp_path_cache. It used to be defined only when
        # use_only_training_points was true, which raised a NameError otherwise.
        f_name_cache = cls._get_filename_modified(model_type, problem_name,
                                                  type_kernel, training_name,
                                                  optimization_method,
                                                  n_samples_parameters)

        if use_only_training_points:
            f_name = cls._get_filename(model_type, problem_name, type_kernel,
                                       training_name)
        else:
            f_name = f_name_cache

        if not os.path.exists('data'):
            os.mkdir('data')

        if not os.path.exists(GP_DIR):
            os.mkdir(GP_DIR)

        gp_dir = path.join(GP_DIR, problem_name)

        if not os.path.exists(gp_dir):
            os.mkdir(gp_dir)

        gp_path = path.join(gp_dir, f_name)

        gp_path_cache = path.join(gp_dir, f_name_cache)

        if cache:
            data = JSONFile.read(gp_path)
            # NOTE(review): the cached data is thrown away here, so the early return
            # below is never taken; behaviour kept as found.
            data = None
        else:
            data = None

        if data is not None:
            return model_type.deserialize(
                data, use_only_training_points=use_only_training_points)

        if training_data is None or training_data == {}:
            training_data = TrainingDataService.get_training_data(
                problem_name,
                training_name,
                bounds_domain,
                n_training=n_training,
                points=points,
                noise=noise,
                n_samples=n_samples,
                random_seed=random_seed,
                type_bounds=type_bounds,
                cache=cache,
                parallel=parallel_training,
                gp_path_cache=gp_path_cache,
                simplex_domain=simplex_domain,
                objective_function=objective_function)

        logger.info("Training %s" % model_type.__name__)

        gp_model = model_type.train(type_kernel,
                                    dimensions,
                                    mle,
                                    training_data,
                                    bounds_domain,
                                    thinning=thinning,
                                    n_burning=n_burning,
                                    max_steps_out=max_steps_out,
                                    random_seed=random_seed,
                                    type_bounds=type_bounds,
                                    training_name=training_name,
                                    problem_name=problem_name,
                                    kernel_values=kernel_values,
                                    mean_value=mean_value,
                                    var_noise_value=var_noise_value,
                                    same_correlation=same_correlation,
                                    simplex_domain=simplex_domain,
                                    define_samplers=define_samplers)

        JSONFile.write(gp_model.serialize(), gp_path)

        return gp_model
Ejemplo n.º 20
0
from stratified_bayesian_optimization.kernels.matern52 import Matern52
from stratified_bayesian_optimization.lib.sample_functions import SampleFunctions

# Samples a function from a GP with a Matern52 kernel on a regular grid and stores
# one shifted copy of it per task.
decimals = 10
random_seed = 5
np.random.seed(random_seed)
n_points = 1000
# Regular grid on [0, 100], rounded and reshaped to a column vector.
points = np.linspace(0, 100, n_points)
points = np.round(points, decimals=decimals)
points = points.reshape([n_points, 1])

tasks = np.array([[0, 1]])

# Constant added to the sampled function for each task.
add = [10, -10]
kernel = Matern52.define_kernel_from_array(1, np.array([100.0, 1.0]))
function = SampleFunctions.sample_from_gp(points, kernel)
function = function[0, :]

final_function = {}

for task in range(2):
    # range() works on Python 2 and 3; the concatenated (point, task) vector that was
    # built on every iteration was never used and is no longer computed.
    final_function[task] = [
        function[i] + add[task] for i in range(n_points)
    ]

filename = path.join('problems', 'test_simulated_gp',
                     'simulated_function_with_%d_%d' % (n_points, random_seed))

JSONFile.write({'function': final_function, 'points': points}, filename)
Ejemplo n.º 21
0
    def generate_evaluations(self,
                             problem_name,
                             model_type,
                             training_name,
                             n_training,
                             random_seed,
                             iteration,
                             n_points_by_dimension=None,
                             n_tasks=0):
        """
        Evaluate the acquisition function on a grid of points and write both the grid and the
        evaluations in the debug directory.

        The grid is read from the debug directory when it was already written there; otherwise
        it is built as the cartesian product of evenly spaced values on each bound and written.

        :param problem_name: (str)
        :param model_type: (str)
        :param training_name: (str)
        :param n_training: (int)
        :param random_seed: (int)
        :param iteration: (int)
        :param n_points_by_dimension: [int] Number of points by dimension. If None, every
            dimension uses 10 * (length of the first bound) points.
        :param n_tasks: (int) n_tasks > 0 if the last element of the domain is a task

        :return: np.array with one acquisition value per evaluated point
        """

        if not os.path.exists(DEBUGGING_DIR):
            os.mkdir(DEBUGGING_DIR)

        debug_dir = path.join(DEBUGGING_DIR, problem_name)

        if not os.path.exists(debug_dir):
            os.mkdir(debug_dir)

        kernel_name = '_'.join(self.gp.type_kernel)

        f_name = self._filename_points_ei_evaluations(
            model_type=model_type,
            problem_name=problem_name,
            type_kernel=kernel_name,
            training_name=training_name,
            n_training=n_training,
            random_seed=random_seed)

        debug_path = path.join(debug_dir, f_name)

        vectors = JSONFile.read(debug_path)

        if vectors is None:
            bounds = self.gp.bounds
            n_points = n_points_by_dimension
            if n_points is None:
                # The default used to be a bare scalar, which cannot be zipped with the bounds
                # below; replicate it so there is one integer count per dimension.
                default_n_points = int((bounds[0][1] - bounds[0][0]) * 10)
                n_points = len(bounds) * [default_n_points]

            if n_tasks > 0:
                # The last entry of the domain is the task, so it is not part of the grid.
                bounds_x = [bounds[i] for i in xrange(len(bounds) - 1)]
                n_points_x = [n_points[i] for i in xrange(len(n_points))]
            else:
                n_points_x = n_points
                bounds_x = bounds

            points = []
            for bound, number_points in zip(bounds_x, n_points_x):
                points.append(np.linspace(bound[0], bound[1], number_points))

            vectors = list(itertools.product(*points))

            JSONFile.write(vectors, debug_path)

        n = len(vectors)
        points_ = deepcopy(vectors)

        vectors = np.array(vectors)

        if n_tasks > 0:
            # Stack one copy of the grid per task, with the task index as the last column.
            # NOTE(review): points_ is reassigned here, so when n_tasks > 0 the 'points' written
            # at the end are the grid for the last task only, not the copy made above -- confirm
            # this is the intended content of the debug file.
            vectors_ = None
            for i in xrange(n_tasks):
                task_vector = np.zeros(n) + i
                task_vector = task_vector.reshape((n, 1))
                points_ = np.concatenate((vectors, task_vector), axis=1)

                if vectors_ is not None:
                    vectors_ = np.concatenate((vectors_, points_), axis=0)
                else:
                    vectors_ = points_
            vectors = vectors_

        # TODO: extend to the case where w can be continuous

        n = vectors.shape[0]

        points = {}
        for i in xrange(n):
            points[i] = vectors[i, :]

        args = (
            False,
            None,
            False,
            0,
            self,
        )
        val = Parallel.run_function_different_arguments_parallel(
            wrapper_objective_acquisition_function, points, *args)

        values = np.zeros(n)
        for i in xrange(n):
            values[i] = val.get(i)

        f_name = self._filename_ei_evaluations(iteration=iteration,
                                               model_type=model_type,
                                               problem_name=problem_name,
                                               type_kernel=kernel_name,
                                               training_name=training_name,
                                               n_training=n_training,
                                               random_seed=random_seed)

        debug_path = path.join(debug_dir, f_name)

        JSONFile.write({'points': points_, 'evaluations': values}, debug_path)

        return values
Ejemplo n.º 22
0
    rs_2 = len(spec_2['random_seeds'])

    for key in keys:
        values_1 = None
        values_2 = None
        if key in spec_1:
            values_1 = spec_1[key]
        if key in spec_2:
            values_2 = spec_2[key]
        if values_1 is None:
            values_1 = rs_1 * [None]

        if values_2 is None:
            values_2 = rs_2 * [None]

        new_spec[key] = []
        for i in xrange(max(len(values_1), len(values_2))):
            if i < len(values_1):
                value_1 = values_1[i]
                new_spec[key] += [value_1]
            if i < len(values_2):
                value_2 = values_2[i]
                new_spec[key] += [value_2]

    # for key in spec_1:
    #     value_1 = spec_1[key]
    #     value_2 = spec_2[key]
    #     new_spec[key] = value_1 + value_2

    JSONFile.write(new_spec, path.join(MULTIPLESPECS_DIR, output))
    def get_click_data(cls, filenames, store_filename):
        """
          Get click data from filenames. Writes a JSON file containing the list
          [process_data, paper], where:

          process_data = {cookie_hash: {arxiv_id: number of entries of that user for the paper}}
          paper = {arxiv_id: {'views': total number of entries for the paper}}

          Only entries that have both 'arxiv_id' and 'cookie_hash' are counted. The arxiv ids
          are used exactly as they appear in the entries.

          The output is rewritten after every processed file, and the list of processed files
          is kept in "problems/arxiv/data/store_files.json".

          :param filenames: [str] gzip-compressed JSON files with an 'entries' list
          :param store_filename: str

          """
        paper = {}

        process_data = {}

        process_files = []
        store_files = "problems/arxiv/data/store_files.json"

        for filename in filenames:
            logger.info("Processing filename: %s" % filename)

            # Close the gzip handle even if the JSON is malformed; it used to be left open.
            f = gzip.open(filename, 'rb')
            try:
                data = json.load(f)
            finally:
                f.close()

            for entry in data['entries']:
                if 'arxiv_id' not in entry or 'cookie_hash' not in entry:
                    continue

                arxiv_id = entry['arxiv_id']
                user = entry['cookie_hash']

                paper.setdefault(arxiv_id, {'views': 0})['views'] += 1

                user_views = process_data.setdefault(user, {})
                user_views[arxiv_id] = user_views.get(arxiv_id, 0) + 1

            # NOTE(review): characters 22..27 of the path are stored as the file tag; this
            # presumably matches a fixed directory layout -- confirm against the callers.
            process_files.append(filename[22:28])
            JSONFile.write(process_files, store_files)

            # Checkpoint the partial counts after each file.
            JSONFile.write([process_data, paper], store_filename)

        # Final write, so the output also exists when filenames is empty.
        JSONFile.write([process_data, paper], store_filename)
Ejemplo n.º 24
0
def SGD(start,
        gradient,
        n,
        function,
        exact_gradient=None,
        args=(),
        kwargs={},
        bounds=None,
        learning_rate=0.1,
        momentum=0.0,
        maxepoch=250,
        adam=True,
        betas=None,
        eps=1e-8,
        simplex_domain=None,
        name_model='1',
        method='real_gradient',
        n_epochs=1,
        n_samples=100,
        gradient_samples=None,
        problem=None,
        exact_objective=None):
    """
    SGD to minimize sum(i=0 -> n) (1/n) * f(x).
    ADAM: https://arxiv.org/pdf/1412.6980.pdf

    At every epoch the gradient is estimated as the mean of n calls to gradient, and the step
    size is learning_rate / (epoch + 1). The trajectory is recorded and written with
    JSONFile.write to data/multi_start/<problem>/training_results/<name_model>.

    :param start: np.array(n), starting point
    :param gradient: function called as gradient(point, *args, **kwargs)
    :param n: (int) number of calls to gradient averaged at each epoch
    :param function: objective, called as function(point); its value is recorded at each epoch
    :param exact_gradient: optional function; recorded when method == 'real_gradient'
    :param args: () arguments for the gradient
    :param kwargs: keyword arguments for the gradient
    :param bounds: [(min, max)] for each point. It is iterated whenever bounds or
        simplex_domain is given, so it has to be provided together with simplex_domain.
    :param learning_rate: (float) initial step size
    :param momentum: (float) momentum coefficient, only used when adam is False
    :param maxepoch: (int) number of epochs
    :param adam: (boolean) use ADAM updates if True, momentum SGD otherwise
    :param betas: (float, float) ADAM decay rates; (0.9, 0.999) if None
    :param eps: (float) ADAM denominator offset
    :param simplex_domain: (float) upper bound on the sum of the entries of the point
    :param name_model: (str) name of the file with the results
    :param method: (str) 'real_gradient' or 'grad_epoch'
    :param n_epochs: (int) if method == 'grad_epoch', gradient_samples is evaluated every
        n_epochs epochs
    :param n_samples: (int) second argument of gradient_samples
    :param gradient_samples: function called as gradient_samples(point, n_samples)
    :param problem: (str) name of the directory where the results are written. The path is
        built by string concatenation, so it has to be provided.
    :param exact_objective: optional function; its value is recorded at each epoch
    :return: {'points', 'values', 'gradients', 'n_epochs', 'stochastic_gradients',
        'exact_values'}
    """
    values = []
    points = []

    gradients = []
    exact_values = []
    stochastic_gradients = []

    if method == 'grad_epoch':
        gradients = {}

    logger.info('start_value')
    logger.info(function(start))

    project = bounds is not None or simplex_domain is not None

    if betas is None:
        betas = (0.9, 0.999)

    m0 = np.zeros(len(start))
    v0 = np.zeros(len(start))

    point = start
    v = np.zeros(len(start))
    t_ = 0

    lr = learning_rate

    for iteration in xrange(maxepoch):
        learning_rate = lr / float(iteration + 1)
        t_ += 1
        grad = []

        for j in xrange(n):
            gradient_ = gradient(point, *args, **kwargs)

            # NOTE(review): this is an identity test, so it only triggers when gradient returns
            # the np.nan object itself, not an array containing nan values.
            while gradient_ is np.nan:
                # Perturb the point slightly (staying inside the bounds when projecting) and
                # retry until the gradient can be computed.
                norm_point = np.sqrt(np.sum(point**2))
                perturbation = norm_point * 1e-6

                if project:
                    parameters_uniform = []
                    for i in range(len(bounds)):
                        bound = bounds[i]
                        dist = point[i] - bound[0]
                        lb = min(perturbation, dist)
                        dist = bound[1] - point[i]
                        ub = min(perturbation, dist)
                        parameters_uniform.append([-lb, ub])
                else:
                    parameters_uniform = len(point) * [[
                        -perturbation, perturbation
                    ]]

                perturbation = []
                for i in range(len(point)):
                    lb = parameters_uniform[i][0]
                    ub = parameters_uniform[i][1]
                    perturbation.append(np.random.uniform(lb, ub))
                perturbation = np.array(perturbation)
                point = point + perturbation
                gradient_ = gradient(point, *args, **kwargs)
            grad.append(gradient_)
        gradient_ = np.mean(np.array(grad), axis=0)
        stochastic_gradients.append(gradient_)

        # Record the state of the current point before it is updated.
        if exact_gradient is not None and method == 'real_gradient':
            gradients.append(exact_gradient(point))
        points.append(np.array(point))
        values.append(function(point))

        if exact_objective is not None:
            exact_values.append(exact_objective(point))

        if not adam:
            v = momentum * v + gradient_
            old_p = point.copy()
            point -= learning_rate * v
        else:
            # Bias-corrected first and second moment estimates.
            m0 = betas[0] * m0 + (1 - betas[0]) * gradient_
            v0 = betas[1] * v0 + (1 - betas[1]) * (gradient_**2)
            m_1 = m0 / (1 - (betas[0])**(t_))
            v_1 = v0 / (1 - (betas[1])**(t_))
            point = point - learning_rate * m_1 / (np.sqrt(v_1) + eps)

        in_domain = True
        if project:
            for dim, bound in enumerate(bounds):
                if bound[0] is not None and point[dim] < bound[0]:
                    in_domain = False
                    break
                if bound[1] is not None and point[dim] > bound[1]:
                    in_domain = False
                    break
                if simplex_domain is not None:
                    if np.sum(point) > simplex_domain:
                        in_domain = False
                        break
                    #TODO:Only for citibike, generalize later
                    if simplex_domain - np.sum(point) > 3717.0:
                        in_domain = False
                        break

        if project and not in_domain:
            # Clip to the box first, then rescale to satisfy the constraints on the sum.
            for dim, bound in enumerate(bounds):
                if bound[0] is not None:
                    point[dim] = max(bound[0], point[dim])
                if bound[1] is not None:
                    point[dim] = min(bound[1], point[dim])
            if simplex_domain is not None:
                if np.sum(point) > simplex_domain:
                    point = simplex_domain * (point / np.sum(point))

                if simplex_domain - np.sum(point) > 3717.0:
                    point = (simplex_domain - 3717.0) * (point / np.sum(point))
            if not adam:
                # Make the velocity consistent with the step that was actually taken.
                for dim, bound in enumerate(bounds):
                    v[dim] = (point[dim] - old_p[dim]) / learning_rate

    # Record the final point as well.
    if exact_gradient is not None and method == 'real_gradient':
        gradients.append(exact_gradient(point))
    points.append(np.array(point))
    values.append(function(point))

    if exact_objective is not None:
        exact_values.append(exact_objective(point))

    gradient_ = np.array(gradient(point, *args, **kwargs))
    stochastic_gradients.append(gradient_)

    if method == 'grad_epoch':
        for iteration in range(maxepoch):
            if iteration % n_epochs == (n_epochs - 1):
                gradients[iteration] = gradient_samples(
                    points[iteration], n_samples)

    results = {
        'points': points,
        'values': values,
        'gradients': gradients,
        'n_epochs': n_epochs,
        'stochastic_gradients': stochastic_gradients,
        'exact_values': exact_values
    }

    if not os.path.exists('data/multi_start'):
        os.mkdir('data/multi_start')
    if not os.path.exists('data/multi_start/' + problem):
        os.mkdir('data/multi_start/' + problem)
    f_name = 'data/multi_start/' + problem + '/'
    if not os.path.exists(f_name + 'training_results'):
        os.mkdir(f_name + 'training_results')
    f_name += 'training_results' + '/' + name_model

    # exact_values is empty when exact_objective is None (the default); indexing it
    # unconditionally raised IndexError before the results were written.
    if exact_values:
        print("optimal_value!!!")
        print(exact_values[-1])
    JSONFile.write(results, f_name)

    return results
    def get_training_data(cls,
                          problem_name,
                          training_name,
                          bounds_domain,
                          n_training=5,
                          points=None,
                          noise=False,
                          n_samples=None,
                          random_seed=DEFAULT_RANDOM_SEED,
                          parallel=True,
                          type_bounds=None,
                          cache=True,
                          gp_path_cache=None,
                          simplex_domain=None,
                          objective_function=None):
        """
        Get the training data of a problem, evaluating the objective only when the data cannot
        be read from a cache file.

        :param problem_name: str
        :param training_name: (str), prefix used to save the training data.
        :param bounds_domain: [([float, float] or [float])], the first case is when the bounds are
            lower or upper bound of the respective entry; in the second case, it's list of finite
            points representing the domain of that entry.
        :param n_training: (int), number of training points if points is None
        :param points: [[float]]
        :param noise: boolean, true if the evaluations are noisy
        :param n_samples: int. If noise is true, we take n_samples of the function to estimate its
            value.
        :param random_seed: int
        :param parallel: (boolean) Evaluate the points in parallel if it's True.
        :param type_bounds: [0 or 1], 0 if the bounds are lower or upper bound of the respective
            entry, 1 if the bounds are all the finite options for that entry.
        :param cache: (boolean) Try to read the data from the cache files
        :param gp_path_cache: (str) path of a cached file whose 'data' entry is the training data
        :param simplex_domain: passed to get_points_domain when the points are generated
        :param objective_function: function evaluated at the points; if None, the module of the
            problem is imported and evaluated instead
        :return: {'points': [[float]], 'evaluations': [float], 'var_noise': [float] or []}
        """

        # First option: the data stored in the cached gp file.
        if cache and gp_path_cache is not None:
            cached_gp = JSONFile.read(gp_path_cache)
            if cached_gp is not None:
                return cached_gp['data']

        logger.info("Getting training data")

        # When the points are given, the file name uses their number and seed 0.
        points_given = points is not None and len(points) > 0
        if points_given:
            n_training = len(points)
        seed_file_name = 0 if points_given else random_seed

        file_name = cls._filename(
            problem_name=problem_name,
            training_name=training_name,
            n_points=n_training,
            random_seed=seed_file_name,
        )

        training_dir = path.join(PROBLEM_DIR, problem_name, 'data')

        # Create the chain of directories, from the outermost to the innermost.
        for directory in (PROBLEM_DIR, path.join(PROBLEM_DIR, problem_name), training_dir):
            if not os.path.exists(directory):
                os.mkdir(directory)

        training_path = path.join(training_dir, file_name)

        # Second option: the file of the training data itself.
        training_data = JSONFile.read(training_path) if cache else None
        if training_data is not None:
            return training_data

        if n_training == 0:
            return {'points': [], 'evaluations': [], 'var_noise': []}

        np.random.seed(random_seed)

        if not points_given:
            points = cls.get_points_domain(n_training,
                                           bounds_domain,
                                           random_seed,
                                           training_name,
                                           problem_name,
                                           type_bounds,
                                           simplex_domain=simplex_domain)

        name_module = None
        module = None
        if objective_function is None:
            name_module = cls.get_name_module(problem_name)
            module = __import__(name_module, globals(), locals(), -1)

        training_data = {'points': points, 'evaluations': [], 'var_noise': []}

        if not parallel:
            # n_samples is only passed to the objective when the evaluations are noisy.
            sample_args = (n_samples,) if noise else ()
            for point in points:
                if module is not None:
                    evaluation = cls.evaluate_function(module, point, *sample_args)
                else:
                    evaluation = objective_function(point, *sample_args)

                if noise:
                    training_data['var_noise'].append(evaluation[1])
                training_data['evaluations'].append(evaluation[0])

                # Save the partial results after every evaluation.
                JSONFile.write(training_data, training_path)
            JSONFile.write(training_data, training_path)
            return training_data

        arguments = convert_list_to_dictionary(points)

        kwargs = {
            'name_module': name_module,
            'cls_': cls,
            'n_samples': n_samples
        }
        if name_module is None:
            kwargs['objective_function'] = objective_function

        evaluations = Parallel.run_function_different_arguments_parallel(
            wrapper_evaluate_objective_function, arguments, **kwargs)

        evaluations = convert_dictionary_to_list(evaluations)

        training_data['evaluations'] = [result[0] for result in evaluations]

        if noise:
            training_data['var_noise'] = [result[1] for result in evaluations]

        if cache:
            JSONFile.write(training_data, training_path)

        return training_data
Ejemplo n.º 26
0
    def cv_data_sets(cls, year, month, n_folds=5, random_seed=1):
        """
        Splits the training data of the month in n_folds folds, and writes for each fold the
        pair of datasets (training_data, validation_data), both as JSON and as matlab files.
        The shuffled indexes of the folds are written too.

        :param year: str
        :param month: str (e.g. '1', '12')
        :param n_folds: int
        :param random_seed: int

        """
        random.seed(random_seed)

        data = JSONFile.read(cls._name_training_data(year=year, month=month))

        shuffled = list(range(len(data)))
        random.shuffle(shuffled)

        # Every fold gets fold_size consecutive shuffled indexes ...
        fold_size = len(shuffled) // n_folds
        folds = [
            shuffled[k * fold_size:(k + 1) * fold_size]
            for k in xrange(n_folds)
        ]

        # ... and the indexes left over are given one by one to the first folds.
        for k, leftover in enumerate(shuffled[n_folds * fold_size:]):
            folds[k].append(leftover)

        JSONFile.write(folds, cls._name_fold_indexes(year=year, month=month))

        for fold in xrange(n_folds):
            validation = [data[index] for index in folds[fold]]

            # The training set is the union of the other folds, in fold order.
            training = [
                data[index]
                for other in xrange(n_folds) if other != fold
                for index in folds[other]
            ]

            JSONFile.write(
                training,
                cls._name_fold_data_training(year=year, month=month, fold=fold))

            sio.savemat(
                cls._name_fold_data_training_matlab(year=year, month=month, fold=fold),
                {'training': training})

            JSONFile.write(
                validation,
                cls._name_fold_data_validation(year=year, month=month, fold=fold))

            sio.savemat(
                cls._name_fold_data_validation_matlab(year=year, month=month, fold=fold),
                {'validation': validation})
    def collect_multi_spec_results(cls,
                                   multiple_spec,
                                   total_iterations=None,
                                   sign=True,
                                   sqr=False,
                                   same_random_seeds=False,
                                   rs_lw=0,
                                   rs_up=None):
        """
        Writes the files with the aggregated results.

        The partial results of every spec are read, grouped by
        (problem_name, training_name, n_training, method), and for every iteration the mean,
        std, number of samples and a confidence interval (mean -/+ 1.96 * std / sqrt(n)) are
        computed. One file is written per group in the aggregated results directory of the
        problem. Specs whose directory or results file doesn't exist are skipped.

        :param multiple_spec: dictionary-like object with the lists 'random_seeds',
            'problem_names', 'training_names', 'n_trainings', 'method_optimizations',
            'n_samples_parameterss' and 'n_iterationss', read with .get and indexed by spec.
        :param total_iterations: (int) Collect results until this iteration (10000 if None)
        :param sign: (boolean) If true, we multiply the results by -1
        :param sqr: (boolean) If true, we take the square root of the results
        :param same_random_seeds: (boolean) If true, we only use the random seeds that have
            results for all the methods
        :param rs_lw: (int) lower bound (inclusive) of the random seeds, used if rs_up is not None
        :param rs_up: (int) upper bound (exclusive) of the random seeds. If it's not None,
            same_random_seeds is set to True.
        :return:
        """

        if total_iterations is None:
            total_iterations = 10000

        n_specs = len(multiple_spec.get('random_seeds'))

        results_dict = {}

        # From here on, sign is the multiplier applied to every result.
        if sign:
            sign = -1.0
        else:
            sign = 1.0

        if sqr:
            f = lambda x: x**0.5
        else:
            f = lambda x: x

        if rs_up is not None:
            same_random_seeds = True

        if same_random_seeds:
            # First pass: for each method, collect the random seeds whose results file exists.
            random_seeds = {}
            for method in set(multiple_spec.get('method_optimizations')):
                random_seeds[method] = []
            for i in xrange(n_specs):
                problem_name = multiple_spec.get('problem_names')[i]
                # NOTE: dir shadows the builtin with the same name inside this method.
                dir = path.join(PROBLEM_DIR, problem_name, PARTIAL_RESULTS)

                if not os.path.exists(dir):
                    continue

                training_name = multiple_spec.get('training_names')[i]
                n_training = multiple_spec.get('n_trainings')[i]
                random_seed = multiple_spec.get('random_seeds')[i]
                method = multiple_spec.get('method_optimizations')[i]
                n_samples_parameters = multiple_spec.get(
                    'n_samples_parameterss')[i]
                n_iterations = multiple_spec.get('n_iterationss')[i]

                file_name = cls._filename_results(
                    problem_name=problem_name,
                    training_name=training_name,
                    n_points=n_training,
                    random_seed=random_seed,
                    method=method,
                    n_samples_parameters=n_samples_parameters,
                )

                file_path = path.join(dir, file_name)
                if not os.path.exists(file_path):
                    continue
                random_seeds[method].append(random_seed)

            # Keep only the seeds that are available for every method.
            methods = list(set(multiple_spec.get('method_optimizations')))
            random_seeds_check = set(random_seeds[methods[0]])
            for i in xrange(1, len(methods)):
                random_seeds_check = random_seeds_check.intersection(
                    random_seeds[methods[i]])

            # Optionally restrict the seeds to [rs_lw, rs_up).
            if rs_up is not None:
                random_seeds_check = random_seeds_check.intersection(
                    range(rs_lw, rs_up))

        # Second pass: read the results and group them by key and iteration.
        for i in xrange(n_specs):
            problem_name = multiple_spec.get('problem_names')[i]
            dir = path.join(PROBLEM_DIR, problem_name, PARTIAL_RESULTS)

            if not os.path.exists(dir):
                continue

            training_name = multiple_spec.get('training_names')[i]
            n_training = multiple_spec.get('n_trainings')[i]
            random_seed = multiple_spec.get('random_seeds')[i]
            method = multiple_spec.get('method_optimizations')[i]
            n_samples_parameters = multiple_spec.get(
                'n_samples_parameterss')[i]
            n_iterations = multiple_spec.get('n_iterationss')[i]

            if same_random_seeds and random_seed not in random_seeds_check:
                continue

            file_name = cls._filename_results(
                problem_name=problem_name,
                training_name=training_name,
                n_points=n_training,
                random_seed=random_seed,
                method=method,
                n_samples_parameters=n_samples_parameters,
            )

            file_path = path.join(dir, file_name)

            if not os.path.exists(file_path):
                continue

            results = JSONFile.read(file_path)
            results = results['objective_values']

            # results_dict[key][iteration] is the list of values of that iteration, with one
            # value per spec that reached the iteration.
            # NOTE(review): the list is sized with the n_iterations of the first spec of the
            # key; a later spec of the same key with a larger n_iterations would index past
            # its end below -- confirm that all the specs of a key share n_iterations.
            key_dict = (problem_name, training_name, n_training, method)
            if key_dict not in results_dict:
                results_dict[key_dict] = \
                    [[] for _ in range(min(n_iterations + 1, total_iterations))]

            for iteration in range(
                    min(total_iterations, n_iterations + 1, len(results))):
                results_dict[key_dict][iteration].append(
                    f(sign * results[iteration]))

        problem_names = list(set(multiple_spec.get('problem_names')))
        training_names = set(multiple_spec.get('training_names'))
        n_trainings = set(multiple_spec.get('n_trainings'))
        methods = set(multiple_spec.get('method_optimizations'))

        aggregated_results = {}

        # Every combination is visited; the ones without results are left empty and no file
        # is written for them.
        for problem in problem_names:
            for training in training_names:
                for n_training in n_trainings:
                    for method in methods:

                        key = (problem, training, n_training, method)
                        aggregated_results[key] = {}

                        if key not in results_dict:
                            continue

                        results = results_dict[key]

                        for iteration in xrange(
                                min(len(results), total_iterations)):
                            if len(results[iteration]) > 0:
                                values = results[iteration]
                                mean = np.mean(values)
                                std = np.std(values)
                                n_samples = len(results[iteration])
                                ci_low = mean - 1.96 * std / np.sqrt(n_samples)
                                ci_up = mean + 1.96 * std / np.sqrt(n_samples)

                                aggregated_results[key][iteration] = {}
                                aggregated_results[key][iteration][
                                    'mean'] = mean
                                aggregated_results[key][iteration]['std'] = std
                                aggregated_results[key][iteration][
                                    'n_samples'] = n_samples
                                aggregated_results[key][iteration][
                                    'ci_low'] = ci_low
                                aggregated_results[key][iteration][
                                    'ci_up'] = ci_up
                            else:
                                # Stop at the first iteration without any value.
                                break

                        if len(aggregated_results[key]) > 0:
                            dir = path.join(PROBLEM_DIR, problem,
                                            AGGREGATED_RESULTS)

                            if not os.path.exists(dir):
                                os.mkdir(dir)

                            file_name = cls._aggregated_results(
                                problem_name=problem,
                                training_name=training,
                                n_points=n_training,
                                method=method,
                            )

                            file_path = path.join(dir, file_name)
                            JSONFile.write(aggregated_results[key], file_path)
Ejemplo n.º 28
0
 def test_write_debug_data(self, mock_mkdir, mock_exists):
     """Check that write_debug_data asks for the directory 'data/debugging/a' to be created."""
     # mock_exists and mock_mkdir are injected by patch decorators that are not visible here;
     # presumably they replace os.path.exists and os.mkdir -- confirm against the decorators.
     mock_exists.return_value = False
     # open is mocked (Python 2 '__builtin__' module), so nothing is written to disk.
     with patch('__builtin__.open', mock_open()):
         self.gp.write_debug_data("a", "b", "c", "d", "e")
         JSONFile.write([], "a")
     # assert_called_with only checks the most recent call to the mock.
     mock_mkdir.assert_called_with('data/debugging/a')
Ejemplo n.º 29
0
    def top_users_papers(cls,
                         year,
                         month,
                         n_entries=100,
                         different_papers=20,
                         top_n=5000,
                         n_users=None,
                         only_assign_categories=True):
        """
        Rank the most viewed papers of a month and the users that read them.

        Reads the file named by cls._name_file_(year, month), keeps the top_n
        papers by number of views and passes them to cls.assign_categories.
        Unless only_assign_categories is set, it then selects the users that
        accessed at least n_entries papers, of which at least different_papers
        distinct papers are among the top_n papers, writes both rankings to the
        file named by cls._name_file_final(year, month) and returns them.

        :param year: (str)
        :param month: (str) e.g. '1', '12'
        :param n_entries: (int) minimum total number of accesses of a user
        :param different_papers: (int) minimum number of distinct papers of the
            user that must be among the top_n papers
        :param top_n: (int) number of most viewed papers kept; a non-positive
            value keeps no paper
        :param n_users: (int) Maximum number of users allowed (None: no limit).
            When there are more candidates, the ones with most distinct top
            papers are kept.
        :param only_assign_categories: (bool) if True (the default), return
            right after cls.assign_categories: nothing is written and None is
            returned.
        :return: None if only_assign_categories is True, otherwise
            [ {'paper': (int) number of times seen},
              {'user': {'stats': ((int) # entries,
                                  (int) # different papers in the top_n papers),
                        'diff_papers': [str]
                  }
              }
            ]
        """

        file_name = cls._name_file_(year=year, month=month)
        data = JSONFile.read(file_name)

        users = data[0]
        papers = data[1]

        paper_ls = list(papers)
        n_papers = [papers[paper]['views'] for paper in paper_ls]

        # Ascending by views; the sort is stable, so ties keep their file order.
        index_top_papers = sorted(range(len(n_papers)),
                                  key=lambda k: n_papers[k])
        # Slice from an explicit start index: [-top_n:] would keep every paper
        # when top_n == 0 because -0 == 0.
        first_kept = max(len(index_top_papers) - top_n, 0)
        index_top_papers = index_top_papers[first_kept:]

        rank_papers = {}
        for index in index_top_papers:
            rank_papers[paper_ls[index]] = n_papers[index]

        # Materialize the keys so the same list is obtained on Python 2 and 3.
        paper_ls = list(rank_papers)

        cls.assign_categories(paper_ls)

        if only_assign_categories:
            return

        users_ls = list(users)
        n_entries_ls = [sum(users[user].values()) for user in users_ls]

        # Order users by total number of accesses so that bisect can drop
        # everyone below the n_entries threshold in one step.
        index_top_users = sorted(range(len(n_entries_ls)),
                                 key=lambda k: n_entries_ls[k])
        users_ls = [users_ls[i] for i in index_top_users]
        n_entries_ls = [n_entries_ls[i] for i in index_top_users]
        ind_bis = bisect_left(n_entries_ls, n_entries)

        users_ls = users_ls[ind_bis:]
        n_entries_ls = n_entries_ls[ind_bis:]

        # Built once: it does not depend on the user being inspected.
        top_papers = set(paper_ls)

        rank_user = {}
        final_users = []
        metric_users = []
        for user, n in zip(users_ls, n_entries_ls):
            diff_papers = top_papers.intersection(users[user])
            n_diff = len(diff_papers)
            if n_diff < different_papers:
                continue
            final_users.append(user)
            metric_users.append(n_diff)
            rank_user[user] = {
                'stats': (n, n_diff),
                # A sorted list (not a set): matches the documented [str] and
                # can be serialized to JSON.
                'diff_papers': sorted(diff_papers)
            }

        if n_users is not None and len(final_users) > n_users:
            # Ascending by number of distinct top papers; keep the last n_users.
            index_top_users = sorted(range(len(final_users)),
                                     key=lambda k: metric_users[k])
            # Explicit start index: [-n_users:] would keep every user when
            # n_users == 0.
            first_kept = len(index_top_users) - n_users
            index_top_users = index_top_users[first_kept:]

            rank_user_final = {}
            for ind in index_top_users:
                rank_user_final[final_users[ind]] = rank_user[final_users[ind]]
            rank_user = rank_user_final

        file_name = cls._name_file_final(year=year, month=month)
        JSONFile.write([rank_papers, rank_user], file_name)

        logger.info('Number of papers is %d' % len(rank_papers))
        logger.info('Number of users is %d' % len(rank_user))

        return [rank_papers, rank_user]