Exemple #1
0
    def _main_loop(self, ref_set, scenario_builder, population):
        """Scatter-search main loop: combine ref_set solutions, improve
        the resulting pool, and rebuild the reference set until it stops
        changing for 5 iterations (or a stop-flag file appears).

        ref_set - container with the current best solutions
        scenario_builder - builder used to combine/improve solutions
        population - diverse population used to re-seed a stalled ref_set
        """
        stop = False
        last_changed = 0  # consecutive iterations with an unchanged ref_set
        iteration = 0
        while not stop:
            start_loop = datetime.now()
            # create the pool from combining solutions from ref_set
            logging.info("Performing combinations")
            start = datetime.now()
            pool = self._combine(ref_set, scenario_builder)
            logging.info("Combinations %s" % (datetime.now() - start))

            # improve pool
            logging.info("Improving best combinations")
            start = datetime.now()
            pool = self._mp_improve(pool, scenario_builder)
            logging.info("Improvements %s" % (datetime.now() - start))

            # join ref_set and pool together
            union = deepcopy(ref_set)
            union.add_container(pool)
            union.sort()

            new_ref_set = self._ref_set_update(union)
            if ref_set.same(new_ref_set):
                logging.info("Ref_set not changed")
                # re-seed: keep the best half of the union, fill the rest
                # with the most diverse solutions from the population
                new_ref_set = ScatterPhenoScenarioContainer()

                # integer (floor) division: the original "self._b/2" is a
                # float under Python 3 and makes range() raise TypeError
                for i in range(self._b // 2):
                    new_ref_set.add(union.get(i))

                # get the most diverse solutions to what
                # we already have in ref_set
                while new_ref_set.len() < self._b:
                    new_ref_set.add(population.get_diverse(new_ref_set))

            if ref_set.same(new_ref_set):
                last_changed += 1
            else:
                last_changed = 0

            # five stagnant iterations in a row counts as convergence
            if last_changed >= 5:
                logging.info("Reached optimal solution, terminating...")
                stop = True

            # manual kill-switch via a file flag
            if os.path.exists('/home/eey9/.stop_scatter_search'):
                stop = True
                logging.info("Stopping because of file flag...")

            ref_set = new_ref_set
            iteration += 1
            logging.info("Completed iteration %d" % iteration)
            self._report(ref_set, scenario_builder)
            t_delta = datetime.now() - start_loop
            logging.info("Iteration time %s" % t_delta)

        return
Exemple #2
0
    def _ref_set_update(self, source):
        """Build a fresh reference set of self._b solutions from *source*:
        the best b/2 by utility plus the most diverse remainder.
        """
        source.sort()
        ref_set = ScatterPhenoScenarioContainer()
        # integer division: "self._b/2" yields a float on Python 3,
        # which makes range() raise TypeError
        for i in range(self._b // 2):
            ref_set.add(source.get(i))

        # get the most diverse solutions to what we already have in ref_set
        while ref_set.len() < self._b:
            ref_set.add(source.get_diverse(ref_set))

        return ref_set
Exemple #3
0
    def _improve(self, individual, scenario_builder):
        """Local search around *individual*: flip each candidate variable
        and swap every pair of solution positions, evaluate the valid
        neighbours and return them sorted by utility.

        Returns {'individual': the input, 'improvements': sorted container}.
        """
        started = datetime.now()
        base = importr("base")

        improvements = ScatterPhenoScenarioContainer()

        # single-variable flips, guided by the score table
        for candidate in self._build_candidate_list(individual):
            neighbour = scenario_builder.flip(individual, candidate)
            if neighbour.same(individual) or \
                    not neighbour.valid(process=True):
                continue

            neighbour = self._evaluate(neighbour, base)

            if not improvements.contains(neighbour):
                improvements.add(neighbour)

        # pairwise swaps of solution entries
        size = len(individual.get_solution())
        for i in range(size):
            for j in range(i + 1, size):
                neighbour = scenario_builder.swap(individual, i, j)
                if neighbour.same(individual) or \
                        not neighbour.valid(process=True):
                    continue

                neighbour = self._evaluate(neighbour, base)
                if not improvements.contains(neighbour):
                    improvements.add(neighbour)

                # swap results are also recorded in the global database
                if not self._database.contains(neighbour):
                    self._database.add(neighbour)

        improvements.sort()

        logging.info("self._improve finished - %s" %
                     (datetime.now() - started))
        return {'individual': individual,
                'improvements': improvements}
Exemple #4
0
    def _combine(self, container, scenario_builder):
        """Combine every pair of solutions from *container* into new
        scenarios; return the pool of unique results. Successful
        combinations better than a ref_set member are reported back to
        the builder via scenario_builder.success().
        """
        # build subsets
        combinations = self._build_combinations(container)
        total = len(combinations)
        pool = ScatterPhenoScenarioContainer()
        # enumerate replaces combinations.index(combination): index() is
        # an O(n) scan per iteration and reports the wrong ordinal when
        # two combinations compare equal
        for num, combination in enumerate(combinations, start=1):
            start = datetime.now()
            try:
                new_scenario = scenario_builder.combine(combination[0],
                                                        combination[1],
                                                        self._score_table)
            except NoValidSolutionException:
                logging.info("Combination %d/%d - %s: no valid solution" %
                             (num, total, (datetime.now() - start)))
                continue

            self._update_score_table(new_scenario)
            if not pool.contains(new_scenario):
                pool.add(new_scenario)

            # see where does the scenario qualify to be in container
            try:
                j = container.index(next(x for x in container.get_all()
                                    if new_scenario.get_utility() <
                                    x.get_utility()))
                scenario_builder.success(self._b - j)
            except StopIteration:
                continue  # Worse than anything in ref_set, does not qualify

            logging.info("Combination %d/%d - %s" %
                         (num, total, (datetime.now() - start)))

        return pool
    def g1(self, count):
        """Diversification generator G1: produce *count* starting
        solutions by taking a random seed and deriving others from it by
        toggling every h-th position, for h = 2 .. h_max-1.

        Raises Exception when count > h_max - 1 (impossible by
        construction) or when 10 restarts still yield too few valid
        solutions.
        """
        done = False
        iterations = 0  # restart counter (new random seed each restart)
        while not done:
            solutions = ScatterPhenoScenarioContainer()
            seed_solution = self._random_solution()
            solutions.add(seed_solution)

            # step sizes are bounded by the solution length self._n
            h_max = self._n

            # maximum number of solutions is h_max - 1
            if count > h_max - 1:
                raise Exception("Could not generate %d solutions with "
                                "G1 and h_max %d" % (count, h_max))

            for h in range(2, h_max):
                # copy the seed and toggle every h-th position
                new_solution = deepcopy(seed_solution)
                index = 0
                while index < self._n:
                    new_solution.toggle(index)
                    index += h

                if solutions.len() < count and \
                        new_solution.valid(process=self._process):
                    solutions.add(new_solution)
                else:
                    # NOTE(review): this also breaks on the first invalid
                    # derived solution, not only once count is reached -
                    # confirm that is intentional
                    break

            if solutions.len() == count:
                done = True
            else:
                iterations += 1

            if iterations > 10:
                raise Exception("Could not generate enough valid solutions")

        return solutions
Exemple #6
0
    def __init__(self, data, scenario, root_dir, load=False):
        """data - {'cd_data': [], 'ml_data': []}
        scenario - 'simple_ml' or 'process_ml'
        root_dir - directory used for CSV reports and checkpoints
        load - when True, resume from score_table.csv / database.csv
               found in root_dir

        NOTE(review): the branch below accepts 'simple_ml' or 'compound',
        not 'process_ml' as stated above - confirm the intended values.
        NOTE: the constructor runs the entire algorithm as a side effect
        (_run_algorithm / _run_algorithm2).
        """

        # hardcoded algorithm variables, could supply them to the
        # constructor if needed
        # self._PSize = 45 TODO real value
        self._PSize = 12

        # weight for previous score entry, when updating the score table
        self._alpha = 0.3

        # reference set size
        # self._b = 20 TODO real value
        self._b = 8

        # worker processes used by _mp_improve
        self._proc_count = 4

        # set class variables
        self._variables = [formats.STEM_COUNT, formats.CANOPY_HEIGHT,
                           formats.TRANSMISSION, formats.FLOWERING_SCORE,
                           formats.LEAF_AREA_INDEX, formats.COL,
                           formats.ROW, formats.DD, formats.GENOTYPE,
                           formats.RAINFALL, formats.DOY, formats.PAR]
        self._variables.sort()

        self._scenario = scenario
        self._root_dir = root_dir
        if scenario == "simple_ml":
            self._methods = ['rf', 'knn', 'gbm']
        elif scenario == "compound":
            self._methods = ['NaiveMLProcessModelMemfix', 'GAWinModel']
        else:
            raise Exception("STUB")  # TODO

        # drop winter months and records with missing needed variables
        self._data = self._hack_data(data)

        # distinct month names present in the (filtered) ml_data
        self._months = list(set([x[formats.DATE].strftime("%B") for x in
                                self._data['ml_data']]))
        self._months.sort()

        # find maximum RMSE for methods
        self._max_rmse = self._get_max_rmse()

        # DB to contain all solutions ever explored
        self._database = ScatterPhenoScenarioContainer()
        self._score_table = self._empty_score_table()
        if load:
            # restore the score table from CSV; values come back as
            # strings, hence the explicit conversions below
            sc_file = os.path.join(self._root_dir, 'score_table.csv')
            self._score_table = CSVFileReader(sc_file).get_content()
            for entry in self._score_table:
                entry['score'] = float(entry['score'])
                entry['value'] = (entry['value'] == "True")

            db_file = os.path.join(self._root_dir, 'database.csv')
            self._database.load_file(db_file, self._data)
            self._update_score_table()

            self._run_algorithm2()
        else:
            self._run_algorithm()
Exemple #7
0
class ScatterPhenoAlgorithm:

    def __init__(self, data, scenario, root_dir, load=False):
        """data - {'cd_data': [], 'ml_data': []}
        scenario - 'simple_ml' or 'process_ml'
        root_dir - directory used for CSV reports and checkpoints
        load - when True, resume from score_table.csv / database.csv
               found in root_dir

        NOTE(review): the branch below accepts 'simple_ml' or 'compound',
        not 'process_ml' as stated above - confirm the intended values.
        NOTE: the constructor runs the entire algorithm as a side effect
        (_run_algorithm / _run_algorithm2).
        """

        # hardcoded algorithm variables, could supply them to the
        # constructor if needed
        # self._PSize = 45 TODO real value
        self._PSize = 12

        # weight for previous score entry, when updating the score table
        self._alpha = 0.3

        # reference set size
        # self._b = 20 TODO real value
        self._b = 8

        # worker processes used by _mp_improve
        self._proc_count = 4

        # set class variables
        self._variables = [formats.STEM_COUNT, formats.CANOPY_HEIGHT,
                           formats.TRANSMISSION, formats.FLOWERING_SCORE,
                           formats.LEAF_AREA_INDEX, formats.COL,
                           formats.ROW, formats.DD, formats.GENOTYPE,
                           formats.RAINFALL, formats.DOY, formats.PAR]
        self._variables.sort()

        self._scenario = scenario
        self._root_dir = root_dir
        if scenario == "simple_ml":
            self._methods = ['rf', 'knn', 'gbm']
        elif scenario == "compound":
            self._methods = ['NaiveMLProcessModelMemfix', 'GAWinModel']
        else:
            raise Exception("STUB")  # TODO

        # drop winter months and records with missing needed variables
        self._data = self._hack_data(data)

        # distinct month names present in the (filtered) ml_data
        self._months = list(set([x[formats.DATE].strftime("%B") for x in
                                self._data['ml_data']]))
        self._months.sort()

        # find maximum RMSE for methods
        self._max_rmse = self._get_max_rmse()

        # DB to contain all solutions ever explored
        self._database = ScatterPhenoScenarioContainer()
        self._score_table = self._empty_score_table()
        if load:
            # restore the score table from CSV; values come back as
            # strings, hence the explicit conversions below
            sc_file = os.path.join(self._root_dir, 'score_table.csv')
            self._score_table = CSVFileReader(sc_file).get_content()
            for entry in self._score_table:
                entry['score'] = float(entry['score'])
                entry['value'] = (entry['value'] == "True")

            db_file = os.path.join(self._root_dir, 'database.csv')
            self._database.load_file(db_file, self._data)
            self._update_score_table()

            self._run_algorithm2()
        else:
            self._run_algorithm()

    def _run_algorithm2(self):
        """Algorithm main method when resuming from saved state: reload
        combination-method stats, re-run the G1-G3 generators, rebuild
        the ref_set from the loaded database and enter the main loop.
        """
        scenario_builder = SPScenarioBuilder(self._data,
                                             self._variables,
                                             process=True)
        scenario_builder.load_cm(os.path.join(self._root_dir,
                                 'cm_functions.csv'))

        # G1 - integer division: "self._PSize/3" is a float on Python 3
        logging.info("G1")
        population = scenario_builder.g1(self._PSize // 3)
        self._update_score_table(population)

        # G2
        logging.info("G2")
        population = self._generator(population, scenario_builder.g2)

        # G3
        logging.info("G3")
        population = self._generator(population, scenario_builder.g3)

        # form ref set from database
        ref_set = self._ref_set_update(self._database)

        self._report(ref_set, scenario_builder)
        # Leaving this here in case I change my mind TODO
        # ref_set = ScatterPhenoScenarioContainer()
        # ref_set.load_file(os.path.join(self._root_dir, 'ref_set.csv'),
        #                   self._data)

        self._main_loop(ref_set, scenario_builder, population)

    def _run_algorithm(self):
        """Algorithm main method: generate the initial population
        (G1-G3), build and improve the reference set, then run the
        scatter-search main loop.
        """
        scenario_builder = SPScenarioBuilder(self._data,
                                             self._variables,
                                             process=True)

        # G1 - integer division: "self._PSize/3" is a float on Python 3
        logging.info("G1")
        start = datetime.now()
        population = scenario_builder.g1(self._PSize // 3)
        self._update_score_table(population)
        logging.info("G1 - %s" % (datetime.now() - start))

        # G2
        logging.info("G2")
        start = datetime.now()
        population = self._generator(population, scenario_builder.g2)
        logging.info("G2 - %s" % (datetime.now() - start))

        # G3
        start = datetime.now()
        logging.info("G3")
        population = self._generator(population, scenario_builder.g3)
        logging.info("G3 - %s" % (datetime.now() - start))

        # build ref set
        logging.info("Building ref_set")
        ref_set = self._ref_set_update(population)

        self._report(ref_set, scenario_builder)

        # parallel improvement of the best b/2 solutions
        start = datetime.now()
        logging.info("Improving ref_set")
        ref_set = self._mp_improve(ref_set, scenario_builder)
        logging.info("Improvements - %s" % (datetime.now() - start))

        self._main_loop(ref_set, scenario_builder, population)

    def _main_loop(self, ref_set, scenario_builder, population):
        """Scatter-search main loop: combine ref_set solutions, improve
        the resulting pool, and rebuild the reference set until it stops
        changing for 5 iterations (or a stop-flag file appears).

        ref_set - container with the current best solutions
        scenario_builder - builder used to combine/improve solutions
        population - diverse population used to re-seed a stalled ref_set
        """
        stop = False
        last_changed = 0  # consecutive iterations with an unchanged ref_set
        iteration = 0
        while not stop:
            start_loop = datetime.now()
            # create the pool from combining solutions from ref_set
            logging.info("Performing combinations")
            start = datetime.now()
            pool = self._combine(ref_set, scenario_builder)
            logging.info("Combinations %s" % (datetime.now() - start))

            # improve pool
            logging.info("Improving best combinations")
            start = datetime.now()
            pool = self._mp_improve(pool, scenario_builder)
            logging.info("Improvements %s" % (datetime.now() - start))

            # join ref_set and pool together
            union = deepcopy(ref_set)
            union.add_container(pool)
            union.sort()

            new_ref_set = self._ref_set_update(union)
            if ref_set.same(new_ref_set):
                logging.info("Ref_set not changed")
                # re-seed: keep the best half of the union, fill the rest
                # with the most diverse solutions from the population
                new_ref_set = ScatterPhenoScenarioContainer()

                # integer (floor) division: the original "self._b/2" is a
                # float under Python 3 and makes range() raise TypeError
                for i in range(self._b // 2):
                    new_ref_set.add(union.get(i))

                # get the most diverse solutions to what
                # we already have in ref_set
                while new_ref_set.len() < self._b:
                    new_ref_set.add(population.get_diverse(new_ref_set))

            if ref_set.same(new_ref_set):
                last_changed += 1
            else:
                last_changed = 0

            # five stagnant iterations in a row counts as convergence
            if last_changed >= 5:
                logging.info("Reached optimal solution, terminating...")
                stop = True

            # manual kill-switch via a file flag
            if os.path.exists('/home/eey9/.stop_scatter_search'):
                stop = True
                logging.info("Stopping because of file flag...")

            ref_set = new_ref_set
            iteration += 1
            logging.info("Completed iteration %d" % iteration)
            self._report(ref_set, scenario_builder)
            t_delta = datetime.now() - start_loop
            logging.info("Iteration time %s" % t_delta)

        return

    def _ref_set_update(self, source):
        """Build a fresh reference set of self._b solutions from *source*:
        the best b/2 by utility plus the most diverse remainder.
        """
        source.sort()
        ref_set = ScatterPhenoScenarioContainer()
        # integer division: "self._b/2" yields a float on Python 3,
        # which makes range() raise TypeError
        for i in range(self._b // 2):
            ref_set.add(source.get(i))

        # get the most diverse solutions to what we already have in ref_set
        while ref_set.len() < self._b:
            ref_set.add(source.get_diverse(ref_set))

        return ref_set

    def _combine(self, container, scenario_builder):
        """Combine every pair of solutions from *container* into new
        scenarios; return the pool of unique results. Successful
        combinations better than a ref_set member are reported back to
        the builder via scenario_builder.success().
        """
        # build subsets
        combinations = self._build_combinations(container)
        total = len(combinations)
        pool = ScatterPhenoScenarioContainer()
        # enumerate replaces combinations.index(combination): index() is
        # an O(n) scan per iteration and reports the wrong ordinal when
        # two combinations compare equal
        for num, combination in enumerate(combinations, start=1):
            start = datetime.now()
            try:
                new_scenario = scenario_builder.combine(combination[0],
                                                        combination[1],
                                                        self._score_table)
            except NoValidSolutionException:
                logging.info("Combination %d/%d - %s: no valid solution" %
                             (num, total, (datetime.now() - start)))
                continue

            self._update_score_table(new_scenario)
            if not pool.contains(new_scenario):
                pool.add(new_scenario)

            # see where does the scenario qualify to be in container
            try:
                j = container.index(next(x for x in container.get_all()
                                    if new_scenario.get_utility() <
                                    x.get_utility()))
                scenario_builder.success(self._b - j)
            except StopIteration:
                continue  # Worse than anything in ref_set, does not qualify

            logging.info("Combination %d/%d - %s" %
                         (num, total, (datetime.now() - start)))

        return pool

    def _sp_improve(self, container, scenario_builder):
        container.sort()

        best = []
        for i in range(self._b/2):
            best.append(container.get(i))

        result = []
        for scenario in best:
            result.append(self._improve(scenario, scenario_builder))

        for entry in result:
            index = container.index(entry['individual'])
            best = entry['improvements'].get(0)
            if best.get_utility() < entry['individual'].get_utility():
                container.replace(best, index)

            for improvement in entry['improvements'].get_all():
                self._update_score_table(improvement)

        logging.info("Improved %d solutions" % container.get_changes())
        container.reset_changes()
        return container

    def _mp_improve(self, container, scenario_builder):
        """Improves b/2 best solutions from the container and updates
        the score table with the generated solutions.

        Uses a pathos-style process pool; pool.map fans _improve out
        over the best solutions.
        """
        container.sort()
        pool = Pool(processes=self._proc_count)

        logging.info("Starting processes")
        start = datetime.now()
        best = []
        builders = []
        # integer division: "self._b/2" is a float on Python 3 and
        # would make range() raise TypeError
        for i in range(self._b // 2):
            best.append(container.get(i))
            builders.append(scenario_builder)

        try:
            result = pool.map(self._improve, best, builders)
            pool.close()
            pool.join()
        except MemoryError:
            # the old handler dropped into pudb and called the
            # non-existent e.message(), then fell through with `result`
            # undefined - notify, log and re-raise instead
            send_email("I crashed again, please help!")
            logging.exception("MemoryError while improving solutions")
            raise

        logging.info("Processes finished - %s" % (datetime.now() - start))
        # How infuriating was that?!
        # pathos was being smart and was caching pool so this is needed
        # to prevent from erroring out
        pool.restart()

        start = datetime.now()
        logging.info("mp_improve second loop")
        for entry in result:
            index = container.index(entry['individual'])
            # best improvement replaces the individual only when strictly
            # better (lower utility)
            top = entry['improvements'].get(0)
            if top.get_utility() < entry['individual'].get_utility():
                container.replace(top, index)

            for improvement in entry['improvements'].get_all():
                self._update_score_table(improvement)

        logging.info("mp_improve second loop - %s" % (datetime.now() - start))
        logging.info("Improved %d solutions" % container.get_changes())
        container.reset_changes()
        return container

    def _improve(self, individual, scenario_builder):
        """Local search around *individual*: flip each candidate variable
        and swap every pair of solution positions, evaluate the valid
        neighbours and return them sorted by utility.

        Returns {'individual': the input, 'improvements': sorted container}.
        """
        started = datetime.now()
        base = importr("base")

        improvements = ScatterPhenoScenarioContainer()

        # single-variable flips, guided by the score table
        for candidate in self._build_candidate_list(individual):
            neighbour = scenario_builder.flip(individual, candidate)
            if neighbour.same(individual) or \
                    not neighbour.valid(process=True):
                continue

            neighbour = self._evaluate(neighbour, base)

            if not improvements.contains(neighbour):
                improvements.add(neighbour)

        # pairwise swaps of solution entries
        size = len(individual.get_solution())
        for i in range(size):
            for j in range(i + 1, size):
                neighbour = scenario_builder.swap(individual, i, j)
                if neighbour.same(individual) or \
                        not neighbour.valid(process=True):
                    continue

                neighbour = self._evaluate(neighbour, base)
                if not improvements.contains(neighbour):
                    improvements.add(neighbour)

                # swap results are also recorded in the global database
                if not self._database.contains(neighbour):
                    self._database.add(neighbour)

        improvements.sort()

        logging.info("self._improve finished - %s" %
                     (datetime.now() - started))
        return {'individual': individual,
                'improvements': improvements}

    def _build_candidate_list(self, individual):
        candidate_list = []
        for entry in individual.get_solution():
            t = next(x for x in entry.keys() if x != 'value')
            candidate_list.append(next(x for x in self._score_table
                                       if x['type'] == t and
                                       entry[t] == x['name'] and
                                       entry['value'] != x['value']))

        # smallest score = highest probability so DONT CHANGE THIS
        candidate_list.sort(key=lambda x: x['score'])
        return candidate_list

    def _generator(self, population, func):
        generated = 0
        while generated < self._PSize/3:
            worked = False
            while not worked:
                try:
                    individual = func(self._score_table)
                    population.add(individual)
                    self._update_score_table(individual)
                    generated += 1
                    worked = True
                except ScatterPhenoScenarioContainerException:
                    # already exists
                    pass

        return population

    def _evaluate(self, pheno_scenario, base=None):
        if base is None:
            base = glob_base

        if self._database.contains(pheno_scenario):
            return self._database.request(pheno_scenario)

        model = ScatterPhenoModel(self._data,
                                  base,
                                  pheno_scenario,
                                  self._methods,
                                  self._max_rmse)

        util = (0.3 * pheno_scenario.get_cost()) + model.get_rmse()
        pheno_scenario.set_utility(util)
        pheno_scenario.set_rmse(model.get_absolute_rmse())
        return pheno_scenario

    def _update_score_table(self, input_=None):
        """Update self._score_table and the solution database.

        input_ may be:
          None - recompute the table from the whole database
          ScatterPhenoScenarioContainer - evaluate and register a whole
              population, then recompute the table
          ScatterPhenoScenario - register one individual and blend the
              recomputed table with the previous one (exponential
              smoothing with weight self._alpha)
        Any other type is silently ignored.
        """
        if input_ is None:
            self._score_table = self._calc_table(self._database.get_all())

        elif input_.__class__ == ScatterPhenoScenarioContainer:
            # we are given a population - this should happen after G1
            population = input_.get_all()

            for individual in population:
                if not self._database.contains(individual):
                    self._evaluate(individual)
                    self._database.add(individual)
                else:
                    logging.info("Warning, individual in database, weird!")

            # calculate score based on whole database population
            self._score_table = self._calc_table(self._database.get_all())

        elif input_.__class__ == ScatterPhenoScenario:
            individual = input_
            if self._database.contains(individual):
                # individual already in database, update the individual
                # with the rmse and utility
                index = self._database.index(individual)

                # but only if it needs updating
                if not individual.has_utility():
                    self._database.get(index).copy_to(individual)

                return

            if not individual.is_evaluated():
                self._evaluate(individual)

            # create the table at t
            new_table = self._calc_table(self._database.get_all() +
                                         [individual])

            # use the tables at t and t-1 to calculate the smoothed score
            for entry, new_entry in zip(self._score_table, new_table):
                entry['score'] = (self._alpha * entry['score'] +
                                  (1 - self._alpha) * new_entry['score'])

            self._database.add(individual)

    def _calc_table(self, population):
        new_table = []
        # using self._score_table just for template here,
        # none of the values will be copied
        for entry in self._score_table:
            new_entry = deepcopy(entry)
            index = self._score_table.index(entry)/2

            match = [x for x in population
                     if x.get(index) == entry['value']]
            non_match = [x for x in population
                         if x.get(index) != entry['value']]

            # little hack to get around certain variables not having
            # both values
            if len(match) == 0 or len(non_match) == 0:
                score = 0.5
            else:
                util_match = (sum([x.get_utility() for x in match]) /
                              len(match))
                util_non_match = (sum([x.get_utility()
                                       for x in non_match]) /
                                  len(non_match))

                score = util_match / (util_match + util_non_match)

            new_entry['score'] = score
            new_table.append(new_entry)

        return new_table

    def _get_max_rmse(self):
        rmses = dict()
        for method in self._methods:
            model = MLProcessModel(method, self._data, self._data,
                                   self._variables, glob_base)

            rmses[method] = model._rmse_abs

        return rmses

    def _build_combinations(self, container):
        container.sort()
        population = container.get_all()

        combinations = []
        for i in range(len(population)):
            for j in range(i + 1, len(population)):
                combination = [population[i], population[j]]
                combinations.append(combination)

        return combinations

    def _hack_data(self, data):
        """Filter *data*: keep only records from months 4-10 in every
        series, then drop ml_data rows missing any needed variable.
        Mutates and returns the same dict.
        """
        # remove winter months that are useless
        for key in data.keys():
            data[key] = [row for row in data[key]
                         if 3 < row[formats.DATE].month < 11]

        # remove records with missing values for needed variables
        required = deepcopy(self._variables)
        data['ml_data'] = [row for row in data['ml_data']
                           if all(row[var] is not None
                                  for var in required)]

        return data

    def _empty_score_table(self):
        score_table = []
        for month in self._months:
            for val in [False, True]:
                entry = dict()
                entry['name'] = month
                entry['type'] = 'month'
                entry['value'] = val
                entry['score'] = 0
                score_table.append(entry)

        for var in self._variables:
            for val in [False, True]:
                entry = dict()
                entry['name'] = var
                entry['type'] = 'variable'
                entry['value'] = val
                entry['score'] = 0
                score_table.append(entry)

        return score_table

    def _report(self, ref_set, sb):
        """Checkpoint the current state to CSV files under root_dir:
        ref_set, score table, solution database, combination-method
        stats (sb is the scenario builder) and a memory-usage log.
        """
        self._to_csv(os.path.join(self._root_dir, 'ref_set.csv'), ref_set)
        CSVFileWriter(os.path.join(self._root_dir, 'score_table.csv'),
                      self._score_table)
        self._to_csv(os.path.join(self._root_dir, 'database.csv'),
                     self._database)

        # delete the function reference from the dict and save the info to file
        # (works on a deepcopy so the builder keeps its callables)
        func = deepcopy(sb._cm_functions)
        for f in func:
            del f['function']

        CSVFileWriter(os.path.join(self._root_dir, 'cm_functions.csv'), func)

        # report memory usage
        # 'mem' is divided by 1024**2 - presumably memory() returns
        # bytes and this converts to MiB; TODO confirm
        mem_report = [{'date': datetime.now().strftime("%H:%M:%S %d/%m/%Y"),
                       'mem': memory()/float(1024**2),
                       'res_memory': res_memory(),
                       'total_memory': total_memory()}]

        # create the memfile on first report, append afterwards
        memfile = os.path.join(self._root_dir, 'memfile.csv')
        if not os.path.exists(memfile):
            CSVFileWriter(memfile, mem_report)
        else:
            CSVFileWriter(memfile, mem_report, write_mode='a')

    def _to_csv(self, fname, container):
        """Dump every solution held by *container* to *fname* as CSV."""
        rows = [scenario.to_dict() for scenario in container.get_all()]
        CSVFileWriter(fname, rows)