Beispiel #1
0
class GA_FeatureSelection():
    """Genetic-algorithm search over the columns of a UCM.

    Every individual is a list with one gene per UCM column.  The fitness of
    an individual is obtained by scaling the UCM columns with the genes,
    building a Tversky similarity from the scaled UCM, multiplying it with
    ``URM_train`` and averaging the track-level precision reported by the
    evaluator for categories 1 and 2.

    Two modes are supported:

    * ``"selection"`` -- mutation flips genes (``tools.mutFlipBit``);
    * ``"weighting"`` -- mutation is delegated to ``randomMutationCustom``.
    """

    # Modes accepted by the constructor.
    _VALID_MODES = ("weighting", "selection")

    def __init__(self, UCM, URM_train, test_playlists_indices, logFile, bestIndividualFile, mode="selection",
                 numGenerations=30, populationSize=30, initialRandomDistribution=np.random.uniform(0, 1),
                 verbose=True):
        """Store the data, open the output files and set the GA constants.

        Args:
            UCM: matrix whose columns are the features being searched; it is
                copied and column-scaled on every fitness evaluation.
            URM_train: matrix multiplied with the similarity to build the eurm.
            test_playlists_indices: array of row indices to evaluate; cast to
                integers.
            logFile: path of the per-generation statistics log (append mode).
            bestIndividualFile: path of the file receiving every new best
                individual (append mode).
            mode: ``"selection"`` or ``"weighting"``.
            numGenerations: number of generations ``main`` runs.
            populationSize: number of individuals in the population.
            initialRandomDistribution: zero-argument callable returning the
                initial value of one gene.  NOTE: the default expression is
                evaluated once at definition time and is therefore a plain
                float, not a callable; a non-callable value is replaced by a
                built-in sampler (see ``_attribute_sampler``).
            verbose: print timing/progress information during evaluation.

        Raises:
            ValueError: if ``mode`` is not one of the supported modes.
        """
        # Validate before touching the file system so that a bad call does not
        # leave half-initialised state (previously self.mode was silently left
        # unset and the failure only surfaced in setupParameters).
        if mode not in self._VALID_MODES:
            raise ValueError(
                "mode must be 'weighting' or 'selection', got {!r}".format(mode))
        self.mode = mode

        self.UCM = UCM
        self.URM_train = URM_train
        # np.int was only an alias of the builtin int and has been removed
        # from NumPy; the builtin gives the same dtype.
        self.test_playlists_indices = test_playlists_indices.astype(int)
        self.logFile = open(logFile, "a")
        self.bestIndividualFile = open(bestIndividualFile, "a")
        self.initialRandomDistribution = initialRandomDistribution
        self.verbose = verbose

        # Best fitness seen so far across generations.
        self.top = 0

        # Number of fitness evaluations performed in the current run.
        self.current = 0

        self.evaluator = Evaluator(Datareader(mode='offline', only_load=True, verbose=False))

        # One gene per UCM column.
        self.NUM_VARIABLES = UCM.shape[1]

        # Crossover probability
        self.CXPB = 0.5

        # Mutation probability
        self.MUTPB = 0.2

        # Number of generations for which the evolution runs
        self.NGEN = numGenerations

        self.POPULATION_SIZE = populationSize

    def close(self):
        """Close the log file and the best-individual file."""
        for handle in (self.logFile, self.bestIndividualFile):
            handle.close()

    def writeOnLogFile(self, stringToLog):
        """Append ``stringToLog`` plus a newline to the log file and flush."""
        self.logFile.write(stringToLog + "\n")
        self.logFile.flush()

    def writeOnBestIndividualFile(self, stringToLog):
        """Append ``stringToLog`` plus a newline to the best-individual file and flush."""
        self.bestIndividualFile.write(stringToLog + "\n")
        self.bestIndividualFile.flush()

    def isIndividualAccettable(self, individual):
        """Return whether the genes of ``individual`` sum to less than 10000.

        Acts as a cap on the number of selected features.  It is not wired
        into the toolbox by ``setupParameters``.
        """
        return np.sum(np.array(individual)) < 10000

    def _attribute_sampler(self):
        """Return the zero-argument callable used to draw one initial gene.

        A callable ``initialRandomDistribution`` is used as is.  Anything else
        (notably the constructor default, which is a float) cannot be
        registered in the toolbox, so a sampler matching the mode is used
        instead: 0/1 for "selection", uniform in [0, 1) otherwise.
        """
        sampler = self.initialRandomDistribution
        if callable(sampler):
            return sampler
        if self.mode == "selection":
            return lambda: random.randint(0, 1)
        return lambda: np.random.uniform(0, 1)

    @staticmethod
    def _fitness_stats(fits):
        """Return ``(mean, std)`` of a non-empty sequence of fitness values."""
        length = len(fits)
        mean = sum(fits) / length
        sum2 = sum(x * x for x in fits)
        # abs() guards against a tiny negative variance from rounding.
        std = abs(sum2 / length - mean ** 2) ** 0.5
        return mean, std

    def fitnessFunction(self, individual):
        """Evaluate one individual.

        Args:
            individual: sequence with one gene per UCM column, used as the
                per-column scaling factor.

        Returns:
            A 1-tuple ``(score,)`` where ``score`` is the mean of the
            track-level 'prec' metric for categories 1 and 2.
        """
        # Convert list into a numpy array
        individual = np.array(individual)

        # Scale a float copy of the UCM column by column; the stored UCM is
        # left untouched because the scaling is done in place on the copy.
        if self.verbose:
            print('Filtering UCM...')
        start = time.time()
        UCM_filtered = self.UCM.copy()
        UCM_filtered = UCM_filtered.astype(np.float64)
        inplace_csr_column_scale(UCM_filtered, individual)
        if self.verbose:
            print('UCM filtered in', time.time() - start, 'sec')

        # Compute similarity
        if self.verbose:
            print('Computing similarity...')
        start = time.time()
        similarity = tversky_similarity(UCM_filtered, shrink=200, alpha=0.1,
                                        beta=1, target_items=self.test_playlists_indices,
                                        binary=False)
        similarity = similarity.tocsr()
        if self.verbose:
            print('Similarity computed in', time.time() - start, 'sec')

        # Compute eurm
        if self.verbose:
            print('Computing eurm...')
        start = time.time()
        eurm = dot_product(similarity, self.URM_train, k=500)
        if self.verbose:
            print('eurm computed in', time.time() - start, 'sec')
            print('Converting eurm in csr...')
        start = time.time()
        eurm = eurm.tocsr()
        # Keep only the rows of the playlists under evaluation.
        eurm = eurm[self.test_playlists_indices, :]
        if self.verbose:
            print('eurm converted in', time.time() - start, 'sec')

        # Evaluate
        rec_list = eurm_to_recommendation_list(eurm)
        print('current', self.current)

        score_cat_1 = self.evaluator.evaluate_single_metric(rec_list, name='Genetic', metric='prec',
                                                            level='track', cat=1, verbose=False)
        score_cat_2 = self.evaluator.evaluate_single_metric(rec_list, name='Genetic', metric='prec',
                                                            level='track', cat=2, verbose=False)
        score = (score_cat_1 + score_cat_2) / 2

        self.current += 1

        if self.verbose:
            print(score)

        print("Numfeatures {}".format(np.sum(individual)))
        print('\n')

        # DEAP fitness values are tuples, hence the trailing comma.
        return score,

    def setupParameters(self):
        """Create the DEAP fitness/individual types and populate ``self.toolbox``."""
        creator.create("FitnessMax", base.Fitness, weights=(1.0,))
        creator.create("Individual", list, fitness=creator.FitnessMax)

        self.toolbox = base.Toolbox()

        # Attribute generator: one call yields the initial value of one gene.
        sampler = self._attribute_sampler()

        # Structure initializers: an individual consists of NUM_VARIABLES
        # genes, one per UCM column.
        if self.mode == "weighting":
            self.toolbox.register("attr_float", sampler)
            self.toolbox.register("individual", tools.initRepeat, creator.Individual,
                                  self.toolbox.attr_float, self.NUM_VARIABLES)

        elif self.mode == "selection":
            self.toolbox.register("attr_bool", sampler)
            self.toolbox.register("individual", tools.initRepeat, creator.Individual,
                                  self.toolbox.attr_bool, self.NUM_VARIABLES)

        # define the population to be a list of individuals
        self.toolbox.register("population", tools.initRepeat, list, self.toolbox.individual)

        # ----------
        # Operator registration
        # ----------
        # register the goal / fitness function
        self.toolbox.register("evaluate", self.fitnessFunction)

        # register the crossover operator
        self.toolbox.register("mate", tools.cxTwoPoint)

        # register a mutation operator with a per-gene probability of 0.05
        if self.mode == "weighting":
            self.toolbox.register("mutate", randomMutationCustom, indpb=0.05)

        elif self.mode == "selection":
            self.toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)

        # operator for selecting individuals for breeding the next
        # generation: each individual of the current generation
        # is replaced by the 'fittest' (best) of three individuals
        # drawn randomly from the current generation.
        self.toolbox.register("select", tools.selTournament, tournsize=3)

    def main(self):
        """Run the evolution for ``NGEN`` generations and report the best individual.

        Per-generation statistics go to the log file; every time the best
        fitness improves, the generation, the score and the individual are
        appended to the best-individual file.
        """
        self.start_time = time.time()

        # Seeds only the stdlib generator (crossover/mutation decisions).
        random.seed(64)

        self.setupParameters()

        self.writeOnLogFile(time.strftime("%Y-%m-%d %H:%M") + "\n")
        self.writeOnBestIndividualFile(time.strftime("%Y-%m-%d %H:%M") + "\n")

        # create an initial population of POPULATION_SIZE individuals
        pop = self.toolbox.population(n=self.POPULATION_SIZE)

        print("Start of evolution")

        self.current = 0

        # Evaluate the entire population
        fitnesses = list(map(self.toolbox.evaluate, pop))
        for ind, fit in zip(pop, fitnesses):
            ind.fitness.values = fit

        print("  Evaluated %i individuals" % len(pop))

        # Begin the evolution
        for g in range(self.NGEN):
            print("-- Generation %i --" % g)

            self.writeOnLogFile("-- Generation %i --" % g)

            # Select the next generation individuals
            offspring = self.toolbox.select(pop, len(pop))
            # Clone the selected individuals
            offspring = list(map(self.toolbox.clone, offspring))

            # Apply crossover on consecutive pairs of the offspring
            for child1, child2 in zip(offspring[::2], offspring[1::2]):

                # cross two individuals with probability CXPB
                if random.random() < self.CXPB:
                    self.toolbox.mate(child1, child2)

                    # fitness values of the children
                    # must be recalculated later
                    del child1.fitness.values
                    del child2.fitness.values

            for mutant in offspring:

                # mutate an individual with probability MUTPB
                if random.random() < self.MUTPB:
                    self.toolbox.mutate(mutant)
                    del mutant.fitness.values

            # Evaluate the individuals with an invalid fitness
            invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
            fitnesses = map(self.toolbox.evaluate, invalid_ind)
            for ind, fit in zip(invalid_ind, fitnesses):
                ind.fitness.values = fit

            print("  Evaluated %i individuals" % len(invalid_ind))

            # The population is entirely replaced by the offspring
            pop[:] = offspring

            # Gather all the fitnesses in one list and print the stats
            fits = [ind.fitness.values[0] for ind in pop]
            mean, std = self._fitness_stats(fits)
            lowest = min(fits)
            highest = max(fits)

            # Update top value
            if highest > self.top:
                self.top = highest

                # Record the new best individual
                best_ind = tools.selBest(pop, 1)[0]
                self.writeOnBestIndividualFile('GEN ' + str(g) + ' | ' + str(self.top))
                self.writeOnBestIndividualFile("%s" % best_ind + '\n')

            print("  Top %s" % self.top)
            print("  Min %s" % lowest)
            print("  Max %s" % highest)
            print("  Avg %s" % mean)
            print("  Std %s" % std)

            self.writeOnLogFile("  Top %s\n" % self.top +
                                "  Min %s\n" % lowest +
                                "  Max %s\n" % highest +
                                "  Avg %s\n" % mean +
                                "  Std %s\n" % std)

        print("-- End of (successful) evolution --")

        best_ind = tools.selBest(pop, 1)[0]
        print("Best individual is %s, %s" % (best_ind, best_ind.fitness.values[0]))

        print("Elapsed time" + str(time.time()-self.start_time))
# Evaluate one ensemble configuration given seven weights on the command line
# and remember it in best.npy when it beats the stored configuration.

# Ensemble weights: argv[1] .. argv[7], one float each.
weights = [float(sys.argv[i]) for i in range(1, 8)]
a, b, c, d, e, f, g = weights

res = ensembler(matrix, weights, normalization_type="max")

ev = Evaluator(dr)
# The metric is negated, so a lower value is a better configuration.
ret = [
    -ev.evaluate_single_metric(eurm_to_recommendation_list(res, cat=cat),
                               cat=cat,
                               name="ens" + str(cat),
                               metric='prec',
                               level='track')
]

# Stored record: the raw command-line arguments followed by the score.
# Mixing strings with a float makes np.save store an array of strings, which
# is why the previous score has to be parsed back with float() below.
record = sys.argv[1:]
record.append(ret[0])

improved = True
if os.path.isfile("best.npy"):
    best = np.load("best.npy")
    # np.float has been removed from NumPy; the builtin float parses the
    # stored string scalar just the same.
    improved = ret[0] < float(best[-1])

if improved:
    np.save("best", record)
class Optimizer(object):
    """Gaussian-process search for the weights of a linear ensemble.

    One weight per input matrix is searched in ``[start, end]`` with
    ``gp_minimize``.  The objective is the negated ``target_metric`` computed
    by the evaluator for a single category, so lower is better.
    """

    def __init__(self,
                 matrices_names,
                 matrices_array,
                 dr,
                 cat,
                 start,
                 end,
                 n_calls=1000,
                 n_random_starts=0.1,
                 n_points=50,
                 step=0.001,
                 verbose=True):
        """Prepare the normalised per-category matrices and the search settings.

        Args:
            matrices_names: one name per matrix; used as dimension names and
                as keys of the best-parameters dictionary.
            matrices_array: matrices to ensemble, in the same order as the names.
            dr: datareader handed to ``Evaluator`` and ``eurm_remove_seed``.
            cat: category; rows ``(cat - 1) * 1000 .. cat * 1000`` of every
                matrix are used.
            start: lower bound of every weight.
            end: upper bound of every weight.
            n_calls: number of objective evaluations.
            n_random_starts: fraction of ``n_calls`` passed to the optimiser
                as its number of random starts.
            n_points: forwarded to ``gp_minimize`` as ``n_points``.
            step: forwarded to ``gp_minimize`` as ``xi``.
            verbose: print every evaluation, not only the new bests.
        """
        self.target_metric = 'ndcg'
        self.best_score = 0
        self.best_params = 0
        # Filled in by obiettivo on the first improvement.
        self.best_params_dict = None
        self.norm = norm_max_row
        self.verbose = verbose

        # Use a tenth of the available cores, but at least one.
        self.n_cpu = max(1, multiprocessing.cpu_count() // 10)

        # Do not edit
        self.start = start
        self.end = end
        self.cat = cat
        self.global_counter = 0
        self.start_index = (cat - 1) * 1000
        self.end_index = cat * 1000
        self.matrices_names = matrices_names
        self.n_calls = n_calls
        self.x0 = None
        self.y0 = None
        self.n_random_starts = int(n_calls * n_random_starts)
        self.n_points = n_points
        self.step = step
        # memory_on_disk= False
        self.memory_on_notebook = True
        # The datareader is only needed to build the evaluator and to strip
        # the seeds; it is deliberately not kept as an attribute.
        self.ev = Evaluator(dr)

        self.matrices_array = [
            self.norm(
                eurm_remove_seed(
                    matrix,
                    datareader=dr)[self.start_index:self.end_index])
            for matrix in matrices_array
        ]

    def run(self):
        """Run the optimisation and store its result in ``self.res``."""
        # Start every run with an empty evaluation history.
        self.x0 = None
        self.y0 = None
        space = [
            Real(self.start, self.end, name=x) for x in self.matrices_names
        ]
        self.res = gp_minimize(self.obiettivo,
                               space,
                               base_estimator=None,
                               n_calls=self.n_calls,
                               n_random_starts=self.n_random_starts,
                               acq_func='gp_hedge',
                               acq_optimizer='auto',
                               x0=self.x0,
                               y0=self.y0,
                               random_state=None,
                               verbose=self.verbose,
                               callback=None,
                               n_points=self.n_points,
                               n_restarts_optimizer=10,
                               xi=self.step,
                               kappa=1.96,
                               noise='gaussian',
                               n_jobs=self.n_cpu)

    def _record_evaluation(self, x, ris):
        """Append the point ``x`` and its objective value ``ris`` to the history."""
        if self.x0 is None:
            # Same nesting as the later appends: a flat list of points.
            self.x0 = [x]
            self.y0 = [ris]
        else:
            self.x0.append(x)
            self.y0.append(ris)

    def obiettivo(self, x):
        """Objective: negated target metric of the ensemble weighted by ``x``.

        Args:
            x: one weight per matrix, in the order of ``matrices_names``.

        Returns:
            The negated metric value (lower is better).
        """
        eurm = sum(x[i] * matrix
                   for i, matrix in enumerate(self.matrices_array))

        # real objective function
        ris = -self.ev.evaluate_single_metric(eurm_to_recommendation_list(
            eurm, cat=self.cat, remove_seed=False, verbose=False),
                                              verbose=False,
                                              cat=self.cat,
                                              name="ens" + str(self.cat),
                                              metric=self.target_metric,
                                              level='track')
        # memory variables
        self._record_evaluation(x, ris)

        self.global_counter += 1
        if ris < self.best_score:
            print("[NEW BEST]")
            self.pretty_print(ris, x)
            self.best_score = ris
            self.best_params = x.copy()
            self.best_params_dict = dict(zip(self.matrices_names, x.copy()))
            # Persist [parameters, score]; any previous file is overwritten.
            save_obj([self.best_params_dict, ris], "best/cat" + str(self.cat))
        elif self.verbose:
            self.pretty_print(ris, x)

        return ris

    def pretty_print(self, ris, x):
        """Print category, iteration, result and the named weights on one line."""
        print("CAT:",
              self.cat,
              "ITER:",
              self.global_counter,
              "RES:",
              ris,
              end="\tvals:\t")
        for i, value in enumerate(x):
            print(self.matrices_names[i], "%.4f" % value, end="\t")
        print()
        print("-" * 80)
        print()