Ejemplo n.º 1
0
def test_set_subtree_tuple():
    """Check that set_subtree accepts child_indices
    given as a tuple."""

    target = GP.Tree('(* (x0) (* (x0) (x0)))', actual_lisp=False)
    replacement = GP.Tree('(- (x0) (3))', actual_lisp=False)

    # Swap the node at path (1, 0) for the replacement tree.
    target.set_subtree(new_subtree=replacement.tree, child_indices=(1, 0))

    expected_lisp = '(* (x0) (* (- (x0) (3)) (x0)))'
    assert target.get_lisp_string() == expected_lisp
Ejemplo n.º 2
0
def test_convert_to_standard_10():
    """Check conversion of a single variable x_n
    whose index n has more than one digit (n >= 10)."""

    single_var_tree = GP.Tree('(x143)', num_vars=200)

    # No primitive-name mapping is supplied for this conversion.
    converted = single_var_tree.convert_lisp_to_standard(None)

    assert converted == 'x[143]'
Ejemplo n.º 3
0
def test_convert_to_standard():
    """Check conversion of a tree that consists of
    a single constant."""

    constant_tree = GP.Tree('(0)', actual_lisp=False)

    converted = constant_tree.convert_lisp_to_standard(None)

    # The expected string still references x[0] even though
    # the tree itself contains only the constant.
    expected = '0+0*x[0]'
    assert converted == expected
Ejemplo n.º 4
0
def test_select_subtree_tuple():
    """Check that select_subtree accepts child_indices
    given as a tuple."""

    three_var_tree = GP.Tree('(* (x0) (* (x1) (x2)))',
                             actual_lisp=False,
                             num_vars=3)

    # Path (1, 0): second child of the root, then its first child.
    selected = three_var_tree.select_subtree(child_indices=(1, 0))

    assert selected == ['x1']
Ejemplo n.º 5
0
def test_convert_to_standard_10_more_nodes():
    """Check conversion of x_n with n >= 10 when the
    tree has more than one node."""

    # Map the lisp primitive '*' to the name used in the output string.
    primitive_names = {'*': 'Mult'}

    product_tree = GP.Tree('(* x143 c3)', num_vars=200, actual_lisp=True)
    converted = product_tree.convert_lisp_to_standard(primitive_names)

    assert converted == 'Mult(x[143],c[3])'
Ejemplo n.º 6
0
def setup_individual():
    """Return a GP.Individual built with a RandomState
    seeded with 0, primitives '*', '+', '-', the '#x'
    terminal and AFSPO disabled."""

    seeded_rng = np.random.RandomState(0)

    return GP.Individual(seeded_rng,
                         primitive_set=['*', '+', '-'],
                         terminal_set=['#x'],
                         AFSPO=False)
Ejemplo n.º 7
0
def test_convert_to_standard_exponents_one_node():
    """Check that a one-element tree list holding an
    exponent expression is returned unchanged by the
    conversion."""

    exponent_expr = 'x[0]**2'
    one_node_tree = GP.Tree([exponent_expr])

    converted = one_node_tree.convert_lisp_to_standard({'*': 'Mult'})

    assert converted == 'x[0]**2'
Ejemplo n.º 8
0
def test_get_tree_size_special_counts2():
    """Check get_tree_size when the optional
    special_counts argument is supplied."""

    sized_tree = GP.Tree('(% (x0) (AQ (8) (2)))')

    # Both primitives in the tree are given a count of 8.
    size = sized_tree.get_tree_size(special_counts={'%': 8, 'AQ': 8})

    assert size == 19
Ejemplo n.º 9
0
    def is_equation(self, eq):
        """Decide whether eq is a usable equation.

        Every space-separated token of eq (parentheses removed) must be
        a primitive, a terminal or, when '#f' is in the terminal set,
        something float() can parse. The lisp is then turned into a
        function and evaluated once; on success the function is stored
        in self.f_hat.

        Parameters
        ----------
        eq : str
            Candidate equation written as a lisp (s-expression) string.

        Returns
        -------
         : bool
            True if eq passed the token check and could be built and
            evaluated; False if a token is unknown or if building or
            evaluating raised SyntaxError or ValueError.
        """

        constants_allowed = '#f' in self.terminal_set

        for raw_token in eq.split(' '):

            token = raw_token.replace('(', '').replace(')', '')

            if constants_allowed:
                try:
                    float(token)
                except ValueError:
                    # Not a number; fall through to the label check.
                    pass
                else:
                    # Numeric constants are always acceptable.
                    continue

            if not (token in self.primitive_set
                    or token in self.terminal_set):
                return False

        try:
            candidate = GP.Tree(rng=self.rng,
                                primitive_set=self.primitive_set,
                                terminal_set=self.terminal_set,
                                tree=eq,
                                actual_lisp=True,
                                num_vars=self.num_data_encoder_inputs - 1)

            function_body = (
                candidate.convert_lisp_to_standard_for_function_creation())

            # NOTE(review): eval runs whatever the converted lisp
            # contains; eq is assumed to come from a trusted source.
            f = eval('lambda x:' + function_body)

            # Try to evaluate it with every input set to 1.
            ones = [1] * (self.num_data_encoder_inputs - 1)
            f(ones)

        except (SyntaxError, ValueError):
            return False

        self.f_hat = f
        return True
Ejemplo n.º 10
0
def test_get_tree_size():
    """Check get_tree_size on a three-node tree."""
    print('test_get_tree_size')

    division_tree = GP.Tree(tree='(% (3) (x0))', actual_lisp=False)

    # One primitive plus its two leaf children.
    expected_size = 3
    assert division_tree.get_tree_size() == expected_size
Ejemplo n.º 11
0
def test_get_num_leaves():
    """Check get_num_leaves when the node count is
    requested as well."""

    nested_tree = GP.Tree('(- (3) (+ (x0) (3)))')

    leaves_and_nodes = nested_tree.get_num_leaves(return_num_nodes=True)

    # Three leaves, five nodes in total.
    assert leaves_and_nodes == (3, 5)
Ejemplo n.º 12
0
def build_tree(gene, return_short_gene=False):
    """Turn the list form of an equation into a lisp string.

    Parameters
    ----------
    gene : list
        The equation as a list of labels, read layer by layer.
        This is the typical form in GEP.
    return_short_gene : bool (default=False)
        If True, also return gene with the unused trailing
        elements removed.

    Returns
    -------
    lisp : str
        The lisp represented by gene
    short_gene (if return_short_gene=True) : list
        The prefix of gene made of the labels that were placed
        in a layer of the tree.
    """

    # Split gene into layers: layers[k] holds the labels
    # found at depth k of the tree.
    layers = [[gene[0]]]
    consumed = 1

    while consumed < len(gene):

        # The next layer needs one slot per child required by the
        # labels of the deepest layer so far.
        next_layer_size = sum(required_children[label]
                              for label in layers[-1]
                              if label in required_children)

        # Only terminals in the deepest layer: any remaining
        # elements of gene are unused.
        if next_layer_size == 0:
            break

        layers.append(gene[consumed:consumed + next_layer_size])
        consumed += next_layer_size

    # Relates each index of gene to the position of
    # that label in the tree.
    locations = get_location_map(gene, layers)

    # Invert that relation: position in the tree -> label.
    label_at = collections.OrderedDict(
        (locations[index], gene[index]) for index in range(len(locations)))

    # Build the tree with networkx. This can probably be
    # avoided by further modifying the nx.to_nested_tuple()
    # function.
    graph = nx.Graph()

    for position, label in label_at.items():
        graph.add_node(position, label=label)

        # Every position except the root () links to the
        # position obtained by dropping its last index.
        if position != ():
            graph.add_edge(position, position[:-1])

    s_expression = to_s_expression(graph, ())

    # No random numbers are drawn here, so rng is left unset.
    individual = GP.Individual(rng=None,
                               primitive_set=list(required_children.keys()),
                               terminal_set=['#x', '#f'],
                               tree=s_expression)
    lisp = individual.get_lisp_string(actual_lisp=True)

    if not return_short_gene:
        return lisp

    used_length = sum(len(layer) for layer in layers)
    return lisp, gene[:used_length]
Ejemplo n.º 13
0
    def rewrite_equation(self,
                         x,
                         y,
                         f_hat,
                         f_hat_seq,
                         initial_states=np.zeros(8)[None, :],
                         return_equation=False,
                         return_equation_str=False,
                         return_decoded_list=False):
        """Rewrite the equation using the network.

        The network output is decoded into a token list, turned into
        a lisp, checked with self.is_equation and scored with RMSE
        against (x, y).

        Side effects: sets self.f_hat, adds to self.effort and, when
        the 'use_k-expressions' and 'save_lisp_summary' options are
        both set, appends to self.summary_data. The method calls
        exit() if the decoded list still contains START, if no lisp
        could be built, or if the lisp is not an equation.

        Parameters
        ----------
        x : np.array
            The input data for the dataset. (one column for each input var)
        y : np.array
            The output data for the dataset. (one column)
        f_hat : function
            The current approximation of the dataset.
        f_hat_seq : list
            List of strings. Each string is a token
            in the equation.
        initial_states : np.array (default all zeros)
            The initial states of the encoder's hidden
            values. The default is a single (1, 8) array created
            once at definition time and shared between calls.
        return_equation : bool
            If true, return the function (self.f_hat) for the
            equation output by the network.
        return_equation_str : bool
            If true, return the string representation
            of the equation output by network.
        return_decoded_list : bool
            If true, return the output of the network after
            it has been converted to tokens.

        Returns
        -------
        output : dict
            Always contains 'error' and 'raw_decoded_list'.
            'equation', 'equation_str' and 'decoded_list' are added
            according to the return_... parameters. Network outputs
            beyond the first two are stored, in order, under
            'state_h1', 'state_h2' and 'encoder_output'.
        """

        self.f_hat = f_hat

        # Get input ready.
        if self.options['quick_gens']:
            # Use a random sample of 20 rows, taken in index order.
            dataset_indices = self.rng.choice(len(x), 20, replace=False)
            sorted_dataset_indices = sorted(dataset_indices)

            # Set of rows, each row being the x values followed by the
            # y values.
            get_row_set = lambda col1, col2: set([(*a, *b)
                                                  for a, b in zip(col1, col2)])
            # Sorting the indices must only reorder the sampled rows.
            assert get_row_set(
                x[dataset_indices], y[dataset_indices]
            ) == get_row_set(
                x[sorted_dataset_indices], y[sorted_dataset_indices]
            ), 'Not the same dataset (not just reordered)! If full dataset is not ordered, this makes sense'

            data_encoder_input_data = self.get_data_encoder_input_data(
                x[sorted_dataset_indices], y[sorted_dataset_indices], f_hat,
                self.num_data_encoder_inputs)
        else:
            data_encoder_input_data = self.get_data_encoder_input_data(
                x, y, f_hat, self.num_data_encoder_inputs)

        eq_encoder_input_data = self.get_eq_encoder_input_data(f_hat_seq)
        decoder_input_data = self.get_decoder_input_data()

        all_network_outputs = self.get_network_output(initial_states,
                                                      data_encoder_input_data,
                                                      eq_encoder_input_data,
                                                      decoder_input_data)

        # decoder output
        # NOTE(review): a length-1 result is used as the prediction
        # itself rather than indexed -- presumably a bare array with a
        # leading batch axis of 1; confirm against get_network_output.
        if len(all_network_outputs) == 1:
            prediction = all_network_outputs
        else:
            prediction = all_network_outputs[0]

        # With constants enabled, the second network output holds the
        # constant values.
        if self.use_constants:
            constant_value = all_network_outputs[1]

        # Name whatever outputs follow the first two; zip stops at the
        # shorter sequence, so missing outputs are simply left out.
        extra_outputs = all_network_outputs[2:]
        output = {
            key: value
            for key, value in zip(['state_h1', 'state_h2', 'encoder_output'],
                                  extra_outputs)
        }

        self.effort += self.effort_in_eval(
            eq_input_len=len(eq_encoder_input_data[0]),
            data_input_len=len(data_encoder_input_data[0]))

        if self.options['use_k-expressions']:

            # Add 2 to selected entries of the prediction, in place:
            # the terminal indices from position head_length onwards,
            # the not-start indices before that.
            for i, row in enumerate(prediction[0]):
                if i >= self.head_length:
                    prediction[0, i, self.terminal_indices] += 2.
                else:
                    prediction[0, i, self.not_start_indices] += 2.

        # decoded in terms of seq2seq network -- still a k-expression
        if self.use_constants:
            decoded_string = self.read_decoded_output(
                outputs=prediction, const_outputs=constant_value)

        else:
            decoded_string = self.read_decoded_output(outputs=prediction)

        decoded_list = decoded_string.split(' ')

        # If NN has output a STOP, ignore the rest
        # of the output.
        try:
            index = decoded_list.index('STOP')
            decoded_list = decoded_list[:index]

        except ValueError:
            # STOP not in decoded_list, so don't worry about removing it.
            pass

        # Remove START token
        decoded_list = decoded_list[1:]

        if not self.options['use_k-expressions']:
            # Penalty for the decoded list. Every branch below either
            # overwrites this value or exits.
            error = self.get_penalty(decoded_list,
                                     primitive_set=self.primitive_set,
                                     terminal_set=self.terminal_set)

        # nan's can appear in the output of the network
        # if inf's are subtracted. inf's can appear when
        # weights are too large, which is easier to do
        # with recurrence.
        if np.any(np.isnan(prediction)):
            error = float('inf')

        elif 'START' in decoded_list:
            print('ERROR: START is in decoded_list')
            print('decoded_list =', decoded_list)
            exit()

        # A decoded list containing START has exited above; from here
        # on the list is turned into a lisp and its actual error is
        # computed.
        elif 'START' not in decoded_list:

            if self.options['use_k-expressions']:

                # short_gene is not used afterwards.
                lisp, short_gene = build_tree(decoded_list,
                                              return_short_gene=True)

                if self.options['save_lisp_summary']:
                    self.summary_data.append(
                        self.get_lisp_summary(lisp, self.primitive_set,
                                              self.terminal_set))

            else:

                num_terminals = len(
                    [x for x in decoded_list if x in self.terminal_set])
                num_primitives = len(
                    [x for x in decoded_list if x in self.primitive_set])

                # Only attempt a conversion when there is exactly one
                # more terminal than primitives.
                if num_primitives + 1 != num_terminals:
                    lisp = None

                else:
                    # This value might be None if decoded_string is not a
                    # stripped_lisp
                    lisp = self.get_lisp_from_stripped_lisp(
                        decoded_list, self.primitive_set)

            if lisp is not None:
                if self.is_equation(lisp):

                    t = GP.Individual(rng=None,
                                      primitive_set=self.primitive_set,
                                      terminal_set=self.terminal_set,
                                      tree=lisp,
                                      actual_lisp=True,
                                      num_vars=self.num_data_encoder_inputs -
                                      1)

                    # if x is the wrong size, adjust
                    num_x_input = len(x[0])

                    if num_x_input < self.num_data_encoder_inputs - 1:
                        # Pad x with zero columns on the right.
                        x_adjusted = np.zeros(
                            (len(x), self.num_data_encoder_inputs - 1))
                        x_adjusted[:, :num_x_input] = x.copy()

                    else:
                        x_adjusted = x.copy()

                    # t.evaluate_fitness(dataset, compute_val_error=False)
                    f_string = t.convert_lisp_to_standard_for_function_creation(
                    )

                    self.f_hat = get_function(f_string)
                    error = RMSE(x=x_adjusted, y=y, f=self.f_hat)

                    # this will count only one tree because t is from Individual
                    # not IndividualManyTargetData
                    dataset = [cdff.combine_x_y(x_adjusted, y), []]
                    self.effort += t.get_effort_tree_eval(dataset)

                else:
                    print('ERROR: lisp is not an equation')
                    print('lisp =', lisp)
                    exit()

            else:
                print('ERROR: lisp is None')
                print('decoded_list =', decoded_list)
                exit()
        else:
            # Unreachable: the two START conditions above are
            # complementary.
            print('Should not get to this point in code')
            exit()

        output['error'] = error

        if return_equation:
            output['equation'] = self.f_hat

        if return_equation_str:
            # The condition is evaluated first, so lisp is not read
            # when the error is infinite.
            output['equation_str'] = lisp if error != float('inf') else None

        if return_decoded_list:
            output['decoded_list'] = decoded_list

        # Same list as 'decoded_list'; stored regardless of the
        # return_... parameters.
        output['raw_decoded_list'] = decoded_list

        return output
Ejemplo n.º 14
0
    def get_lisp_summary(lisp, primitive_set, terminal_set):
        """Count the tokens of a lisp and summarize its '-' nodes.

        Parameters
        ----------
        lisp : str
            The equation as a lisp string with space-separated tokens.
        primitive_set : list
            Labels of the primitives.
        terminal_set : list
            Labels of the terminals. If it contains '#f', every token
            that is not a known label is counted under '#f'.

        Returns
        -------
        counts : dict
            One entry per label in primitive_set + terminal_set with
            the number of occurrences in lisp, plus
            'unique subtrees under -' (number of '-' nodes whose two
            children have different lisp strings) and '- simplified'
            (number of '-' nodes, not inside an earlier such node, for
            which the zero check below succeeded).

        Raises
        ------
        KeyError
            If lisp contains a token that is not a known label and
            '#f' is not in terminal_set.
        """

        known_labels = primitive_set + terminal_set

        counts = {key: 0 for key in known_labels}
        counts['unique subtrees under -'] = 0
        counts['- simplified'] = 0

        count_unknown_as_constant = '#f' in terminal_set

        for token in lisp.split(' '):

            token = token.replace(')', '').replace('(', '')

            if count_unknown_as_constant and token not in known_labels:
                counts['#f'] += 1
            else:
                counts[token] += 1

        # '-' is not necessarily one of the labels, so it may have no
        # entry in counts at all; treat that as zero occurrences.
        if counts.get('-', 0) <= 0:
            return counts

        i = GP.Individual(rng=None,
                          primitive_set=primitive_set,
                          terminal_set=terminal_set,
                          tree=lisp,
                          actual_lisp=True)

        node_map = i.get_node_map()

        def order(tup):
            """Sort key for locations; the root () comes first."""
            # Compare by value: identity against a literal () is not
            # guaranteed and triggers a SyntaxWarning.
            if tup != ():
                return 2**len(tup) + int(''.join(map(str, tup)), base=2)
            return 1

        # keep track of zeros found
        zeros = []

        # Sample points handed to the candidate function; the same for
        # every location.
        x = np.linspace(-1, 1, 1000)

        # node_map['-'] = set of locations with - label
        for loc in sorted(node_map['-'], key=order):

            # check if loc is inside a zero
            inside = any(
                GP.Tree.is_elder(elder=z, child=loc) for z in zeros)

            # see if loc is a zero
            if not inside:
                # NOTE(review): subtree is selected but never used; the
                # function below is built from the whole individual,
                # not from this subtree -- confirm that is intended.
                subtree = i.select_subtree(loc)
                f_str = ('lambda x: ' +
                         i.convert_lisp_to_standard_for_function_creation())
                # NOTE(review): eval executes the converted lisp; lisp
                # is assumed to come from a trusted source.
                f = eval(f_str)
                if np.abs(f(x)) <= 10**(-9):
                    counts['- simplified'] += 1
                    zeros.append(loc)

            loc_left = (*loc, 0)
            loc_right = (*loc, 1)

            lisp_left = i.get_lisp_string(
                subtree=i.select_subtree(loc_left))
            lisp_right = i.get_lisp_string(
                subtree=i.select_subtree(loc_right))

            if lisp_left != lisp_right:
                counts['unique subtrees under -'] += 1

        return counts