Example no. 1
    def do_iteration(self) -> bool:
        """
        Do one iteration.

        :return: True if the learning loop is done, False otherwise.
        """
        logger.info(f"Iteration {self.iteration}")
        done = self._reset_and_check_if_done()
        if done:
            return True

        (
            chosen_candidate_node,
            biggest_multiset,
        ) = self.compute_multisets_and_get_biggest()
        if sum(biggest_multiset.values()) == 0:
            logger.info("Biggest multiset has cardinality 0, done")
            return True

        non_distinct_vertices = self._compute_non_distinct_vertices(
            chosen_candidate_node)
        self._add_new_state_or_edge(chosen_candidate_node,
                                    non_distinct_vertices)
        # end of iteration
        self.iteration += 1
        done = self.iteration >= self.iteration_upper_bound
        return done
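
For context, a minimal sketch of the driver this method implies: call do_iteration repeatedly until it reports completion. The learn method name matches its usage in Example no. 3; _build_pdfa is a hypothetical finalization step, assumed for illustration.

    def learn(self) -> PDFA:
        """Run do_iteration until it signals that learning is done (sketch)."""
        done = False
        while not done:
            done = self.do_iteration()  # one refinement step per call
        return self._build_pdfa()  # hypothetical: assemble the learnt PDFA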
Example no. 2
def _compute_N(params: PalmerParams, m0: int):
    """Compute N."""
    eps = params.epsilon
    delta = params.delta_1
    n = params.n
    s = params.alphabet_size

    N1 = 8 * (n ** 2) * (s ** 2) / (eps ** 2) * (log((2 ** (n * s)) * n * s / delta))
    N2 = 4 * m0 * n * s / eps
    N = ceil(max(N1, N2))
    logger.info(f"N1 = {N1}, N2 = {N2}. Chosen: {N}")
    return N
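
A back-of-the-envelope check of the two bounds, with small parameter values made up purely for illustration (not from the source); for these values the logarithmic bound N1 dominates:

from math import ceil, log

eps, delta, n, s, m0 = 0.1, 0.1, 5, 2, 100  # illustrative values only
N1 = 8 * n**2 * s**2 / eps**2 * log(2 ** (n * s) * n * s / delta)
N2 = 4 * m0 * n * s / eps
print(N2, ceil(max(N1, N2)))  # 40000.0 922932 -- N1 dominates here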
Example no. 3
def learn_pdfa(**kwargs) -> PDFA:
    """
    PAC-learn a PDFA.

    This is a wrapper around the 'Learner' class, defined below.

    :param kwargs: the keyword arguments of the algorithm (see the BalleParams class).
    :return: the learnt PDFA.
    """
    params = BalleParams(**kwargs)
    logger.info(f"Parameters: {pprint.pformat(str(params))}")
    automaton = Learner(params).learn()
    return automaton
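
A hypothetical invocation illustrating the kwargs-forwarding pattern; my_generator is a placeholder, and nb_samples is assumed to be a BalleParams field because it appears as a params attribute in Example no. 4:

pdfa = learn_pdfa(
    sample_generator=my_generator,  # hypothetical generator object
    nb_samples=20000,               # assumed BalleParams field (see Example no. 4)
)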
Example no. 4
    def _sample_and_update(self):
        """Sample (or load) the dataset and update the root multiset."""
        logger.info("Generating the sample.")
        if self.params.sample_generator:
            generator = self.params.sample_generator
            samples = generator.sample(n=self.params.nb_samples)
            samples = list(map(tuple, samples))
        else:
            samples = self.params.dataset
        self.average_trace_length = sum(map(len, samples)) / len(samples)
        logger.info(f"Average trace length: {self.average_trace_length}.")
        logger.info("Populate root multiset.")
        self.main_multiset.update(samples)
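
The method above only assumes the generator object exposes sample(n) and returns words; a minimal sketch of that implied interface (the Protocol name is an assumption):

from typing import List, Protocol, Sequence

class SampleGenerator(Protocol):  # hypothetical name for the implied interface
    def sample(self, n: int) -> List[Sequence[int]]:
        """Draw n words from the target distribution."""
        ...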
Example no. 5
def learn_pdfa(**kwargs) -> PDFA:
    """
    PAC-learn a PDFA.

    :param kwargs: the keyword arguments of the algorithm (see the PalmerParams class).
    :return: the learnt PDFA.
    """
    params = PalmerParams(**kwargs)
    logger.info(f"Parameters: {pprint.pformat(str(params))}")
    vertices, transitions = learn_subgraph(params)
    logger.info(f"Number of vertices: {len(vertices)}.")
    logger.info(f"Transitions: {pprint.pformat(transitions)}.")
    pdfa = learn_probabilities((vertices, transitions), params)
    return pdfa
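
For reference, a reconstruction of PalmerParams from the attribute accesses in these snippets; the field names are grounded in the code, while the types and defaults are assumptions:

from dataclasses import dataclass
from typing import Optional

@dataclass
class PalmerParams:
    sample_generator: "SampleGenerator"  # source of sampled words
    alphabet_size: int                   # s, number of symbols
    n: int                               # bound on the number of states
    epsilon: float                       # PAC accuracy parameter
    delta_1: float                       # confidence for subgraph learning
    mu: float                            # distinguishability threshold
    m0_max_debug: Optional[int] = None   # optional debug cap on m0
    n1_max_debug: Optional[int] = None   # optional debug cap on N (subgraph)
    n2_max_debug: Optional[int] = None   # optional debug cap on N (probabilities)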
Example no. 6
def learn_probabilities(graph: Tuple[Set[int], Dict[int, Dict[int, int]]],
                        params: PalmerParams) -> PDFA:
    """
    Learn the probabilities of the PDFA.

    :param graph: the learned subgraph of the true PDFA.
    :param params: the parameters of the algorithms.
    :return: the PDFA.
    """
    logger.info("Start learning probabilities.")
    vertices, transitions = graph
    initial_state = 0
    N = _sample_size(params)
    logger.info(f"Sample size: {N}.")
    if params.n2_max_debug:
        N = min(N, params.n2_max_debug)
    logger.info(f"Using N = {N}.")
    generator = params.sample_generator
    sample = generator.sample(N)
    n_observations: Counter = Counter()
    for word in sample:
        current_state = initial_state
        for character in word:
            # update statistics
            n_observations.update([(current_state, character)])

            # compute next state
            next_state: Optional[int] = transitions.get(current_state,
                                                        {}).get(character)

            if next_state is None:
                break  # pragma: no cover
            current_state = next_state

    gammas: Dict[int, Dict[int, float]] = {}

    # compute number of times q is visited
    q_visits: Counter = Counter()
    for (q, _), counts in n_observations.items():
        q_visits[q] += counts
    # compute mean
    for (q, sigma), counts in n_observations.items():
        gammas.setdefault(q, {})[sigma] = counts / q_visits[q]
    # rescale so each state's out-probabilities sum to exactly 1 (floating-point safety)
    for _, out_probabilities in gammas.items():
        characters, probabilities = zip(*out_probabilities.items())
        probability_sum = math.fsum(probabilities)
        new_probabilities = [p / probability_sum for p in probabilities]
        out_probabilities.update(dict(zip(characters, new_probabilities)))

    # compute transition function for the PDFA
    transition_dict: TransitionFunctionDict = {}
    for q, out_transitions in transitions.items():
        transition_dict.setdefault(q, {})
        for sigma, q_prime in out_transitions.items():
            prob = gammas.get(q, {}).get(sigma, 0.0)
            transition_dict[q][sigma] = (q_prime, prob)

    logger.info(f"Computed vertices: {pprint.pformat(vertices)}")
    logger.info(
        f"Computed transition dictionary: {pprint.pformat(transition_dict)}")

    return PDFA(len(vertices), params.alphabet_size, transition_dict)
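
A tiny worked example of the frequency estimate computed above: gamma(q, sigma) is the number of times sigma was observed at q, divided by the total visits to q (the counts below are made up):

from collections import Counter

n_observations = Counter({(0, 0): 3, (0, 1): 1})  # toy counts for state 0
q_visits = Counter()
for (q, _), counts in n_observations.items():
    q_visits[q] += counts
gammas = {}
for (q, sigma), counts in n_observations.items():
    gammas.setdefault(q, {})[sigma] = counts / q_visits[q]
print(gammas)  # {0: {0: 0.75, 1: 0.25}}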
Example no. 7
def learn_subgraph(  # noqa: ignore
    params: PalmerParams,
) -> Tuple[Set[int], Dict[int, Dict[Character, int]]]:
    """
    Learn a subgraph of the true PDFA.

    :param params: the parameters of the algorithms.
    :return: the graph.
    """
    # unpack parameters
    generator = params.sample_generator
    mu = params.mu

    # initialize variables
    initial_state = 0
    vertices = {initial_state}
    transitions: Dict[int, Dict[Character, int]] = {}
    alphabet = set(range(params.alphabet_size))
    vertex2multiset: Dict[int, Counter] = {}

    m0 = _compute_m0(params)
    N = _compute_N(params, m0)
    logger.info(f"m0 = {m0}")
    logger.info(f"N = {N}")
    if params.m0_max_debug:
        m0 = min(m0, params.m0_max_debug)
    if params.n1_max_debug:
        N = min(N, params.n1_max_debug)
    logger.info(f"Using m0 = {m0}, N = {N}")

    samples = generator.sample(n=N)
    logger.info("Sampling done.")
    logger.info(f"Number of samples: {len(samples)}.")
    logger.info(f"Avg. length of samples: {sum(map(len, samples))/len(samples)}.")

    # multiset for initial state is the entire sample
    vertex2multiset[initial_state] = _compute_first_multiset(samples)

    done = False
    iteration = 0
    while not done:
        logger.info(f"Iteration {iteration}")

        candidate_nodes_by_transitions: Dict[Tuple[State, Character], int] = {}
        candidate_nodes_to_transitions: Dict[int, Tuple[State, Character]] = {}
        multisets: Dict[int, Counter] = {}

        for v in vertices:
            for c in alphabet:
                if transitions.get(v, {}).get(c) is None:  # if transition undefined
                    transition = (v, c)
                    new_candidate = len(vertices) + len(candidate_nodes_to_transitions)
                    candidate_nodes_to_transitions[new_candidate] = transition
                    candidate_nodes_by_transitions[transition] = new_candidate
                    multisets[new_candidate] = Counter()

        for s in samples:
            # s is always non-empty
            for i in range(len(s)):
                r, sigma, t = s[:i], s[i], s[i + 1 :]
                q = extended_transition_fun(transitions, r)
                if q is None:
                    continue
                transition = (q, sigma)
                if transition in candidate_nodes_by_transitions:
                    candidate_node = candidate_nodes_by_transitions[transition]
                    multisets[candidate_node].update([tuple(t)])

        chosen_candidate_node, biggest_multiset = max(
            multisets.items(), key=lambda x: sum(x[1].values())
        )
        cardinality = sum(biggest_multiset.values())
        if cardinality >= m0:
            # check if there is a similar vertex
            similar_vertex: Optional[int] = None
            for v in vertices:
                vertex_multiset = vertex2multiset[v]
                norm = l_infty_norm(biggest_multiset, vertex_multiset)
                if norm <= mu / 2.0:
                    similar_vertex = v
                    break

            if similar_vertex is not None:
                transition = candidate_nodes_to_transitions[chosen_candidate_node]
                u, sigma = transition
                transitions.setdefault(u, {})[sigma] = similar_vertex
            else:
                new_node = len(vertices)
                vertices.add(new_node)
                vertex2multiset[new_node] = biggest_multiset
                transition = candidate_nodes_to_transitions.pop(chosen_candidate_node)
                _tmp = candidate_nodes_by_transitions.pop(transition)
                assert chosen_candidate_node == _tmp
                u, sigma = transition
                transitions.setdefault(u, {})[sigma] = new_node

        if cardinality < m0:
            done = True
        iteration += 1

    # complete subgraph
    final_node = FINAL_STATE
    for vertex in vertices:
        transitions.setdefault(vertex, {})[FINAL_SYMBOL] = final_node

    logger.info(f"Vertices: {pprint.pformat(vertices)}")
    logger.info(f"Transitions: {pprint.pformat(transitions)}")
    logger.info(f"Computed final node: {final_node} (no outgoing transitions)")

    return vertices, transitions
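
Minimal sketches of the two helpers the loop relies on, reconstructed from their call sites (assumptions, not the original implementations): extended_transition_fun walks the learned transitions from the initial state, and l_infty_norm compares two empirical suffix distributions.

from collections import Counter
from typing import Dict, Optional, Sequence

Character = int  # assumed project alias

def extended_transition_fun(
    transitions: Dict[int, Dict[Character, int]], word: Sequence[Character]
) -> Optional[int]:
    """Follow word from the initial state; None if a transition is missing."""
    state = 0
    for symbol in word:
        next_state = transitions.get(state, {}).get(symbol)
        if next_state is None:
            return None
        state = next_state
    return state

def l_infty_norm(multiset1: Counter, multiset2: Counter) -> float:
    """L-infinity distance between the two empirical distributions."""
    total1, total2 = sum(multiset1.values()), sum(multiset2.values())
    return max(
        abs(multiset1[k] / total1 - multiset2[k] / total2)
        for k in set(multiset1) | set(multiset2)
    )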