def do_iteration(self) -> bool:
    """Run one iteration of the learning loop.

    :return: True when the algorithm should stop (termination detected
        during this iteration, or the iteration upper bound reached),
        False when another iteration should follow.
    """
    logger.info(f"Iteration {self.iteration}")
    # early exit: reset per-iteration state and test for termination
    if self._reset_and_check_if_done():
        return True
    candidate, multiset = self.compute_multisets_and_get_biggest()
    if sum(multiset.values()) == 0:
        logger.info("Biggest multiset has cardinality 0, done")
        return True
    non_distinct = self._compute_non_distinct_vertices(candidate)
    self._add_new_state_or_edge(candidate, non_distinct)
    # bookkeeping: advance the counter and test the stopping bound
    self.iteration += 1
    return self.iteration >= self.iteration_upper_bound
def _compute_N(params: PalmerParams, m0: int) -> int:
    """Compute the sample size N for learning the subgraph.

    N = ceil(max(N1, N2)), with (Palmer & Warmuth):
      N1 = 8 n^2 s^2 / eps^2 * log(2^(n*s) * n * s / delta_1)
      N2 = 4 m0 n s / eps

    :param params: the parameters of the algorithm.
    :param m0: the multiset-cardinality threshold.
    :return: the number of samples to draw.
    """
    eps = params.epsilon
    delta = params.delta_1
    n = params.n
    s = params.alphabet_size
    # Expand log(2**(n*s) * n*s/delta) = n*s*log(2) + log(n*s/delta) so we
    # never materialize the (potentially astronomically large) integer
    # 2**(n*s) just to take its logarithm.
    log_term = n * s * log(2) + log(n * s / delta)
    N1 = 8 * (n ** 2) * (s ** 2) / (eps ** 2) * log_term
    N2 = 4 * m0 * n * s / eps
    N = ceil(max(N1, N2))
    logger.info(f"N1 = {N1}, N2 = {N2}. Chosen: {N}")
    return N
def learn_pdfa(**kwargs) -> PDFA:
    """
    PAC-learn a PDFA.

    Thin wrapper around the 'Learner' class, defined below.

    :param kwargs: the keyword arguments of the algorithm
        (see the BalleParams class).
    :return: the learnt PDFA.
    """
    params = BalleParams(**kwargs)
    logger.info(f"Parameters: {pprint.pformat(str(params))}")
    return Learner(params).learn()
def _sample_and_update(self):
    """Draw the training sample and populate the root multiset.

    Uses the configured sample generator when one is set; otherwise falls
    back to the fixed dataset from the parameters. Also records the
    average trace length (used for logging/diagnostics).

    Note: assumes the resulting sample is non-empty — an empty sample
    would raise ZeroDivisionError on the average computation.
    """
    logger.info("Generating the sample.")
    if self.params.sample_generator:
        generator = self.params.sample_generator
        samples = generator.sample(n=self.params.nb_samples)
        # normalize traces to hashable tuples so they can live in a Counter
        samples = [tuple(sample) for sample in samples]
    else:
        samples = self.params.dataset
    self.average_trace_length = sum(map(len, samples)) / len(samples)
    logger.info(f"Average trace length: {self.average_trace_length}.")
    logger.info("Populate root multiset.")
    self.main_multiset.update(samples)
def learn_pdfa(**kwargs):
    """
    PAC-learn a PDFA.

    :param kwargs: the keyword arguments of the algorithm
        (see the PalmerParams class).
    :return: the learnt PDFA.
    """
    params = PalmerParams(**kwargs)
    logger.info(f"Parameters: {pprint.pformat(str(params))}")
    # phase 1: learn the transition structure
    vertices, transitions = learn_subgraph(params)
    logger.info(f"Number of vertices: {len(vertices)}.")
    logger.info(f"Transitions: {pprint.pformat(transitions)}.")
    # phase 2: estimate the transition probabilities on that structure
    return learn_probabilities((vertices, transitions), params)
def learn_probabilities(
    graph: Tuple[Set[int], Dict[int, Dict[int, int]]], params: PalmerParams
) -> PDFA:
    """
    Learn the probabilities of the PDFA.

    :param graph: the learned subgraph of the true PDFA
        (vertices, transitions).
    :param params: the parameters of the algorithms.
    :return: the PDFA.
    """
    logger.info("Start learning probabilities.")
    vertices, transitions = graph
    initial_state = 0
    N = _sample_size(params)
    logger.info(f"Sample size: {N}.")
    # optionally cap the sample size for debugging runs
    N = min(N, params.n2_max_debug if params.n2_max_debug else N)
    logger.info(f"Using N = {N}.")
    generator = params.sample_generator
    sample = generator.sample(N)

    # count (state, character) observations along each sampled word,
    # walking the learned transition structure
    n_observations: Counter = Counter()
    for word in sample:
        current_state = initial_state
        for character in word:
            n_observations[(current_state, character)] += 1
            next_state: Optional[int] = transitions.get(current_state, {}).get(character)
            if next_state is None:
                break  # pragma: no cover
            current_state = next_state

    # compute number of times each state q is visited
    q_visits: Counter = Counter()
    for (q, _), counts in n_observations.items():
        q_visits[q] += counts

    # empirical frequency of each (q, sigma) among visits to q
    gammas: Dict[int, Dict[int, float]] = {}
    for (q, sigma), counts in n_observations.items():
        gammas.setdefault(q, {})[sigma] = counts / q_visits[q]

    # rescale so each state's outgoing probabilities sum exactly to 1
    for out_probabilities in gammas.values():
        characters, probabilities = zip(*list(out_probabilities.items()))
        probability_sum = math.fsum(probabilities)
        new_probabilities = [p / probability_sum for p in probabilities]
        out_probabilities.update(dict(zip(characters, new_probabilities)))

    # assemble the PDFA transition function: q -> sigma -> (q', prob);
    # transitions never observed in the sample get probability 0.0
    transition_dict: TransitionFunctionDict = {}
    for q, out_transitions in transitions.items():
        transition_dict.setdefault(q, {})
        for sigma, q_prime in out_transitions.items():
            prob = gammas.get(q, {}).get(sigma, 0.0)
            transition_dict[q][sigma] = (q_prime, prob)

    logger.info(f"Computed vertices: {pprint.pformat(vertices)}")
    logger.info(
        f"Computed transition dictionary: {pprint.pformat(transition_dict)}")
    return PDFA(len(vertices), params.alphabet_size, transition_dict)
def learn_subgraph(  # noqa: ignore
    params: PalmerParams,
) -> Tuple[Set[int], Dict[int, Dict[Character, int]]]:
    """
    Learn a subgraph of the true PDFA.

    Iteratively grows a set of states: each round it collects, for every
    undefined (state, character) transition, the multiset of suffixes that
    reach it in the sample; the candidate with the largest multiset is
    either merged with an existing similar state (L-infinity distance at
    most mu/2) or promoted to a new state. Stops when the biggest
    candidate multiset falls below the threshold m0.

    :param params: the parameters of the algorithms.
    :return: the graph
    """
    # unpack parameters
    generator = params.sample_generator
    mu = params.mu

    # initialize variables
    initial_state = 0
    vertices = {initial_state}
    transitions: Dict[int, Dict[Character, int]] = {}
    alphabet = set(range(params.alphabet_size))
    vertex2multiset: Dict[int, Counter] = {}

    # theoretical thresholds, optionally capped by the *_max_debug knobs
    m0 = _compute_m0(params)
    N = _compute_N(params, m0)
    logger.info(f"m0 = {m0}")
    logger.info(f"N = {N}")
    m0 = min(m0, params.m0_max_debug if params.m0_max_debug else m0)
    N = min(N, params.n1_max_debug if params.n1_max_debug else N)
    logger.info(f"using m0 = {m0}, N = {N}")
    samples = generator.sample(n=N)
    logger.info("Sampling done.")
    logger.info(f"Number of samples: {len(samples)}.")
    logger.info(f"Avg. length of samples: {sum(map(len, samples))/len(samples)}.")

    # multiset for initial state is the entire sample
    vertex2multiset[initial_state] = _compute_first_multiset(samples)

    done = False
    iteration = 0
    while not done:
        logger.info(f"Iteration {iteration}")
        # one fresh candidate node per still-undefined (state, character)
        # transition, indexed both ways for lookup and reverse lookup
        candidate_nodes_by_transitions: Dict[Tuple[State, Character], int] = {}
        candidate_nodes_to_transitions: Dict[int, Tuple[State, Character]] = {}
        multisets: Dict[int, Counter] = {}
        for v in vertices:
            for c in alphabet:
                if transitions.get(v, {}).get(c) is None:
                    # if transition undefined
                    transition = (v, c)
                    # candidate ids continue after the existing vertex ids
                    new_candidate = len(vertices) + len(candidate_nodes_to_transitions)
                    candidate_nodes_to_transitions[new_candidate] = transition
                    candidate_nodes_by_transitions[transition] = new_candidate
                    multisets[new_candidate] = Counter()
        # fill each candidate's multiset with the suffixes that reach it
        for s in samples:
            # s is always non-empty
            for i in range(len(s)):
                # split the word as prefix r, current symbol sigma, suffix t
                r, sigma, t = s[:i], s[i], s[i + 1 :]
                q = extended_transition_fun(transitions, r)
                if q is None:
                    # prefix falls off the known subgraph — skip this split
                    continue
                transition = (q, sigma)
                if transition in candidate_nodes_by_transitions:
                    candidate_node = candidate_nodes_by_transitions[transition]
                    multisets[candidate_node].update([tuple(t)])
        # pick the candidate with the largest multiset cardinality
        chosen_candidate_node, biggest_multiset = max(
            multisets.items(), key=lambda x: sum(x[1].values())
        )
        cardinality = sum(biggest_multiset.values())
        if cardinality >= m0:
            # check if there is a similar vertex
            similar_vertex: Optional[int] = None
            for v in vertices:
                vertex_multiset = vertex2multiset[v]
                norm = l_infty_norm(biggest_multiset, vertex_multiset)
                if norm <= mu / 2.0:
                    similar_vertex = v
                    break
            if similar_vertex is not None:
                # merge: route the candidate's transition into the
                # existing similar vertex
                transition = candidate_nodes_to_transitions[chosen_candidate_node]
                u, sigma = transition
                transitions.setdefault(u, {})[sigma] = similar_vertex
            else:
                # promote: the candidate becomes a brand-new state
                new_node = len(vertices)
                vertices.add(new_node)
                vertex2multiset[new_node] = biggest_multiset
                transition = candidate_nodes_to_transitions.pop(chosen_candidate_node)
                _tmp = candidate_nodes_by_transitions.pop(transition)
                # both candidate maps must agree on the popped entry
                assert chosen_candidate_node == _tmp
                u, sigma = transition
                transitions.setdefault(u, {})[sigma] = new_node
        if cardinality < m0:
            # no candidate is statistically significant anymore — stop
            done = True
        iteration += 1

    # complete subgraph: every state gets a FINAL_SYMBOL edge to the
    # (sink) final node
    final_node = FINAL_STATE
    for vertex in vertices:
        transitions.setdefault(vertex, {})[FINAL_SYMBOL] = final_node
    logger.info(f"Vertices: {pprint.pformat(vertices)}")
    logger.info(f"Transitions: {pprint.pformat(transitions)}")
    logger.info(f"Computed final node: {final_node} (no outgoing transitions)")
    return vertices, transitions