def _calc_embeddings(
    self, model_traces: List[List[str]], real_traces: List[List[str]]
) -> Tuple[np.ndarray, np.ndarray, Any]:
    """Train activity embeddings and prepare the inputs for trace comparison.

    An ``ActivityEmbeddingGenerator`` is built over the concatenation of both
    trace collections, trained explicitly, and then queried for the model
    embedding, the real embedding and the context it learned. The context is
    turned into a distance matrix via ``calc_euclidean`` and both embeddings
    are passed through ``_calc_d``.

    :param model_traces: The traces coming from the model.
    :param real_traces: The traces coming from the real log.
    :return: A tuple ``(model, real, dist_matrix)`` where ``model`` and
        ``real`` are the ``_calc_d`` results for the respective embeddings
        and ``dist_matrix`` is the output of ``calc_euclidean``.
    """
    # auto_train is disabled so that training is triggered explicitly below.
    generator = ActivityEmbeddingGenerator(
        model_traces + real_traces,
        act2vec_windows_size=self.window_size,
        num_ns=self.num_negative,
        auto_train=False,
        num_epochs=self.num_epochs,
        batch_size=self.batch_size,
        embedding_size=self.embedding_size,
    )
    generator.start_training()

    model_emb, real_emb, lookup = generator.get_activity_embedding(
        model_traces, real_traces, norm=True
    )

    dist_matrix = calc_euclidean(lookup)
    # Presumably the vocabulary size (one row per activity) - TODO confirm.
    n_rows = len(dist_matrix)

    return _calc_d(model_emb, n_rows), _calc_d(real_emb, n_rows), dist_matrix
def test_wmd():
    """Check ``calc_wmd`` on a minimal two-activity example."""
    # One trace per log: the model trace uses activities 0 and 1 once each,
    # the real trace uses only activity 1.
    model_traces = [{0: 1, 1: 1}]
    real_traces = [{1: 1}]
    context = np.array([[1, 4], [5, 1]])

    # Pairwise distances derived from the context rows.
    distances = calc_euclidean(context)

    # Both logs are converted with the same vocabulary length.
    vocab_len = len(context)
    model_d = _calc_d(model_traces, vocab_len)
    real_d = _calc_d(real_traces, vocab_len)

    result = calc_wmd(model_d[0], real_d[0], distances)
    assert result == pytest.approx(2.5)
def main():
    """Run a small worked example and print the WMD and ICT of two traces."""
    # Example input: each trace is a dict mapping an activity index to the
    # count of that activity within the trace.
    model_traces = [{0: 3, 1: 1, 2: 2}]
    real_traces = [{0: 2}]
    # Presumably one embedding vector (row) per activity - TODO confirm.
    context = np.array([[0.4, 0.3], [0.2, 0.6], [0.5, 0.9]])

    # Euclidean distance matrix computed from the context.
    distances = calc_euclidean(context)

    # Convert both example logs using the same vocabulary length.
    vocab_len = len(context)
    model_d = _calc_d(model_traces, vocab_len)
    real_d = _calc_d(real_traces, vocab_len)

    # Compare the first (and only) trace of each log.
    first_model = model_d[0]
    first_real = real_d[0]
    print("WMD: ", calc_wmd(first_model, first_real, distances))
    print("ICT: ", calc_ict(first_model, first_real, distances))