def test_score_bad_match_same_day_run(self):
    """ Tests that messages with mutually exclusive uids on the same day are scored lowly """
    # uid, risk, day, true sender id
    current_day = 0
    message1 = Message(0, 0, current_day, "human:0")
    message2 = Message(1, 0, current_day, "human:1")
    clusters = Clusters()
    clusters.add_messages([encode_message(message1)], current_day)
    best_cluster, best_message, best_score = clusters.score_matches(message2, current_day)
    self.assertEqual(best_score, -1)
    self.assertEqual(message1, best_message)
def test_add_message_to_cluster_same_cluster_run(self):
    """ Tests that add_messages adds messages with the same uid on the same day to the same cluster. """
    # seed the clusters with an initial message
    message = Message(0, 0, 0, "human:1")
    clusters = Clusters()
    clusters.add_messages([encode_message(message)], 0)
    # make a new message with the same uid on the same day
    new_message = Message(0, 0, 0, "human:1")
    # add the message to the clusters
    clusters.add_messages([encode_message(new_message)], 0)
    self.assertEqual(len(clusters), 1)
def test_score_bad_match_one_day_run(self):
    """ Tests that messages with mutually exclusive uids separated by a day are scored lowly """
    # uid, risk, day, true sender id
    message1 = Message(0, 0, 0, "human:1")
    message2 = Message(6, 0, 1, "human:1")
    clusters = Clusters()
    clusters.add_messages([encode_message(message1)], 0)
    best_cluster, best_message, best_score = clusters.score_matches(message2, 1)
    self.assertEqual(best_cluster, 0)
    self.assertEqual(best_message, message1)
    self.assertEqual(best_score, -1)
def test_purge(self):
    """ Tests the purge functionality """
    message1 = Message(0, 0, 0, "human:0")
    message2 = Message(15, 0, 1, "human:0")
    clusters = Clusters()
    clusters.add_messages([encode_message(message1)], 0)
    clusters.add_messages([encode_message(message2)], 0)
    clusters.purge(13)
    self.assertEqual(len(clusters), 2)
    clusters.purge(14)
    self.assertEqual(len(clusters), 1)
    clusters.purge(15)
    self.assertEqual(len(clusters), 0)
def test_score_good_match_same_day_run(self):
    """ Tests that messages with the same uid on the same day are scored highly """
    # uid, risk, day, true sender id
    current_day = 0
    message1 = Message(0, 0, current_day, "human:1")
    message2 = Message(0, 0, current_day, "human:1")
    clusters = Clusters()
    clusters.add_messages([encode_message(message1)], current_day)
    best_cluster, best_message, best_score = clusters.score_matches(message2, current_day)
    self.assertEqual(best_cluster, 0)
    self.assertEqual(best_message, message1)
    self.assertEqual(best_score, 3)
def test_add_message_to_cluster_new_cluster_run(self):
    """ Tests that adding a message with a different uid on the same day creates a new cluster. """
    # seed the clusters with an initial message
    message = Message(0, 0, 0, "human:1")
    clusters = Clusters()
    clusters.add_messages([encode_message(message)], 0)
    # make a new message with a different uid
    new_message = Message(1, 0, 0, "human:1")
    # add the message to the clusters
    clusters.add_messages([encode_message(new_message)], 0)
    num_clusters = len(clusters)
    self.assertEqual(num_clusters, 2)
def test_score_good_match_one_day_run(self):
    """ Tests that messages with similar uids on adjacent days are scored moderately """
    # uid, risk, day, true sender id
    current_day = 0
    clusters = Clusters()
    message1 = Message(0, 0, 0, "human:1")
    clusters.add_messages([encode_message(message1)], current_day)
    message2 = Message(1, 0, 1, "human:1")
    best_cluster, best_message, best_score = clusters.score_matches(message2, 1)
    self.assertEqual(best_cluster, 0)
    self.assertEqual(best_message, message1)
    self.assertEqual(best_score, 2)
def update_records(self, update_messages, human):
    if not update_messages:
        return self
    grouped_update_messages = self.group_by_received_at(update_messages)
    for received_at, update_messages in grouped_update_messages.items():
        # num days x num clusters
        cluster_cards = np.zeros((max(self.clusters_by_day.keys()) + 1, max(self.clusters.keys()) + 1))
        update_cards = np.zeros((max(self.clusters_by_day.keys()) + 1, 1))

        # figure out the cardinality of each day's message set
        for day, clusters in self.clusters_by_day.items():
            for cluster_id, messages in clusters.items():
                cluster_cards[day][cluster_id] = len(messages)
        for update_message in update_messages:
            update_cards[update_message.day] += 1

        # find the clusters whose per-day cardinalities exactly match the update messages
        perfect_signatures = np.where((cluster_cards == update_cards).all(axis=0))[0]
        if perfect_signatures.size == 0:
            # no exact match: calculate the wasserstein distance between every signature
            scores = []
            for cluster_idx in range(cluster_cards.shape[1]):
                scores.append(dist(cluster_cards[:, cluster_idx], update_cards.reshape(-1)))
            best_cluster = int(np.argmin(scores))

            # for each day, add or remove messages until this cluster's cardinality matches
            # the cardinality implied by the update_messages
            for day in range(len(update_cards)):
                cur_cardinality = int(cluster_cards[day, best_cluster])
                target_cardinality = int(update_cards[day])
                while cur_cardinality != target_cardinality:
                    # if we need to remove messages from this cluster on this day,
                    if cur_cardinality > target_cardinality:
                        best_score = -1
                        best_message = None
                        new_cluster_id = None
                        # then for each message in that day/cluster,
                        for message in self.clusters_by_day[day][best_cluster]:
                            # and for each alternative cluster on that day,
                            for cluster_id, messages in self.clusters_by_day[day].items():
                                if cluster_id == best_cluster:
                                    continue
                                for candidate_cluster_message in messages:
                                    # check whether that cluster is a good destination for this message
                                    score = self.score_two_messages(decode_message(candidate_cluster_message), message)
                                    if score > best_score or not best_message:
                                        best_score = score
                                        best_message = message
                                        new_cluster_id = cluster_id
                        # if there are no other clusters on that day, make a new cluster
                        if not best_message:
                            best_message = message
                            message = decode_message(message)
                            new_cluster_id = hash_to_cluster(message)
                        best_message = decode_message(best_message)
                        # move the message which best fits another cluster into that cluster
                        self.update_record(best_cluster, new_cluster_id, best_message, best_message)
                        cur_cardinality -= 1
                    # otherwise we need to add messages to this cluster/day
                    else:
                        # grab an update message for this day, then look for a closely matching
                        # message in another cluster and move it into the best cluster
                        for update_message in update_messages:
                            if update_message.day == day:
                                break
                        best_score = -2
                        best_message = None
                        old_cluster_id = None
                        for cluster_id, messages in self.clusters_by_day[day].items():
                            for message in messages:
                                score = self.score_two_messages(update_message, message)
                                if score > best_score and cluster_id != best_cluster:
                                    best_score = score
                                    best_message = message
                                    old_cluster_id = cluster_id
                        best_message = decode_message(best_message)
                        updated_message = Message(best_message.uid, update_message.new_risk, best_message.day, best_message.unobs_id)
                        self.update_record(old_cluster_id, best_cluster, best_message, updated_message)
                        cur_cardinality += 1
        else:
            # an exact cardinality match exists: pick the best of the perfectly matching clusters
            best_cluster = self.score_clusters(update_messages, perfect_signatures)
            for update_message in update_messages:
                best_score = -1
                best_message = self.clusters_by_day[update_message.day][best_cluster][0]
                for risk_message in self.clusters_by_day[update_message.day][best_cluster]:
                    score = self.score_two_messages(update_message, risk_message)
                    if score > best_score:
                        best_score = score
                        best_message = risk_message
                best_message = decode_message(best_message)
                updated_message = Message(best_message.uid, update_message.new_risk, best_message.day, best_message.unobs_id)
                self.update_record(best_cluster, best_cluster, best_message, updated_message)
    return self
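# Note: update_records compares per-day cardinality signatures with a `dist` helper that its
# comments describe as a Wasserstein distance; the helper itself is not shown in this excerpt.
# Below is a minimal sketch of what such a helper could look like, assuming SciPy is available;
# the name `dist` and the exact distance used here are assumptions, not confirmed by the source.
from scipy.stats import wasserstein_distance

def dist(cluster_signature, update_signature):
    """ Wasserstein distance between two per-day cardinality signatures. """
    return wasserstein_distance(cluster_signature, update_signature)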
def cur_message(self, day, RiskModel):
    """ Creates the current message for this user """
    message = Message(self.uid, RiskModel.quantize_risk(self.risk), day, self.name)
    return message