Ejemplo n.º 1
0
 def test_score_bad_match_same_day_run(self):
     """
     Two messages with different uids on the same day must score -1 against each other.
     """
     # Message arguments: uid, risk, day, true sender id
     today = 0
     stored_message = Message(0, 0, today, "human:0")
     incoming_message = Message(1, 0, today, "human:1")

     clusters = Clusters()
     clusters.add_messages([encode_message(stored_message)], today)

     match = clusters.score_matches(incoming_message, today)
     _, matched_message, matched_score = match

     self.assertEqual(matched_score, -1)
     # the only stored message is still reported as the best candidate
     self.assertEqual(stored_message, matched_message)
Ejemplo n.º 2
0
    def test_add_message_to_cluster_same_cluster_run(self):
        """
        Adding a second message with the same uid, risk, day and sender keeps a single cluster.
        """
        clusters = Clusters()

        # seed the clusters with an initial message
        first_message = Message(0, 0, 0, "human:1")
        clusters.add_messages([encode_message(first_message)], 0)

        # add a message built from the same arguments
        duplicate_message = Message(0, 0, 0, "human:1")
        clusters.add_messages([encode_message(duplicate_message)], 0)

        cluster_count = len(clusters)
        self.assertEqual(cluster_count, 1)
Ejemplo n.º 3
0
 def test_score_bad_match_one_day_run(self):
     """
     Messages with uids 0 and 6, separated by one day, must score -1.
     """
     # Message arguments: uid, risk, day, true sender id
     stored_message = Message(0, 0, 0, "human:1")
     incoming_message = Message(6, 0, 1, "human:1")

     clusters = Clusters()
     clusters.add_messages([encode_message(stored_message)], 0)

     match = clusters.score_matches(incoming_message, 1)
     matched_cluster, matched_message, matched_score = match

     self.assertEqual(matched_cluster, 0)
     self.assertEqual(matched_message, stored_message)
     self.assertEqual(matched_score, -1)
Ejemplo n.º 4
0
    def test_purge(self):
        """Purging at days 13, 14 and 15 must leave 2, 1 and 0 entries respectively."""
        clusters = Clusters()
        day_zero_message = Message(0, 0, 0, "human:0")
        day_one_message = Message(15, 0, 1, "human:0")
        clusters.add_messages([encode_message(day_zero_message)], 0)
        clusters.add_messages([encode_message(day_one_message)], 0)

        # nothing is dropped at day 13
        clusters.purge(13)
        remaining = len(clusters)
        self.assertEqual(remaining, 2)

        # one entry is dropped at day 14
        clusters.purge(14)
        remaining = len(clusters)
        self.assertEqual(remaining, 1)

        # the last entry is dropped at day 15
        clusters.purge(15)
        remaining = len(clusters)
        self.assertEqual(remaining, 0)
Ejemplo n.º 5
0
 def test_score_good_match_same_day_run(self):
     """
     Two messages with the same uid on the same day must score 3.
     """
     # Message arguments: uid, risk, day, true sender id
     today = 0
     stored_message = Message(0, 0, today, "human:1")
     incoming_message = Message(0, 0, today, "human:1")

     clusters = Clusters()
     clusters.add_messages([encode_message(stored_message)], today)

     match = clusters.score_matches(incoming_message, today)
     matched_cluster, matched_message, matched_score = match

     self.assertEqual(matched_cluster, 0)
     self.assertEqual(matched_message, stored_message)
     self.assertEqual(matched_score, 3)
Ejemplo n.º 6
0
    def test_add_message_to_cluster_new_cluster_run(self):
        """
        Adding a message with a different uid on the same day must result in two clusters.
        """
        clusters = Clusters()

        # seed the clusters with an initial message (uid 0)
        first_message = Message(0, 0, 0, "human:1")
        clusters.add_messages([encode_message(first_message)], 0)

        # add a message that differs only in its uid (uid 1)
        other_uid_message = Message(1, 0, 0, "human:1")
        clusters.add_messages([encode_message(other_uid_message)], 0)

        self.assertEqual(len(clusters), 2)
Ejemplo n.º 7
0
    def test_score_good_match_one_day_run(self):
        """
        Messages with uids 0 and 1, one day apart, must score 2.
        """
        # Message arguments: uid, risk, day, true sender id
        first_day = 0
        stored_message = Message(0, 0, 0, "human:1")
        incoming_message = Message(1, 0, 1, "human:1")

        clusters = Clusters()
        clusters.add_messages([encode_message(stored_message)], first_day)

        match = clusters.score_matches(incoming_message, 1)
        matched_cluster, matched_message, matched_score = match

        self.assertEqual(matched_cluster, 0)
        self.assertEqual(matched_message, stored_message)
        self.assertEqual(matched_score, 2)
Ejemplo n.º 8
0
    def update_records(self, update_messages, human):
        """Apply a batch of risk-update messages to the stored clusters.

        The update messages are grouped with ``self.group_by_received_at`` and
        every group is processed independently:

        1. Count, per day, how many messages each cluster holds
           (``self.clusters_by_day``) and how many update messages the group
           contains.
        2. If at least one cluster has exactly the same per-day counts as the
           group, choose among those clusters with ``self.score_clusters``.
        3. Otherwise pick the cluster with the smallest ``dist`` to the group's
           counts and move messages into / out of it one at a time (through
           ``self.update_record``) until its per-day counts equal the group's.
        4. Finally, for every update message, rewrite the best-scoring message
           of the chosen cluster on that day with the update's ``new_risk``.

        Args:
            update_messages: update messages; each one must expose ``day`` and
                ``new_risk``. A falsy value makes the call a no-op.
            human: not referenced by this method.

        Returns:
            ``self``.
        """
        if not update_messages:
            return self
        grouped_update_messages = self.group_by_received_at(update_messages)
        for received_at, update_messages in grouped_update_messages.items():

            # num days x num clusters
            cluster_cards = np.zeros((max(self.clusters_by_day.keys())+1,  max(self.clusters.keys())+1))
            update_cards = np.zeros((max(self.clusters_by_day.keys())+1, 1))

            # figure out the cardinality of each day's message set
            for day, clusters in self.clusters_by_day.items():
                for cluster_id, messages in clusters.items():
                    cluster_cards[day][cluster_id] = len(messages)

            for update_message in update_messages:
                update_cards[update_message.day] += 1

            # find the clusters whose per-day cardinalities match exactly
            perfect_signatures = np.where((cluster_cards == update_cards).all(axis=0))[0]
            # perfect_signatures holds cluster *indices*, so its emptiness must be
            # tested explicitly: a truthiness test would treat a perfect match on
            # cluster 0 alone as "no match".
            if perfect_signatures.size == 0:
                # score every cluster signature against the update signature
                # (the original comment describes `dist` as a Wasserstein distance)
                scores = []
                for cluster_idx in range(cluster_cards.shape[1]):
                    scores.append(dist(cluster_cards[:, cluster_idx], update_cards.reshape(-1)))
                best_cluster = int(np.argmin(scores))

                # for each day
                for day in range(len(update_cards)):
                    cur_cardinality = int(cluster_cards[day, best_cluster])
                    target_cardinality = int(update_cards[day])

                    # if (and while) the cardinality is not what it should be, as determined by the update_messages
                    while cur_cardinality - target_cardinality != 0:
                        # if we need to remove messages from this cluster on this day,
                        if cur_cardinality > target_cardinality:
                            best_score = -1
                            best_message = None
                            new_cluster_id = None

                            # then for each message in that day/cluster,
                            for message in self.clusters_by_day[day][best_cluster]:
                                for cluster_id, messages in self.clusters_by_day[day].items():
                                    if cluster_id == best_cluster:
                                        continue

                                    # and for each alternative cluster on that day
                                    for candidate_cluster_message in messages:
                                        # check if it's a good cluster to move this message to
                                        score = self.score_two_messages(decode_message(candidate_cluster_message), message)
                                        if (score > best_score or not best_message):
                                            # remember the winning score so that later
                                            # candidates must actually beat it
                                            best_score = score
                                            best_message = message
                                            new_cluster_id = cluster_id

                            # if there are no other clusters on that day make a new cluster
                            if not best_message:
                                best_message = message
                                message = decode_message(message)
                                new_cluster_id = hash_to_cluster(message)
                            best_message = decode_message(best_message)

                            # for the message which best fits another cluster, move it there
                            self.update_record(best_cluster, new_cluster_id, best_message, best_message)
                            cur_cardinality -= 1

                        # otherwise we need to add messages to this cluster/day
                        else:
                            # pick the first update message for this day as the reference
                            for update_message in update_messages:
                                if update_message.day == day:
                                    break
                            best_score = -2
                            best_message = None
                            old_cluster_id = None
                            # look through the other clusters for the message that most
                            # closely matches the reference update message
                            # NOTE(review): here the stored message is scored without
                            # decode_message, unlike the removal branch - confirm intended.
                            for cluster_id, messages in self.clusters_by_day[day].items():
                                for message in messages:
                                    score = self.score_two_messages(update_message, message)
                                    if (score > best_score and cluster_id != best_cluster):
                                        best_score = score
                                        best_message = message
                                        old_cluster_id = cluster_id

                            # NOTE(review): best_message is still None here when no other
                            # cluster holds a message on this day - confirm decode_message
                            # copes with that.
                            best_message = decode_message(best_message)
                            updated_message = Message(best_message.uid, update_message.new_risk, best_message.day, best_message.unobs_id)
                            self.update_record(old_cluster_id, best_cluster, best_message, updated_message)
                            cur_cardinality += 1
            else:
                best_cluster = self.score_clusters(update_messages, perfect_signatures)
            # rewrite, for every update message, the best matching stored message
            for update_message in update_messages:
                best_score = -1
                # default to the first stored message in case nothing scores above -1
                best_message = self.clusters_by_day[update_message.day][best_cluster][0]
                for risk_message in self.clusters_by_day[update_message.day][best_cluster]:
                    score = self.score_two_messages(update_message, risk_message)
                    if score > best_score:
                        best_score = score
                        best_message = risk_message
                best_message = decode_message(best_message)
                updated_message = Message(best_message.uid, update_message.new_risk, best_message.day, best_message.unobs_id)
                self.update_record(best_cluster, best_cluster, best_message, updated_message)
        return self
 def cur_message(self, day, RiskModel):
     """Build the Message for this user on `day`, carrying the risk quantized by `RiskModel`."""
     sender_uid = self.uid
     quantized_risk = RiskModel.quantize_risk(self.risk)
     return Message(sender_uid, quantized_risk, day, self.name)