Beispiel #1
0
    def test_contains_int(self):
        """A Bureau answers membership checks made with a bare agent id."""
        bureau = Bureau(User)
        bureau.add(User(15))

        # Only the id that was actually registered should be found.
        assert 15 in bureau
        assert 16 not in bureau
Beispiel #2
0
    def test_add_agent(self):
        """Adding an agent makes it reachable by object, by id and via get."""
        bureau = Bureau(User)
        user = User(0)
        bureau.add(user)

        assert user in bureau
        assert 0 in bureau._agents
        assert bureau.get(0) == user
    def __init__(self, t_low, t_high, export=None):
        """
        Set up the bootstrap state.

        t_low: lower score threshold, stored as self.t_low
        t_high: upper score threshold, stored as self.t_high
        export: not used by this constructor
        """
        # Expert gold labels are fetched for the combined gold_0/gold_1 ids;
        # the silver labels start out as a copy of them.
        self.golds = db.getExpertGold(gold_0 + gold_1)
        self.silver = self.golds.copy()

        self.t_low, self.t_high = t_low, t_high

        self.bureau = Bureau(Bootstrap_Subject)
        self.metrics = Metrics()

        # Counter, starts at zero.
        self.n = 0
Beispiel #4
0
    def __init__(self):
        """
            Initialize SWAP instance

            Takes no arguments. Creates two empty bureaus: one managing
            the user (volunteer) agents and one managing the subject
            agents.
        """

        # One bureau per agent type; users first, then subjects.
        self.users, self.subjects = Bureau(User), Bureau(Subject)
Beispiel #5
0
    def test_notify(self, mock):
        """Notifying a ledger forwards to its transaction and marks it stale."""
        ledger = Ledger(15)
        transaction = Transaction(mock(), 0)
        print(mock().id)
        ledger.add(transaction)
        # Start from a clean slate so only the notify below is recorded.
        ledger.clear_changes()

        ledger.notify(16, Bureau(Subject))

        transaction.notify.assert_called_once_with(mock())
        assert ledger.stale is True
        assert ledger.changed == [16]
Beispiel #6
0
    def test_del_agent(self):
        """Removing an agent by id drops it from the bureau entirely."""
        bureau = Bureau(User)
        user = User(0)
        bureau.add(user)
        bureau.remove(0)

        assert user not in bureau
        assert 0 not in bureau._agents
Beispiel #7
0
    def test_bureau_stats(self):
        """Bureau stats for users expose exactly the entries 0 and 1."""
        bureau = Bureau(User)
        for agent_id in (0, 1, 2):
            bureau.add(User(agent_id))

        # Ledgers are recalculated before the stats are requested.
        for user in bureau:
            user.ledger.recalculate()

        result = bureau.stats()
        assert 0 in result.stats
        assert 1 in result.stats
        assert len(result.stats) == 2
Beispiel #8
0
    def test_export_contents(self):
        """A bureau export maps each id to that agent's own export."""
        bureau = Bureau(User)
        for agent_id in range(10):
            bureau.add(User(agent_id))

        dumped = bureau.export()
        pprint(dumped)

        # Spot-check two ids against the export of a freshly built user.
        assert bureau.export()[0] == User(0).export()
        assert bureau.export()[5] == User(5).export()
Beispiel #9
0
 def test_get_agent_none(self):
     """Looking up a missing id with make_new=False yields None."""
     bureau = Bureau(User)
     found = bureau.get(0, make_new=False)
     assert found is None
Beispiel #10
0
    def test_get_newagent_isadded(self):
        """An agent returned by get for an unknown id is kept in the bureau."""
        bureau = Bureau(User)
        created = bureau.get(15)

        assert created in bureau
        assert bureau.get(15) == created
Beispiel #11
0
 def test_stats_subject(self):
     """Stats can be computed for a bureau of subjects without raising."""
     bureau = Bureau(Subject)
     for subject_id in range(5):
         bureau.add(Subject(subject_id))
     for subject in bureau:
         subject.ledger.recalculate()
     bureau.stats()
Beispiel #12
0
class SWAP:
    """
        SWAP implementation, which calculates and updates a confusion matrix
        for each user as well as the probability that a particular subject
        contains an object of interest.

        See: Marshall et al. 2016: "Space Warps I: Crowd-sourcing the
        Discovery of Gravitational Lenses", MNRAS, 455, 1171
        (hereafter Marshall et al. 2016) for algorithm explanation.
    """
    def __init__(self):
        """
            Initialize SWAP instance

            Takes no arguments. Creates one empty bureau for user
            (volunteer) agents and one for subject agents.

            Notes
            -----
            p0 (prior probability that a subject is real) and epsilon
            (estimated volunteer performance) are not constructor
            arguments; this class reads them from ``config.p0`` and
            ``config.epsilon`` where it needs them.
        """

        # initialize bureaus to manage user / subject agents
        self.users = Bureau(User)
        self.subjects = Bureau(Subject)

    def classify(self, cl, subject=None, user=None):
        """
            Process a classification

            Parameters
            ----------
            cl : swap.utils.classification.Classification, dict
                Classification to be processed. Should be a Classification
                object, but will also accept a dict object to generate a
                new Classification object
            subject : boolean
                Deprecated
            user : boolean
                Deprecated

            Raises
            ------
            DeprecationWarning
                If either of the deprecated ``subject`` / ``user``
                arguments is supplied.
        """
        if not isinstance(cl, Classification):
            cl = Classification.generate(cl)

        if subject is not None or user is not None:
            raise DeprecationWarning(
                'controlling subject and user are no longer supported')

        # Look up the agents named in the classification
        subject = self.subjects.get(cl.subject)
        user = self.users.get(cl.user)

        # Without back-updating, the user's ledger is recalculated
        # before this classification is applied
        if not config.back_update:
            user.ledger.recalculate()

        subject.classify(cl, user)
        user.classify(cl, subject)

    def process_changes(self):
        """
        Process changes to agent ledgers

        While classifying, scores are not calculated; they are merely
        added to the ledger structures. Here the changes are committed and
        the new scores are calculated. This reduces processing time as the
        subject score calculation is dependent on the user confusion
        matrix. The user's confusion matrix is subject to change depending
        on the user's performance on gold standard subjects.

        First the users' confusion matrices are calculated based on their
        performance classifying gold standard subjects. If a user's scores
        have changed, then it notifies every subject agent it classified
        of this change.

        Then any subject agent which is connected to a user whose score has
        changed recalculates its score.
        """

        # A progress bar is only shown when back-updating is enabled
        with_bar = config.back_update

        # TODO make sure notify_agents is called on each ledger

        def run(bureau):
            # Commit the pending changes of one bureau, optionally with a
            # progress bar sized by the number of pending changes
            if with_bar:
                name = bureau.agent_type.class_name
                logger.info('processing %s score changes', name)
                with progressbar.ProgressBar(
                        max_value=bureau.calculate_changes()) as bar:
                    bar.update(0)
                    bureau.process_changes(bar)
                logger.info('done')

            else:
                bureau.process_changes()

        logger.info('Notifying user agents of subject changes')
        self.subjects.notify_changes(self.users)

        run(self.users)

        logger.info('Notifying subject agents of user changes')
        self.users.notify_changes(self.subjects)

        run(self.subjects)

    def set_gold_labels(self, golds, with_bar=True):
        """
            Defines the subjects explicitly that should be
            treated as gold standards

            Note: To get proper test/train split, the gold_labels
            still need to be stripped out of the classification dicts.
            This function is for defining all subjects that are
            gold on initialization

            Parameters
            ----------
            golds : dict
                (subject id : gold label) Mapping of subject to its gold label
            with_bar : bool
                Show a progress bar while gold labels are being removed
                from subjects that are not in ``golds``
        """
        # Removes gold label from all subjects not in the golds list
        logger.info('Processing gold labels')
        if with_bar:
            bar = progressbar.ProgressBar(max_value=len(self.subjects))
        for subject in self.subjects:
            if subject.id not in golds:
                # -1 is the label used for "not gold"
                subject.set_gold_label(-1, self.subjects, self.users)

            if with_bar:
                bar.update(bar.value + 1)
        # Assigns the new gold label to subjects in the list
        # Also tells the Bureau to make a new subject agent if it
        # doesn't exist yet
        for id_, gold in golds.items():
            subject = self.subjects.get(id_, make_new=True)
            subject.set_gold_label(gold, self.subjects, self.users)

    @property
    def golds(self):
        """
        Compile a list of all the subject -> gold mappings being used

        Returns
        -------
        dict
            {subject id: gold label}
        """
        return {
            subject.id: subject.gold
            for subject in self.subjects
            if subject.isgold()
        }

    # ----------------------------------------------------------------

    @property
    def stats(self):
        """
            Consolidate all the statistical data from the bureaus

            Empty bureaus are left out of the result.

            Returns
            -------
            swap.agents.agent.Stats
                Stats object containing statistical data on the
                confusion matrices and subject scores
        """
        stats = Stats()
        if len(self.users) > 0:
            stats.add('user', self.users.stats())
        if len(self.subjects) > 0:
            stats.add('subject', self.subjects.stats())

        return stats

    def stats_str(self):
        """
            Consolidate all the statistical data from the bureaus
            into a string

            Returns
            -------
            str
                Stats to string
        """
        return str(self.stats)

    def export(self):
        """
            Export both user and subject data

            Deprecated: always raises. It used to return a dict with the
            keys 'users', 'subjects' and 'stats'; that code sat after the
            raise and could never run.

            Raises
            ------
            DeprecationWarning
                Always.
        """
        raise DeprecationWarning

    def score_export(self, history=None):
        """
        Generate object containing subject score data

        Used in most of our plotting functions and other analysis tools

        Parameters
        ----------
        history : swap.utils.history.HistoryExport, optional
            History to attach to the export. Generated with
            history_export() when omitted.

        Returns
        -------
        swap.utils.scores.ScoreExport
            ScoreExport
        """
        if history is None:
            history = self.history_export()

        logger.info('Generating score export')
        scores = {}
        for subject in self.subjects:
            # Subjects with an empty ledger are skipped
            if len(subject.ledger) == 0:
                continue
            id_ = subject.id
            scores[id_] = Score(id_, None, subject.score)

        logger.debug('done')
        return ScoreExport(scores, history=history)

    def history_export(self):
        """
        Generate object containing subject score history

        Returns
        -------
        swap.utils.history.HistoryExport
            HistoryExport
        """
        logger.info('Generating history export')
        history = {}
        for subject in self.subjects:
            # Subjects with an empty ledger are skipped
            if len(subject.ledger) == 0:
                continue

            # Score list starts at the prior, followed by the score of
            # every transaction in ledger order
            scores = [config.p0]
            scores.extend(
                t.score
                for t in sorted(subject.ledger, key=lambda t: t.order))

            # Create History object
            id_ = subject.id
            history[id_] = History(id_, subject.gold, scores)

        logger.debug('done')
        return HistoryExport(history)

    def debug_str(self):
        """
        Dump every user and subject ledger into one string

        Returns
        -------
        str
            For each user its id and ledger, then for each subject its
            id, gold label and ledger
        """
        parts = []
        for u in self.users:
            parts.append('user %s\n' % str(u.id))
            parts.append('%s\n' % str(u.ledger))
        for a in self.subjects:
            parts.append('subject %s gold %d\n' % (str(a.id), a.gold))
            parts.append('%s\n' % str(a.ledger))
        return ''.join(parts)

    def manifest(self):
        """
        Generates a text manifest. Contains the p0 and epsilon
        parameters from config, the number of subjects per gold label,
        and the statistics of this SWAP instance.

        Returns
        -------
        str
            The manifest text
        """
        def countGolds():
            # Tally subjects per gold label. The label is used directly
            # as a list index, so label -1 lands in the last slot through
            # negative indexing.
            golds = [0, 0, 0]
            for subject in self.subjects:
                golds[subject.gold] += 1

            return tuple(golds)

        lines = [
            'SWAP manifest\n',
            '=============\n',
            'p0:         %f\n' % config.p0,
            'epsilon:    %f\n' % config.epsilon,
            '\n',
            'n golds:    %d %d %d\n' % countGolds(),
            '\n',
            'Statistics\n',
            '==========\n',
            str(self.stats) + '\n',
        ]

        return ''.join(lines)
Beispiel #13
0
 def test_contains_true(self):
     """An agent that was added is reported as contained."""
     bureau = Bureau(User)
     user = User(15)
     bureau.add(user)
     assert user in bureau
Beispiel #14
0
 def test_add_agent_twice_nonew(self):
     """Re-adding an agent with override=False raises KeyError."""
     bureau = Bureau(User)
     user = User(0)
     bureau.add(user)
     with pytest.raises(KeyError):
         bureau.add(user, override=False)
Beispiel #15
0
 def test_add_agent_typecheck(self):
     """A bureau of users rejects a Subject agent with TypeError."""
     bureau = Bureau(User)
     wrong_kind = Subject(0)
     with pytest.raises(TypeError):
         bureau.add(wrong_kind)
Beispiel #16
0
 def test_init(self):
     """A new bureau remembers its agent type and starts out empty."""
     bureau = Bureau(Agent)
     assert bureau.agent_type is Agent
     assert bureau._agents == {}
Beispiel #17
0
    def get_bureau(self):
        """Build a Subject bureau holding subjects with ids 0 through 4."""
        bureau = Bureau(Subject)
        for subject_id in range(5):
            bureau.add(Subject(subject_id))
        return bureau
Beispiel #18
0
 def test_stats_users(self):
     """Stats can be computed for a bureau of users without raising."""
     bureau = Bureau(User)
     for user_id in range(5):
         bureau.add(User(user_id))
     for user in bureau:
         user.ledger.recalculate()
     bureau.stats()
Beispiel #19
0
    def test_has_true(self):
        """Membership is true for an agent that has been added."""
        bureau = Bureau(User)
        user = User(0)
        bureau.add(user)

        assert user in bureau
Beispiel #20
0
    def test_has_false(self):
        """Membership is false for an id in an empty bureau."""
        empty_bureau = Bureau(User)

        assert 0 not in empty_bureau
Beispiel #21
0
 def test_add_agent_twice_new(self):
     """Re-adding an agent with override=True does not raise."""
     bureau = Bureau(User)
     user = User(0)
     bureau.add(user)
     bureau.add(user, override=True)
Beispiel #22
0
    def test_notify_agents_getsagent(self, mock):
        """notify_agents triggers the mocked call once per transaction."""
        ledger = Ledger(100)
        for i in range(5):
            ledger.add(Transaction(mocksubject(i), 0))

        ledger.notify_agents(None, Bureau(User))
        assert mock.call_count == 5
Beispiel #23
0
 def test_contains_false(self):
     """An agent that was never added is not reported as contained."""
     bureau = Bureau(User)
     outsider = User(15)
     assert outsider not in bureau
Beispiel #24
0
    def test_get_agent(self):
        """get returns the agent previously added under that id."""
        bureau = Bureau(User)
        user = User(0)
        bureau.add(user)

        assert bureau.get(0) == user
Beispiel #25
0
    def test_get_agent_new(self):
        """get on an unknown id produces a User carrying that id."""
        bureau = Bureau(User)
        created = bureau.get(15)

        assert type(created) is User
        assert created.id == 15
class Bootstrap:
    """
    Iterative driver around SWAP runs.

    Each round (see step) runs a BootstrapControl seeded with the current
    "silver" labels, then derives a new set of silver labels from the
    resulting subject scores: subjects scoring below t_low are labelled 0,
    subjects scoring above t_high are labelled 1, and expert gold labels
    always take precedence.
    """

    def __init__(self, t_low, t_high, export=None):
        """
        t_low: lower score threshold
        t_high: upper score threshold
        export: currently unused by this constructor
        """
        golds = gold_0 + gold_1
        self.golds = db.getExpertGold(golds)
        # Silver labels start out as a copy of the expert gold labels
        self.silver = self.golds.copy()
        self.t_low = t_low
        self.t_high = t_high

        # Tracks the score of every subject across rounds
        self.bureau = Bureau(Bootstrap_Subject)

        self.metrics = Metrics()

        # Number of rounds run so far
        self.n = 0

    def setThreshold(self, low, high):
        """
        Replace the score thresholds used by silver_update

        low: new lower threshold
        high: new upper threshold
        """
        self.t_low = low
        self.t_high = high

    def step(self):
        """
        Run one bootstrap round

        Runs a freshly generated control, updates the silver labels and
        the per-subject tracking from its export, and records a metric
        for the round.

        Returns the SWAP object produced by the control.
        """
        self.n += 1

        control = self.gen_control()
        control.run()

        swap = control.getSWAP()
        # Export once and reuse it for both updates
        export = swap.export()

        self.silver_update(export)
        self.update_tracking(export)

        self.addMetric(swap)

        return swap

    def update_tracking(self, export):
        """
        Append each subject's score from this round to its tracking agent

        export: (dict) must contain a 'subjects' mapping of subject id to
                an item with a 'score' entry
        """
        bureau = self.bureau
        for subject, item in export['subjects'].items():
            if subject in bureau:
                agent = bureau.getAgent(subject)
            else:
                # First time this subject is seen: start tracking it
                agent = Bootstrap_Subject(subject)
                bureau.addAgent(agent)

            agent.add(item['score'])

    def silver_update(self, export):
        """
        Recompute the silver labels from an export and store them

        Subjects without an expert gold label get label 0 when their
        score is below t_low and label 1 when it is above t_high;
        subjects in between get no label. Expert gold labels are then
        copied in unchanged.

        export: (dict) must contain a 'subjects' mapping of subject id to
                an item with a 'score' entry

        Returns the new silver mapping (also stored in self.silver).
        """
        low, high = self.t_low, self.t_high
        silver = {}

        for subject, item in export['subjects'].items():
            if subject in self.golds:
                continue

            score = item['score']
            if score < low:
                silver[subject] = 0
            elif score > high:
                silver[subject] = 1

        # Expert gold labels always win
        for subject, gold in self.golds.items():
            silver[subject] = gold

        self.silver = silver
        return silver

    def gen_control(self):
        """
        Create the control for the next round, seeded with the current
        silver labels as (subject, label) pairs
        """
        return BootstrapControl(self.silver.items())

    def export(self):
        """
        Export the tracking bureau
        """
        return self.bureau.export()

    def roc_export(self, i=None, labels=None):
        """
        Exports list of tuples (gold, score). Useful when
        generating roc curves or evaluating performance

        i: (int) round number (0 based)
        labels: (list) list of subject identifiers. Use when limiting
                which subjects are in export
        """

        # Get real gold labels from database
        bureau = self.bureau
        golds = db.getExpertGold()

        for id_, gold in golds.items():
            if id_ in bureau:
                bureau.getAgent(id_).gold = gold

        if labels:
            iter_ = bureau.iter_ids(labels)
        else:
            iter_ = bureau

        data = []
        for s in iter_:
            # Subjects with gold label -1 are left out
            if s.gold == -1:
                continue

            if i is None:
                data.append((s.gold, s.score))
            else:
                data.append((s.gold, s.getHistory()[i]))

        return data

    def addMetric(self, swap):
        """
        Creates a new metric for the current round from SWAP

        swap: (SWAP)
        """
        metric = Metric(self, swap, self.n)
        self.metrics.addMetric(metric)

    def getMetric(self, i):
        """
        Gets a metric for a round

        i: (int) round number of metric (0 based)
        """
        return self.metrics.get(i)

    def printMetrics(self):
        """
        Print num_silver() of every recorded metric
        """
        for m in self.metrics:
            print(m.num_silver())

    def manifest(self):
        """
        Generates a text manifest. Contains relevant information on the
        bootstrap run, including whatever parameters were used, and
        statistical information on each run.
        """
        lines = [
            'p0:         %f\n' % config.p0,
            'epsilon     %f\n' % config.epsilon,
            'iterations: %d\n' % self.n,
            'thresholds: %f < p < %f\n' % (self.t_low, self.t_high),
            '\n',
            str(self.metrics),
        ]

        return ''.join(lines)
Beispiel #27
0
 def test_idset(self):
     """idset returns the set of ids of all agents in the bureau."""
     bureau = Bureau(User)
     for user_id in range(5):
         bureau.add(User(user_id))
     assert bureau.idset() == {0, 1, 2, 3, 4}