Beispiel #1
0
    def get_classifications(self):
        """
        Build a combined cursor over dump and caesar classifications

        Returns
        -------
        DualCursor
            Wraps the cursor of the classifications collection and the
            cursor of the caesar collection, passed in that order
        """
        logger.info('Loading dual cursors of dump and caesar classifications')

        dump_cursor = DB().classifications.getClassifications()
        caesar_cursor = DB().caesar.getClassifications()
        return DualCursor(dump_cursor, caesar_cursor)
Beispiel #2
0
    def test_classify_rejects_reclassify(self, run):
        """
        classify() must return None for the mock classification here

        ``run`` is not used in the body; presumably it is a mock injected
        by a patch decorator outside this view -- confirm.
        """
        DB._reset()

        controller = control.OnlineControl()
        controller.init_swap()

        result = controller.classify(self.mock_classification)
        assert result is None
Beispiel #3
0
    def test_classify(self, run):
        """
        classify() must return a Subject carrying the expected score

        ``run`` is not used in the body; presumably it is a mock injected
        by a patch decorator outside this view -- confirm.
        """
        DB._reset()

        controller = control.OnlineControl()
        controller.init_swap()

        subject = controller.classify(self.mock_classification)

        assert isinstance(subject, swap.agents.subject.Subject)
        assert subject.score == 0.12
Beispiel #4
0
    def run(self, amount=None):
        """
        Delegate to the parent run() with a default classification count

        Parameters
        ----------
        amount : int, optional
            Number of classifications handed to the parent. When omitted
            it is the sum of the 'first_classifications' entries reported
            by the classifications stats and the caesar stats.
        """
        if amount is None:
            key = 'first_classifications'
            amount = DB().classifications.get_stats()[key]
            # caesar stats are generated with upload disabled
            amount += DB().caesar._gen_stats(upload=False)[key]

        super().run(amount=amount)
Beispiel #5
0
    def test_length(self):
        """
        A Cursor over a pipeline limited to 5 documents reports length 5
        """
        DB._reset()
        database = DB()._db
        pipeline = [{'$limit': 5}]

        cursor = Cursor(pipeline, database.classifications)
        # Advance the cursor once before measuring its length
        print(cursor.next())

        assert len(cursor) == 5
Beispiel #6
0
    def call(self, args):
        """
        Run the database actions selected by the parsed arguments

        Each option is checked independently, so several actions can run
        in one invocation: uploading a project dump, uploading a golds
        csv, and generating classification stats.
        """
        if args.upload_dump:
            dump_path = args.upload_dump[0]
            DB().classifications.upload_project_dump(dump_path)

        if args.upload_golds:
            golds_path = args.upload_golds[0]
            DB().golds.upload_golds_csv(golds_path)

        if args.gen_stats:
            DB().classifications._gen_stats()
Beispiel #7
0
    def _init_subjects(self, golds):
        """
        Create one Subject per gold entry

        Parameters
        ----------
        golds : dict
            Mapping of subject id to gold label

        Returns
        -------
        dict
            Mapping of subject id to ``self.Subject`` instance, each built
            with stats from ``SubjectStats.from_static``
        """
        def build(id_, gold):
            stats = SubjectStats.from_static(id_, DB())
            return self.Subject(id_, gold, stats)

        return {id_: build(id_, gold) for id_, gold in golds.items()}
Beispiel #8
0
    def get_cursor():
        """
        Aggregate classifications into per-subject vote summaries

        Returns
        -------
        swap.db.Cursor
            One document per subject id with its first gold label, the
            number of classifications and the sum of annotations
        """
        # Keep only classifications whose gold_label is not -1
        match_stage = {'$match': {'gold_label': {'$ne': -1}}}

        group_stage = {
            '$group': {
                '_id': '$subject_id',
                'gold': {'$first': "$gold_label"},
                'total': {'$sum': 1},
                'votes': {'$sum': "$annotation"},
            }
        }

        pipeline = [match_stage, group_stage]
        return DB().classifications.aggregate(pipeline)
Beispiel #9
0
    def upload_data(data):
        """
        Write subject metadata to the subjects collection in bulk

        Parameters
        ----------
        data : dict
            Mapping of subject to metadata; every pair is turned into a
            request with ``db.subjects.update_metadata`` and the requests
            are sent in batches through ``db.subjects.bulk_write``
        """
        db = DB()
        requests = []

        def write():
            # Send the pending batch, if any, and start a fresh one
            nonlocal requests
            print('writing')
            if requests:
                db.subjects.bulk_write(requests)
                requests = []

        for i, (subject, metadata) in enumerate(data.items()):
            r = db.subjects.update_metadata(subject, metadata, False)
            requests.append(r)

            # Report progress once every 10000 subjects. The counter is
            # written first and flushed afterwards so it shows up right away.
            if i % 10000 == 0:
                sys.stdout.write('%d\r' % i)
                sys.stdout.flush()

            # Cap the batch size; write() resets the request list itself
            if len(requests) > 1e5:
                write()

        # Send whatever is left after the last full batch
        write()
Beispiel #10
0
    def test_batch_size(self, mock):
        """
        batch_size is forwarded as 'batchSize' in the second positional
        argument of the mocked call
        """
        DB().classifications.getClassifications(batch_size=50)

        positional, keyword = mock.call_args
        print(positional, keyword)

        options = positional[1]
        assert 'batchSize' in options
        assert options['batchSize'] == 50
Beispiel #11
0
    def test_get_cursor_type(self):
        """
        Cursor.getCursor() hands back a pymongo CommandCursor
        """
        # Clear the DB instance registry before creating a new DB
        DB._instances = {}
        database = DB()._db
        pipeline = [{'$limit': 5}]

        cursor = Cursor(pipeline, database.classifications)
        raw_cursor = cursor.getCursor()

        assert isinstance(raw_cursor, pymongo.command_cursor.CommandCursor)
Beispiel #12
0
    def get_classifications():
        """
        Fetch the classifications cursor from the database

        Returns
        -------
        swap.db.Cursor
            Cursor with classifications
        """
        collection = DB().classifications
        return collection.getClassifications()
Beispiel #13
0
    def random(self, size, gold=None):
        """
        Draw a random sample of gold labels

        Parameters
        ----------
        size : int
            Sample size
        gold : optional
            Gold filter forwarded unchanged to ``get_random_golds``
        """
        logger.debug('Size %d gold filter %s', size, gold)

        golds_collection = DB().golds
        return golds_collection.get_random_golds(size, gold)
Beispiel #14
0
    def subjects(self, subject_ids):
        """
        Look up the gold labels of the given subjects

        Parameters
        ----------
        subject_ids : list
            List of subject ids (int)
        """
        logger.debug('getting %d subjects', len(subject_ids))

        golds_collection = DB().golds
        return golds_collection.get_golds(subject_ids)
Beispiel #15
0
    def consensus(self, size):
        """
        Fetch the gold labels of the most consensus subjects

        Parameters
        ----------
        size : int
            Number of subjects
        """
        logger.debug('Size %d', size)

        # Pick the subjects first, then resolve their gold labels
        consensus_ids = db_cv().get_consensus(size)
        return DB().golds.get_golds(consensus_ids)
Beispiel #16
0
    def test_get_classifications_1(self, mock):
        """
        getClassifications() without arguments issues the default
        sort/match/project pipeline with a batch size of 100000
        """
        sort_stage = {'$sort': OrderedDict([
            ('seen_before', 1),
            ('classification_id', 1)])}
        match_stage = {'$match': {'seen_before': False}}
        project_stage = {'$project': {
            'user_id': 1, 'subject_id': 1,
            'annotation': 1, 'session_id': 1}}
        expected_query = [sort_stage, match_stage, project_stage]

        DB().classifications.getClassifications()
        mock.assert_called_with(expected_query, {'batchSize': 100000})
Beispiel #17
0
    def run(self, amount=None):
        """
        Process the classifications in the DB with SWAP

        Initialises swap, then walks the cursor returned by
        ``self.get_classifications()`` and hands every classification to
        ``self._delegate`` one at a time, in the order the cursor yields
        them, while showing a progress bar.

        Parameters
        ----------
        amount : int, optional
            Expected number of classifications, used as the maximum of
            the progress bar. Defaults to the 'first_classifications'
            entry of the classification stats.
        """
        if amount is None:
            stats = DB().classifications.get_stats()
            amount = stats['first_classifications']

        self.init_swap()

        cursor = self.get_classifications()

        logger.info('Start: SWAP Processing %d classifications', amount)

        processed = 0
        with progressbar.ProgressBar(max_value=amount) as bar:
            bar.update(processed)
            # The cursor may yield fewer items than `amount`
            for raw in cursor:
                self._delegate(Classification.generate(raw))

                # The bar is refreshed before the counter is incremented
                bar.update(processed)
                processed += 1

                # In debug mode stop once the configured amount is exceeded
                if config.control.debug and processed > config.control.amount:
                    break

        if config.back_update:
            logger.info('back_update active: processing changes')
            self.swap.process_changes()
        logger.info('done')
Beispiel #18
0
    def classify(self, raw_cl):
        """
        Store and process a classification received from caesar

        Parameters
        ----------
        raw_cl
            Raw classification, parsed with ``self.parse_raw``

        Returns
        -------
        The entry of ``self.swap.subjects`` for the classified subject,
        or None when the classification already exists
        """
        data = self.parse_raw(raw_cl)
        cl = self.gen_cl(data)

        logger.debug('Checking if already received classification')
        if self.cl_exists(data):
            # Already received: nothing is stored or processed
            return None

        logger.debug('Uploading classification to caesar db: %s',
                     str(data))
        DB().caesar.insert(data)

        logger.debug('Adding classification from network: %s',
                     str(cl))
        self.swap.classify(cl)

        return self.swap.subjects.get(cl.subject)
Beispiel #19
0
    def getClassifications(self):
        """
        Return the classifications selected by this object's settings

        The user, subject, annotation and gold label fields are always
        projected. When ``self.meta_data`` is set, the field
        ``metadata.<meta_data>`` is projected as well. When both
        ``self.meta_lower`` and ``self.meta_upper`` are set, the query is
        restricted with a range match on that metadata field.

        Returns
        -------
        The result of ``DB().classifications.aggregate`` for the query

        Raises
        ------
        ValueError
            If both range bounds are set but ``self.meta_data`` is None,
            since there is no field to apply the range to
        """
        has_range = (self.meta_lower is not None
                     and self.meta_upper is not None)

        # Fail early with a clear message instead of hitting an unbound
        # local when the range query is built without a metadata field
        if has_range and self.meta_data is None:
            raise ValueError(
                'meta_lower and meta_upper are set but meta_data is None; '
                'a metadata field is required for a range query')

        # fields to project
        fields = ['user_name', 'subject_id', 'annotation', 'gold_label']

        # if meta data is requested
        meta_data_field = None
        if self.meta_data is not None:
            meta_data_field = 'metadata' + "." + self.meta_data
            fields.append(meta_data_field)

        # Define a query
        q = Query()
        q.project(fields)

        # range query on metadata
        if has_range:
            q.match_range(meta_data_field, self.meta_lower, self.meta_upper)

        # perform query on classification data
        return DB().classifications.aggregate(q.build())
Beispiel #20
0
    def call(self, args):
        """
        Run the actions selected by the parsed command line arguments

        A swap object and/or a score export are obtained from the load,
        csv and run options (later options override earlier ones), then
        the remaining options act on whichever of the two is available.

        Returns
        -------
        The swap object, or None if none was loaded or run
        """
        swap = None
        scores = None

        # Load a saved object; it may be a full SWAP or only its scores
        if args.load:
            obj = self.load(args.load[0])

            if isinstance(obj, SWAP):
                swap = obj
                scores = swap.score_export()
            elif isinstance(obj, ScoreExport):
                scores = obj

        # Scores read from csv replace any scores loaded above
        if args.scores_from_csv:
            fname = args.scores_from_csv[0]
            scores = ScoreExport.from_csv(fname)

        # A fresh run replaces both the swap object and the scores
        if args.run:
            swap = self.run_swap(args)
            scores = swap.score_export()

        # Actions that need a swap object
        if swap is not None:

            if args.save:
                manifest = self.manifest(swap, args)
                self.save(swap, self.f(args.save[0]), manifest)

            if args.log:
                fname = self.f(args.log[0])
                write_log(swap, fname)

            if args.stats:
                s = swap.stats_str()
                print(s)
                logger.debug(s)

            if args.test:
                # Imported here rather than at module level
                from swap.utils.golds import GoldGetter
                gg = GoldGetter()
                logger.debug('applying new gold labels')
                swap.set_gold_labels(gg.golds)
                swap.process_changes()
                logger.debug('done')

            if args.test_reorder:
                self.reorder_classifications(swap)

            if args.export_user_scores:
                fname = self.f(args.export_user_scores[0])
                self.export_user_scores(swap, fname)

        # Actions that only need the scores
        if scores is not None:
            if args.save_scores:
                DB().subjects.save_scores(scores)

            if args.scores_to_csv:
                self.scores_to_csv(scores, args.scores_to_csv[0])

        # Plotting is always invoked; swap and scores may still be None
        self.plot(args, swap, scores)

        if args.shell:
            # Interactive console with this method's local variables
            import code
            code.interact(local=locals())

        return swap
Beispiel #21
0
 def all(self):
     """
     Fetch every gold label from the golds collection
     """
     golds_collection = DB().golds
     return golds_collection.get_golds()
Beispiel #22
0
class GoldGetter:
    """
    Compile a set of gold labels given a set of parameters

    The methods decorated with ``_getter`` choose which gold labels to
    fetch; the ``golds`` property calls every entry of ``self.getters``
    and merges the results into one dict. (Presumably ``_getter``
    registers the decorated call in ``self.getters`` -- confirm against
    its definition.)
    """
    def __init__(self):
        self.getters = []
        self._golds = None
        self.db = DB().golds

    @_getter
    def all(self):
        """
        Get all gold labels
        """
        return self.db.get_golds()

    @_getter
    def random(self, size):
        """
        Get a random sample of gold labels

        Parameters
        ----------
        size : int
            Sample size
        """
        return self.db.get_random_golds(size)

    @_getter
    def subjects(self, subject_ids):
        """
        Get the gold labels for a set of subjects

        Parameters
        ----------
        subject_ids : list
            List of subject ids (int)
        """
        return self.db.get_golds(subject_ids)

    @_getter
    def controversial(self, size):
        """
        Get the gold labels for the most controversial subjects

        Parameters
        ----------
        size : int
            Number of subjects
        """
        subjects = db_cv().get_controversial(size)
        return self.db.get_golds(subjects)

    @_getter
    def consensus(self, size):
        """
        Get the gold labels for the most consensus subjects

        Parameters
        ----------
        size : int
            Number of subjects
        """
        subjects = db_cv().get_consensus(size)
        return self.db.get_golds(subjects)

    @_getter
    def these(self, golds):
        """
        Use an explicit set of gold labels

        Parameters
        ----------
        golds : dict
            Gold labels, returned unchanged
        """
        return golds

    # @_getter
    # def extreme_min(self, n_controv, max_consensus):
    #     def f():
    #         controv = cv.get_controversial(n_controv)
    #         consensus = cv.get_max_consensus(max_consensus)

    #         return db.getExpertGold(controv + consensus)
    #     return f

    # @_getter
    # def extremes(self, n_controv, n_consensus):
    #     def f():
    #         controv = cv.get_controversial(n_controv)
    #         consensus = cv.get_consensus(n_consensus)

    #         return db.getExpertGold(controv + consensus)
    #     return f

    def reset(self):
        """
        Reset the gold getter.

        Clears the set of golds and list of getters.
        """
        self.getters = []
        self._golds = None

    @property
    def golds(self):
        """
        Returns the set of golds. Fetches from database the first
        time and caches for faster recall.
        """
        if self._golds is None:
            # Fall back to all gold labels when nothing was selected
            if not self.getters:
                self.all()

            # Later getters overwrite entries of earlier ones on key clash
            golds = {}
            for getter in self.getters:
                golds.update(getter())

            self._golds = golds
        return self._golds

    def __iter__(self):
        """
        Iterate over the keys of the golds dict

        ``__iter__`` has to return an iterator; returning the dict itself
        makes ``iter()`` raise TypeError, so the dict is wrapped in
        ``iter``.
        """
        return iter(self.golds)
Beispiel #23
0
def db_cv():
    """Return the ``controversial`` attribute of the DB instance"""
    database = DB()
    return database.controversial
Beispiel #24
0
def db_cl():
    """Return the ``classifications`` attribute of the DB instance"""
    database = DB()
    return database.classifications
Beispiel #25
0
    def cl_exists(cl):
        """
        Check whether a classification is already stored

        Parameters
        ----------
        cl : dict
            Classification data with a 'classification_id' entry

        Returns
        -------
        The result of the caesar lookup if it is truthy, otherwise the
        result of the lookup in the classifications collection
        """
        cl_id = cl['classification_id']

        in_caesar = DB().caesar.exists(cl_id)
        if in_caesar:
            return in_caesar

        # Only consulted when the caesar collection does not have it
        return DB().classifications.exists(cl_id)
Beispiel #26
0
 def get_swap_scores():
     """Fetch the scores stored in the subjects collection"""
     subjects_collection = DB().subjects
     return subjects_collection.get_scores()
Beispiel #27
0
 def __init__(self):
     """
     Start with no getters, no cached golds and a handle on the golds
     collection of the DB
     """
     self.getters, self._golds = [], None
     self.db = DB().golds