Beispiel #1
0
    def test_adjust_constant(self):
        """
        A uniform adjustment (the same value for every class) is asserted to
        leave row-normalized probabilities unchanged.
        """
        n_rows, n_classes = 10, 3

        # Random rows, scaled so that each row sums to one.
        raw = np.random.rand(n_rows, n_classes)
        proba = raw / raw.sum(axis=1, keepdims=True)

        for uniform_adj in ([1, 1, 1], [10, 10, 10]):
            adjusted = probability.adjust_proba(proba, uniform_adj)
            self.assertTrue(np.allclose(proba, adjusted))
Beispiel #2
0
    def test_single_class(self):
        """
        With a single class column, the adjusted output is asserted to be
        all ones for each of the inputs and adjustments tried here.
        """
        n_rows, n_classes = 10, 1

        # Arbitrary (un-normalized) inputs with a positive adjustment.
        random_in = np.random.rand(n_rows, n_classes)
        result = probability.adjust_proba(random_in, [1])
        self.assertTrue(np.allclose(1, result))

        # Fresh random inputs with a negative adjustment.
        random_in = np.random.rand(n_rows, n_classes)
        result = probability.adjust_proba(random_in, [-1])
        self.assertTrue(np.allclose(1, result))

        # Inputs that are already all ones.
        ones_in = np.ones_like(random_in)
        result = probability.adjust_proba(ones_in, [1])
        self.assertTrue(np.allclose(1, result))
Beispiel #3
0
    def test_adjust_serial_vs_sum(self):
        """
        Applying two adjustments one after the other is asserted to match
        applying their sum once, and applying an adjustment followed by its
        negation is asserted to recover the original probabilities.
        """
        n_rows, n_classes = 10, 3

        # Random rows, scaled so that each row sums to one.
        raw = np.random.rand(n_rows, n_classes)
        proba = raw / raw.sum(axis=1, keepdims=True)

        first_adj = np.array([1, 2, 3])
        second_adj = np.array([2, 0, -2])

        # Sequential application vs. one combined application.
        sequential = probability.adjust_proba(
            probability.adjust_proba(proba, first_adj), second_adj)
        combined = probability.adjust_proba(proba, first_adj + second_adj)
        self.assertTrue(np.allclose(sequential, combined))

        # Round trip: adjust, then undo with the negated adjustment.
        round_trip = probability.adjust_proba(
            probability.adjust_proba(proba, first_adj), -first_adj)
        self.assertTrue(np.allclose(round_trip, proba))
Beispiel #4
0
    def test_adjust(self):
        """
        Check the direction in which each class probability moves for
        several non-uniform adjustment vectors.
        """
        n_rows, n_classes = 10, 3

        # Random rows, scaled so that each row sums to one.
        raw = np.random.rand(n_rows, n_classes)
        proba = raw / raw.sum(axis=1, keepdims=True)

        # Positive adjustment on the middle class only: that column is
        # asserted to rise in every row while the other two fall.
        adjusted = probability.adjust_proba(proba, [0, 1, 0])
        rose = adjusted > proba
        self.assertTrue(np.all([False, True, False] == rose))
        fell = adjusted < proba
        self.assertTrue(np.all([True, False, True] == fell))
        # No entry may remain approximately equal to its original value.
        unchanged = np.isclose(proba, adjusted)
        self.assertFalse(np.any(unchanged))

        # Negative adjustment on the first class only: that column is
        # asserted to fall in every row while the other two rise.
        adjusted = probability.adjust_proba(proba, [-1, 0, 0])
        fell = adjusted < proba
        self.assertTrue(np.all([True, False, False] == fell))
        rose = adjusted > proba
        self.assertTrue(np.all([False, True, True] == rose))
        unchanged = np.isclose(proba, adjusted)
        self.assertFalse(np.any(unchanged))

        # Opposite adjustments on the outer classes: only columns 0 and 2
        # are checked for direction; the middle column is not constrained.
        outer_cols = [0, 2]
        adjusted = probability.adjust_proba(proba, [1.5, 0, -1.5])
        fell = adjusted < proba
        self.assertTrue(np.all([False, True] == fell[:, outer_cols]))
        rose = adjusted > proba
        self.assertTrue(np.all([True, False] == rose[:, outer_cols]))
        # The closeness mask must not equal [False, True, False] in every
        # row.
        unchanged = np.isclose(proba, adjusted)
        self.assertFalse(np.all([False, True, False] == unchanged))
Beispiel #5
0
    def test_shape_cases(self):
        """
        Exercise shape handling: mismatched probability/adjustment shapes
        are asserted to raise ``ValueError``, and a single-row input with a
        uniform adjustment is asserted to come back unchanged.
        """
        n_rows, n_classes = 10, 3
        mismatch_pattern = ("The dimensions of probabilities and "
                            "adjustments must be compatible.")

        # Three class columns but only two adjustment values.
        wide = np.random.rand(n_rows, n_classes)
        with self.assertRaisesRegex(ValueError, mismatch_pattern):
            probability.adjust_proba(wide, [1, 2])

        # A single normalized row with a uniform adjustment.
        single_row = np.random.rand(1, n_classes)
        single_row = single_row / single_row.sum()
        unchanged = probability.adjust_proba(single_row, [1, 1, 1])
        self.assertTrue(np.allclose(single_row, unchanged))

        # Column vector of probabilities against a row of adjustments.
        column = np.ones((n_rows, 1))
        row = np.ones((1, n_rows))
        with self.assertRaisesRegex(ValueError, mismatch_pattern):
            probability.adjust_proba(column, row)
Beispiel #6
0
    def classify(self):
        """
        Given a file's bytes (standard base64-format) and content mimetype,
        describe and classify the content against all currently stored
        classifiers (optionally a list of requested classifiers), returning a
        map of classifier descriptive labels to their class-to-probability
        results.

        We expect the data to be transmitted in the body of the request in
        standard base64 encoding form ("bytes_b64" key). We look for the
        content type either as URL parameter or within the body
        ("content_type" key).

        Below is an example call to this endpoint via the ``requests`` python
        module, showing how base64 data is sent::

            import base64
            import requests
            data_bytes = b"Load some content bytes here."
            requests.post('http://localhost:5000/classify',
                          data={'bytes_b64': base64.b64encode(data_bytes),
                                'content_type': 'text/plain'})

        With curl on the command line::

            $ curl -X POST localhost:5000/classify \
                -d "content_type=text/plain" \
                --data-urlencode "bytes_b64=$(base64 -w0 /path/to/file)"

            # If this fails, you may wish to encode the file separately and
            # use the file reference syntax instead:

            $ base64 -w0 /path/to/file > /path/to/file.b64
            $ curl -X POST localhost:5000/classify \
                -d "content_type=text/plain" \
                --data-urlencode bytes_b64@/path/to/file.b64

        Optionally, the `label` parameter can be provided to limit the results
        of classification to a set of classifiers::

            $ curl -X POST localhost:5000/classify \
                -d "content_type=text/plain" \
                -d 'label=["some_label","other_label"]' \
                --data-urlencode "bytes_b64=$(base64 -w0 /path/to/file)"

            # If this fails, you may wish to encode the file separately and
            # use the file reference syntax instead:

            $ base64 -w0 /path/to/file > /path/to/file.b64
            $ curl -X POST localhost:5000/classify \
                -d "content_type=text/plain" \
                -d 'label=["some_label","other_label"]' \
                --data-urlencode bytes_b64@/path/to/file.b64

        Data/Form arguments:
            bytes_b64
                Bytes in the standard base64 encoding to be described and
                classified.
            content_type
                The mimetype of the sent data.
            label
                (Optional) JSON-encoded label or list of labels
            adjustment
                (Optional) JSON-encoded dictionary of labels to floats. Higher
                values lower the gain on the class and therefore correspond to
                higher precision (and lower recall) for the class (and higher
                recall/lower precision for other classes). This translates
                to calling ``smqtk.utils.probability.adjust_proba``.

        Possible error codes:
            400
                No bytes or content type provided, the provided bytes are
                not valid base64, or the provided labels or adjustments are
                malformed
            404
                Label or labels provided do not match any registered
                classifier

        Returns: {
            ...
            result: {
                classifier-label: {
                    class-label: prob,
                    ...
                },
                ...
            }
        }

        """
        data_b64 = flask.request.values.get('bytes_b64', default=None)
        content_type = flask.request.values.get('content_type', default=None)
        label_str = flask.request.values.get('label', default=None)
        adjustment_str = flask.request.values.get('adjustment', default=None)

        labels = None
        if label_str is not None:
            try:
                labels = flask.json.loads(label_str)
            except JSON_DECODE_EXCEPTION:
                # Unquoted strings aren't valid JSON. That is, a plain string
                # needs to be passed as '"label"' rather than just 'label' or
                # "label". However, we can be a bit more generous and just
                # allow such a string, but we have to place *some* restriction
                # on it. We use `urllib.quote` for this since essentially it
                # just checks to make sure that the string is made up of one
                # of the following types of characters:
                #
                #   - letters
                #   - numbers
                #   - spaces, underscores, periods, and dashes
                #
                # Since the concept of a "letter" is fraught with encoding and
                # locality issues, we simply let urllib make this decision for
                # us.

                # If label_str matches the url-encoded version of itself, go
                # ahead and use it
                if urllib.parse.quote(label_str, safe='') == label_str:
                    labels = [label_str]
                else:
                    return make_response_json(
                        "Label(s) are not properly formatted JSON.", 400)
            else:
                # Valid JSON: accept a single string or a list of strings.
                if isinstance(labels, six.string_types):
                    labels = [labels]
                elif isinstance(labels, list):
                    if not all(isinstance(el, six.string_types)
                               for el in labels):
                        return make_response_json(
                            "Label must be a list of strings or a"
                            " single string.", 400)
                else:
                    return make_response_json(
                        "Label must be a list of strings or a single"
                        " string.", 400)

        # Collect optional result probability adjustment values
        #: :type: dict[collections.Hashable, float]
        adjustments = {}
        if adjustment_str is not None:
            try:
                #: :type: dict[collections.Hashable, float]
                adjustments = flask.json.loads(adjustment_str)
            except JSON_DECODE_EXCEPTION:
                return make_response_json(
                    "Adjustment(s) are not properly formatted JSON.", 400)

            # Valid JSON that is not an object (e.g. a list or a number)
            # cannot be iterated as label/value pairs; report it as a client
            # error rather than failing with an unhandled exception.
            if not isinstance(adjustments, dict):
                return make_response_json(
                    "Adjustments must be a JSON object mapping labels to "
                    "numbers.", 400)

            for label, val in six.iteritems(adjustments):
                if not isinstance(label, six.string_types):
                    return make_response_json(
                        "Adjustment label '%s' is not a string type." %
                        label, 400)
                if not isinstance(val, (int, float)):
                    return make_response_json(
                        "Adjustment value %s for label '%s' is not an int "
                        "or float" % (val, label), 400)

        if data_b64 is None:
            return make_response_json("No base-64 bytes provided.", 400)
        elif content_type is None:
            return make_response_json("No content type provided.", 400)

        try:
            data_bytes = base64.b64decode(data_b64.encode('utf-8'))
        except (TypeError, ValueError):
            # Malformed base64 (e.g. incorrect padding) is a client error.
            # Python 3 raises ``binascii.Error`` (a ``ValueError`` subclass)
            # while Python 2 raises ``TypeError``.
            return make_response_json(
                "Provided bytes are not valid base64.", 400)
        self._log.debug("Length of byte data: %d", len(data_bytes))

        data_elem = DataMemoryElement(data_bytes, content_type, readonly=True)
        descr_elem = self.descriptor_gen.generate_one_element(
            data_elem, descr_factory=self.descriptor_factory)
        self._log.debug("Descriptor shape: %s", descr_elem.vector().shape)

        try:
            clfr_map = self.classifier_collection.classify(
                descr_elem, labels=labels, factory=self.classification_factory)
        except MissingLabelError as ex:
            return make_response_json(
                "The following labels are not registered with any"
                " classifiers: '%s'" % "', '".join(ex.labels),
                404,
                missing_labels=list(ex.labels))

        # Transform classification result into JSON
        c_json = {}
        for classifier_label, c_elem in six.iteritems(clfr_map):
            prediction = c_elem.get_classification()
            if adjustments:
                # Fix an ordering of the class labels so probabilities and
                # adjustments line up position-by-position.
                proba_labels = list(prediction.keys())
                proba = [prediction[k] for k in proba_labels]
                # Use opposite of adjustments, because we already set the
                # convention of "higher: precision, lower: recall"
                adj = [-adjustments.get(label, 0.0) for label in proba_labels]
                adj_proba = probability.adjust_proba(proba, adj)
                # Only the first row of the result is read back out.
                prediction = dict(zip(proba_labels, adj_proba[0]))
            c_json[classifier_label] = prediction

        return make_response_json('Finished classification.', result=c_json)
Beispiel #7
0
    def classify(self):
        """
        Given a file's bytes (standard base64-format) and content mimetype,
        describe and classify the content against all currently stored
        classifiers (optionally a list of requested classifiers), returning a
        map of classifier descriptive labels to their class-to-probability
        results.

        We expect the data to be transmitted in the body of the request in
        standard base64 encoding form ("bytes_b64" key). We look for the
        content type either as URL parameter or within the body
        ("content_type" key).

        Below is an example call to this endpoint via the ``requests`` python
        module, showing how base64 data is sent::

            import base64
            import requests
            data_bytes = b"Load some content bytes here."
            requests.post('http://localhost:5000/classify',
                          data={'bytes_b64': base64.b64encode(data_bytes),
                                'content_type': 'text/plain'})

        With curl on the command line::

            $ curl -X POST localhost:5000/classify \
                -d "content_type=text/plain" \
                --data-urlencode "bytes_b64=$(base64 -w0 /path/to/file)"

            # If this fails, you may wish to encode the file separately and
            # use the file reference syntax instead:

            $ base64 -w0 /path/to/file > /path/to/file.b64
            $ curl -X POST localhost:5000/classify \
                -d "content_type=text/plain" \
                --data-urlencode bytes_b64@/path/to/file.b64

        Optionally, the `label` parameter can be provided to limit the results
        of classification to a set of classifiers::

            $ curl -X POST localhost:5000/classify \
                -d "content_type=text/plain" \
                -d 'label=["some_label","other_label"]' \
                --data-urlencode "bytes_b64=$(base64 -w0 /path/to/file)"

            # If this fails, you may wish to encode the file separately and
            # use the file reference syntax instead:

            $ base64 -w0 /path/to/file > /path/to/file.b64
            $ curl -X POST localhost:5000/classify \
                -d "content_type=text/plain" \
                -d 'label=["some_label","other_label"]' \
                --data-urlencode bytes_b64@/path/to/file.b64

        Data/Form arguments:
            bytes_b64
                Bytes in the standard base64 encoding to be described and
                classified.
            content_type
                The mimetype of the sent data.
            label
                (Optional) JSON-encoded label or list of labels
            adjustment
                (Optional) JSON-encoded dictionary of labels to floats. Higher
                values lower the gain on the class and therefore correspond to
                higher precision (and lower recall) for the class (and higher
                recall/lower precision for other classes). This translates
                to calling ``smqtk.utils.probability.adjust_proba``.

        Possible error codes:
            400
                No bytes or content type provided, the provided bytes are
                not valid base64, or the provided labels or adjustments are
                malformed
            404
                Label or labels provided do not match any registered
                classifier

        Returns: {
            ...
            result: {
                classifier-label: {
                    class-label: prob,
                    ...
                },
                ...
            }
        }

        """
        data_b64 = flask.request.values.get('bytes_b64', default=None)
        content_type = flask.request.values.get('content_type', default=None)
        label_str = flask.request.values.get('label', default=None)
        adjustment_str = flask.request.values.get('adjustment', default=None)

        try:
            labels = labels_from_input(label_str)
        except ValueError as ex:
            return make_response_json(f"Invalid label(s) specified: {ex}", 400)

        # Collect optional result probability adjustment values
        #: :type: dict[collections.abc.Hashable, float]
        adjustments = {}
        if adjustment_str is not None:
            try:
                #: :type: dict[collections.abc.Hashable, float]
                adjustments = json.loads(adjustment_str)
            except json.JSONDecodeError:
                return make_response_json(
                    "Adjustment(s) are not properly formatted JSON.", 400)

            # Valid JSON that is not an object (e.g. a list or a number)
            # cannot be iterated as label/value pairs; report it as a client
            # error rather than failing with an unhandled exception.
            if not isinstance(adjustments, dict):
                return make_response_json(
                    "Adjustments must be a JSON object mapping labels to "
                    "numbers.", 400)

            for label, val in six.iteritems(adjustments):
                if not isinstance(label, six.string_types):
                    return make_response_json(
                        "Adjustment label '%s' is not a string type."
                        % label,
                        400)
                if not isinstance(val, (int, float)):
                    return make_response_json(
                        "Adjustment value %s for label '%s' is not an int "
                        "or float" % (val, label),
                        400)

        if data_b64 is None:
            return make_response_json("No base-64 bytes provided.", 400)
        elif content_type is None:
            return make_response_json("No content type provided.", 400)

        try:
            data_bytes = base64.b64decode(data_b64.encode('utf-8'))
        except ValueError:
            # ``binascii.Error`` (raised e.g. for incorrect padding) is a
            # ``ValueError`` subclass; malformed input is a client error.
            return make_response_json(
                "Provided bytes are not valid base64.", 400)
        self._log.debug("Length of byte data: %d", len(data_bytes))

        data_elem = DataMemoryElement(data_bytes, content_type, readonly=True)
        descr_elem = self.descriptor_gen.generate_one_element(
            data_elem, descr_factory=self.descriptor_factory
        )
        self._log.debug("Descriptor shape: %s", descr_elem.vector().shape)

        try:
            clfr_map = self.classifier_collection.classify(
                descr_elem, labels=labels,
                factory=self.classification_factory)
        except MissingLabelError as ex:
            return make_response_json(
                "The following labels are not registered with any"
                " classifiers: '%s'"
                % "', '".join(ex.labels),
                404,
                missing_labels=list(ex.labels))

        # Transform classification result into JSON
        c_json = {}
        for classifier_label, c_elem in six.iteritems(clfr_map):
            prediction = c_elem.get_classification()
            if adjustments:
                # Fix an ordering of the class labels so probabilities and
                # adjustments line up position-by-position.
                proba_labels = list(prediction.keys())
                proba = [prediction[k] for k in proba_labels]
                # Use opposite of adjustments, because we already set the
                # convention of "higher: precision, lower: recall"
                adj = [-adjustments.get(label, 0.0) for label in proba_labels]
                adj_proba = probability.adjust_proba(proba, adj)
                # Only the first row of the result is read back out.
                prediction = dict(zip(proba_labels, adj_proba[0]))
            c_json[classifier_label] = prediction

        return make_response_json('Finished classification.',
                                  result=c_json)