Example 1
    def test_nested_threading(self):
        # char -> char -> ord -> char -> ord
        g1 = parallel_map(lambda e: e, self.test_string,
                          ordered=True,
                          use_multiprocessing=False,
                          cores=2,
                          name='g1')
        g2 = parallel_map(ord, g1,
                          ordered=True,
                          use_multiprocessing=False,
                          cores=2,
                          name='g2')
        g3 = parallel_map(chr, g2,
                          ordered=True,
                          use_multiprocessing=False,
                          cores=2,
                          name='g3')
        g4 = parallel_map(ord, g3,
                          ordered=True,
                          use_multiprocessing=False,
                          cores=2,
                          name='g4')

        expected = list(map(ord, self.test_string))
        self.assertEqual(
            list(g4),
            expected
        )
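This test exercises ``parallel_map`` as a lazy, chainable iterator: each stage consumes the previous stage's output. A minimal sketch of the chaining pattern, assuming only the ``smqtk.utils.parallel.parallel_map`` keyword arguments shown above:

from smqtk.utils.parallel import parallel_map

# Two chained thread-backed stages; results collected in input order.
stage1 = parallel_map(ord, "hello",
                      ordered=True,
                      use_multiprocessing=False,
                      cores=2)
stage2 = parallel_map(chr, stage1,
                      ordered=True,
                      use_multiprocessing=False,
                      cores=2)
assert list(stage2) == list("hello")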
Example 2
    def test_nested_multiprocessing(self):
        # char -> char -> ord -> char -> ord
        g1 = parallel_map(lambda e: e,
                          self.test_string,
                          ordered=True,
                          use_multiprocessing=True,
                          cores=2)
        g2 = parallel_map(ord,
                          g1,
                          ordered=True,
                          use_multiprocessing=True,
                          cores=2)
        g3 = parallel_map(chr,
                          g2,
                          ordered=True,
                          use_multiprocessing=True,
                          cores=2)
        g4 = parallel_map(ord,
                          g3,
                          ordered=True,
                          use_multiprocessing=True,
                          cores=2)

        expected = list(map(ord, self.test_string))
        self.assertEqual(list(g4), expected)
Example 3
    def test_nested_threading(self):
        # char -> char -> ord -> char -> ord
        g1 = parallel_map(lambda e: e,
                          self.test_string,
                          ordered=True,
                          use_multiprocessing=False,
                          cores=2,
                          name='g1')
        g2 = parallel_map(ord,
                          g1,
                          ordered=True,
                          use_multiprocessing=False,
                          cores=2,
                          name='g2')
        g3 = parallel_map(chr,
                          g2,
                          ordered=True,
                          use_multiprocessing=False,
                          cores=2,
                          name='g3')
        g4 = parallel_map(ord,
                          g3,
                          ordered=True,
                          use_multiprocessing=False,
                          cores=2,
                          name='g4')

        expected = list(map(ord, self.test_string))
        nose.tools.assert_equal(list(g4), expected)
Example 4
def iter_valid_elements(dataElementUris, valid_content_types):
    """
    Find the GirderDataElements which are loadable images and
    valid according to valid_content_types.

    :param dataElementUris: A list of Girder Data Element URIs.
    :param valid_content_types: A list of valid content types, generally
        passed by a descriptor generator.
    :returns: A generator over valid GirderDataElements.
    :rtype: generator
    """
    def is_valid(dataElementUri):
        dfe = GirderDataElement.from_uri(dataElementUri)

        if is_valid_element(dfe,
                            valid_content_types=valid_content_types,
                            check_image=True):
            return dfe
        else:
            return False

    # ``filter`` replaces the Python-2-only ``itertools.ifilter``.
    return filter(
        None,
        parallel.parallel_map(is_valid,
                              dataElementUris,
                              use_multiprocessing=False))
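The helper above returns the element itself when valid and ``False`` otherwise, then drops failures with a falsy filter. A self-contained sketch of the same idiom, assuming only ``parallel_map`` as used throughout these examples:

from smqtk.utils.parallel import parallel_map

def keep_odd(x):
    # Return the value itself when valid, False otherwise.
    return x if x % 2 else False

kept = list(filter(None, parallel_map(keep_odd, range(6),
                                      use_multiprocessing=False)))
# Result order is not assumed here, hence the sort.
assert sorted(kept) == [1, 3, 5]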
Example 5
    def test_exception_handing_multiprocess(self):
        def raise_ex(_):
            raise RuntimeError("Expected exception")

        nose.tools.assert_raises(
            RuntimeError, list,
            parallel_map(raise_ex, [1], use_multiprocessing=True))
Example 6
    def iter_valid_elements():
        def is_valid(file_path):
            e = DataFileElement(file_path)

            if is_valid_element(
                    e,
                    valid_content_types=generator.valid_content_types(),
                    check_image=check_image):
                return e
            else:
                return False

        data_elements: Deque[DataFileElement] = collections.deque()
        valid_files_filter = parallel.parallel_map(is_valid,
                                                   file_paths,
                                                   name="check-file-type",
                                                   use_multiprocessing=True)
        for dfe in valid_files_filter:
            if dfe:
                yield dfe
                if data_set is not None:
                    data_elements.append(dfe)
                    if batch_size and len(data_elements) == batch_size:
                        log.debug(
                            "Adding data element batch to set (size: %d)",
                            len(data_elements))
                        data_set.add_data(*data_elements)
                        data_elements.clear()
        # elements only collected if we have a data-set configured, so add any
        # still in the deque to the set
        if data_set is not None and data_elements:
            log.debug("Adding data elements to set (size: %d",
                      len(data_elements))
            data_set.add_data(*data_elements)
Example 7
    def _get_many_vectors(cls, descriptors):
        """
        Internal method to be overridden by subclasses to return many vectors
        associated with given descriptors.

        :note: Returned vectors are *not* guaranteed to be returned in the
            order they are requested. Missing vectors may be returned as None
            or omitted entirely from results. The wrapper function
            `get_many_vectors` handles re-ordering as necessary and insertion
            of None for missing values.

        :param descriptors: Iterable of descriptors to query for.
        :type descriptors: collections.Iterable[
            smqtk.representation.descriptor_element.DescriptorElement]

        :return: Iterator of tuples containing the descriptor uuid and the
            vector associated with the given descriptors or None if the
            descriptor has no associated vector
        :rtype: collections.Iterable[
            tuple[collections.Hashable, Union[numpy.ndarray, None]]]
        """
        for uuid_vector_pair in parallel_map(_uuid_and_vector_from_descriptor,
                                             descriptors,
                                             name='retrieve_vectors'):
            yield uuid_vector_pair
Example 8
    def test_exception_handing_threaded(self):
        def raise_ex(_):
            raise RuntimeError("Expected exception")

        self.assertRaises(
            RuntimeError, list,
            parallel_map(raise_ex, [1], use_multiprocessing=False))
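Both exception tests depend on the same behavior: an exception raised in a worker is re-raised in the consumer when results are iterated, which is why ``list`` is the callable handed to the assertion. A hedged sketch of handling that propagation directly:

from smqtk.utils.parallel import parallel_map

def raise_ex(_):
    raise RuntimeError("Expected exception")

try:
    # Iteration, not construction, is what surfaces the worker's error.
    list(parallel_map(raise_ex, [1], use_multiprocessing=False))
except RuntimeError as ex:
    print("caught: %s" % ex)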
Example 9
 def parallel_iter_vectors(descriptors):
     """ Get the vectors for the descriptors given.
     Not caring about order returned.
     """
     return parallel.parallel_map(lambda d: d.vector(),
                                  descriptors,
                                  use_multiprocessing=False)
Example 10
 def test_simple_unordered_threaded(self):
     r = list(
         parallel_map(self.test_func,
                      self.test_string,
                      ordered=False,
                      use_multiprocessing=False))
     self.assertEqual(set(r), set(self.expected))
Example 11
 def test_simple_ordered_multiprocess(self):
     r = list(
         parallel_map(self.test_func,
                      self.test_string,
                      ordered=True,
                      use_multiprocessing=True))
     self.assertEqual(r, self.expected)
Example 12
 def test_simple_unordered_multiprocess(self):
     r = list(
         parallel_map(self.test_func,
                      self.test_string,
                      ordered=False,
                      use_multiprocessing=True))
     nose.tools.assert_equal(set(r), set(self.expected))
Example 13
 def iter_valid_elements():
     valid_files_filter = parallel.parallel_map(is_valid_element,
                                                file_paths,
                                                name="check-file-type",
                                                use_multiprocessing=True)
     for dfe in valid_files_filter:
         if dfe is not None:
             yield dfe
Example 14
 def test_simple_ordered_threaded(self):
     # Make sure results are still in order as requested
     r = list(
         parallel_map(self.test_func,
                      self.test_string,
                      ordered=True,
                      use_multiprocessing=False))
     nose.tools.assert_equal(r, self.expected)
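The simple ordered/unordered tests pin down the contract: ``ordered=True`` yields results in input order (list comparison), while ``ordered=False`` only guarantees the same multiset of results (set comparison). A minimal sketch assuming nothing beyond the flags shown:

from smqtk.utils.parallel import parallel_map

data = "abcdef"
ordered_r = list(parallel_map(ord, data, ordered=True,
                              use_multiprocessing=False))
unordered_r = list(parallel_map(ord, data, ordered=False,
                                use_multiprocessing=False))
assert ordered_r == [ord(c) for c in data]
assert set(unordered_r) == set(ordered_r)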
Example 15
    def compute_descriptor_async(self, data_iter,
                                 descr_factory=DFLT_DESCRIPTOR_FACTORY,
                                 overwrite=False, procs=None, **kwds):
        """
        Asynchronously compute feature data for multiple data items.

        Base implementation additional keyword arguments:
            use_mp [= False]
                If multi-processing should be used vs. multi-threading.

        :param data_iter: Iterable of data elements to compute features for.
            These must have UIDs assigned for feature association in return
            value.
        :type data_iter: collections.Iterable[smqtk.representation.DataElement]

        :param descr_factory: Factory instance to produce the wrapping
            descriptor element instance. The default factory produces
            ``DescriptorMemoryElement`` instances by default.
        :type descr_factory: smqtk.representation.DescriptorElementFactory

        :param overwrite: Whether or not to force re-computation of descriptor
            vectors for the given data even when precomputed vectors already
            exist in the DescriptorElements generated by the provided factory.
            This will overwrite persistently stored vectors if the provided
            factory produces a DescriptorElement implementation with such
            storage.
        :type overwrite: bool

        :param procs: Optional specification of how many processors to use
            when pooling sub-tasks. If None, we attempt to use all available
            cores.
        :type procs: int | None

        :raises ValueError: An input DataElement was of a content type that we
            cannot handle.

        :return: Mapping of input DataElement UUIDs to the computed descriptor
            element for that data. DescriptorElement UUIDs are congruent with
            the UUID of the data element each describes.
        :rtype: dict[collections.Hashable,
                     smqtk.representation.DescriptorElement]

        """
        self._log.info("Async compute features")

        use_mp = kwds.get('use_mp', False)

        def work(d):
            return d, self.compute_descriptor(d, descr_factory, overwrite)

        results = parallel_map(work, data_iter, cores=procs, ordered=False,
                               use_multiprocessing=use_mp)

        de_map = {}
        for data, descriptor in results:
            de_map[data.uuid()] = descriptor

        return de_map
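A hedged usage sketch for the method above; ``generator`` and ``data_elements`` are assumptions standing in for a configured DescriptorGenerator implementation and an iterable of DataElement instances with UIDs:

# Hypothetical usage; ``generator`` and ``data_elements`` are assumptions.
de_map = generator.compute_descriptor_async(data_elements,
                                            procs=4, use_mp=False)
for data_uuid, descr_elem in de_map.items():
    # Each descriptor's UUID is congruent with that of the data it describes.
    print(data_uuid, descr_elem.uuid())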
Example 16
    def test_exception_handing_threaded(self):
        def raise_ex(_):
            raise RuntimeError("Expected exception")

        nose.tools.assert_raises(
            RuntimeError,
            list,
            parallel_map(raise_ex, [1], use_multiprocessing=False)
        )
Example 17
    def test_exception_handing_multiprocess(self):
        def raise_ex(_):
            raise RuntimeError("Expected exception")

        self.assertRaises(
            RuntimeError,
            list,
            parallel_map(raise_ex, [1], use_multiprocessing=True)
        )
Example 18
    def test_multisequence(self):
        def test_func(a, b, c):
            return a + b + c

        s1 = [1] * 10
        s2 = [2] * 10
        s3 = [3] * 10
        r = list(parallel_map(test_func, s1, s2, s3,
                              use_multiprocessing=False))

        expected = [6] * 10
        nose.tools.assert_equal(r, expected)
Example 19
    def test_multisequence(self):
        def test_func(a, b, c):
            return a + b + c

        s1 = [1] * 10
        s2 = [2] * 10
        s3 = [3] * 10
        r = list(parallel_map(test_func, s1, s2, s3,
                              use_multiprocessing=False))

        expected = [6] * 10
        self.assertEqual(r, expected)
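The multi-sequence tests show that ``parallel_map`` accepts several iterables and, like the built-in ``map``, passes one element from each sequence per work-function call. A minimal sketch under that same assumption:

from smqtk.utils.parallel import parallel_map

xs = [1, 2, 3]
ys = [10, 20, 30]
r = list(parallel_map(lambda a, b: a + b, xs, ys,
                      ordered=True,
                      use_multiprocessing=False))
assert r == [11, 22, 33]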
Example 20
    def test_multisequence_short_cutoff(self):
        def test_func(a, b, c):
            return a + b + c

        s1 = [1] * 10
        s2 = [2] * 4
        s3 = [3] * 10
        r = list(parallel_map(test_func, s1, s2, s3,
                              use_multiprocessing=False,
                              ordered=True))

        exp = [6] * 4
        nose.tools.assert_equal(r, exp)
Example 21
    def test_multisequence_short_cutoff(self):
        def test_func(a, b, c):
            return a + b + c

        s1 = [1] * 10
        s2 = [2] * 4
        s3 = [3] * 10
        r = list(parallel_map(test_func, s1, s2, s3,
                              use_multiprocessing=False,
                              ordered=True))

        exp = [6] * 4
        self.assertEqual(r, exp)
Example 22
    def build_index(self, descriptors):
        """
        Build the index based on the given iterable of descriptor elements.

        Subsequent calls to this method should rebuild the index, not add to
        it.

        :raises ValueError: No data available in the given iterable.

        :param descriptors:
            Iterable of descriptor elements to build index over.
        :type descriptors:
            collections.Iterable[smqtk.representation.DescriptorElement]

        """
        # ordered cache of descriptors in our index.
        self._descr_cache = []
        # Reverse mapping of a descriptor's vector to its index in the cache
        # and subsequently in the distance kernel.
        self._descr2index = {}
        # matrix for creating distance kernel
        self._descr_matrix = []

        def get_vector(d_elem):
            return d_elem, d_elem.vector()

        # noinspection PyTypeChecker
        vector_iter = parallel_map(get_vector,
                                   descriptors,
                                   name='vector_iter',
                                   use_multiprocessing=self.multiprocess_fetch,
                                   cores=self.cores,
                                   ordered=True)

        for i, (d, v) in enumerate(vector_iter):
            self._descr_cache.append(d)
            # ``_descr_matrix`` is a list, currently.
            # noinspection PyUnresolvedReferences
            self._descr_matrix.append(v)
            self._descr2index[tuple(v)] = i
        self._descr_matrix = numpy.array(self._descr_matrix)

        # TODO: (?) For when we optimize SVM SV kernel computation
        # self._dist_kernel = \
        #    compute_distance_kernel(self._descr_matrix,
        #                            histogram_intersection_distance2,
        #                            row_wise=True)

        if self.descr_cache_fp:
            with open(self.descr_cache_fp, 'wb') as f:
                pickle.dump(self._descr_cache, f, -1)
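The worker above returns its input alongside the derived vector so the association survives parallel execution, and ``ordered=True`` keeps cache positions aligned with input order. A self-contained sketch of that pairing pattern:

from smqtk.utils.parallel import parallel_map

pairs = parallel_map(lambda x: (x, x * x), range(5),
                     ordered=True,
                     use_multiprocessing=False)
# Enumeration is safe because ordering was requested.
for i, (x, sq) in enumerate(pairs):
    assert x == i and sq == i * i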
Example 23
    def test_multisequence_fill_void(self):
        def test_func(a, b, c):
            return a + b + c

        s1 = [1] * 10
        s2 = [2] * 4
        s3 = [3] * 10
        r = list(parallel_map(test_func, s1, s2, s3,
                              use_multiprocessing=False,
                              fill_void=10,
                              ordered=True))

        expected = [6] * 4 + [14] * 6
        nose.tools.assert_equal(r, expected)
Example 24
    def test_multisequence_fill_void(self):
        def test_func(a, b, c):
            return a + b + c

        s1 = [1] * 10
        s2 = [2] * 4
        s3 = [3] * 10
        r = list(parallel_map(test_func, s1, s2, s3,
                              use_multiprocessing=False,
                              fill_void=10,
                              ordered=True))

        expected = [6] * 4 + [14] * 6
        self.assertEqual(r, expected)
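Together, the short-cutoff and fill-void tests pin down the length semantics: without ``fill_void``, iteration stops at the shortest input; with it, shorter inputs are padded with the fill value so every position of the longest input is yielded. A minimal sketch assuming those same keywords:

from smqtk.utils.parallel import parallel_map

def add(a, b):
    return a + b

short = parallel_map(add, [1] * 10, [2] * 4,
                     use_multiprocessing=False, ordered=True)
padded = parallel_map(add, [1] * 10, [2] * 4,
                      use_multiprocessing=False, fill_void=0, ordered=True)
assert list(short) == [3] * 4             # truncated at the shorter input
assert list(padded) == [3] * 4 + [1] * 6  # missing 2s replaced by fill 0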
Example 25
    def build_index(self, descriptors):
        """
        Build the index based on the given iterable of descriptor elements.

        Subsequent calls to this method should rebuild the index, not add to
        it.

        :raises ValueError: No data available in the given iterable.

        :param descriptors:
            Iterable of descriptor elements to build index over.
        :type descriptors:
            collections.Iterable[smqtk.representation.DescriptorElement]

        """
        # ordered cache of descriptors in our index.
        self._descr_cache = []
        # Reverse mapping of a descriptor's vector to its index in the cache
        # and subsequently in the distance kernel.
        self._descr2index = {}
        # matrix for creating distance kernel
        self._descr_matrix = []

        def get_vector(d_elem):
            return d_elem, d_elem.vector()

        # noinspection PyTypeChecker
        vector_iter = parallel_map(get_vector, descriptors,
                                   name='vector_iter',
                                   use_multiprocessing=self.multiprocess_fetch,
                                   cores=self.cores,
                                   ordered=True)

        for i, (d, v) in enumerate(vector_iter):
            self._descr_cache.append(d)
            # ``_descr_matrix`` is a list, currently.
            # noinspection PyUnresolvedReferences
            self._descr_matrix.append(v)
            self._descr2index[tuple(v)] = i
        self._descr_matrix = numpy.array(self._descr_matrix)

        # TODO: (?) For when we optimize SVM SV kernel computation
        # self._dist_kernel = \
        #    compute_distance_kernel(self._descr_matrix,
        #                            histogram_intersection_distance2,
        #                            row_wise=True)

        if self.descr_cache_fp:
            with open(self.descr_cache_fp, 'wb') as f:
                pickle.dump(self._descr_cache, f, -1)
Example 26
    def test_nested_multiprocessing(self):
        # char -> char -> ord -> char -> ord
        g1 = parallel_map(lambda e: e, self.test_string,
                          ordered=True,
                          use_multiprocessing=True,
                          cores=2)
        g2 = parallel_map(ord, g1,
                          ordered=True,
                          use_multiprocessing=True,
                          cores=2)
        g3 = parallel_map(chr, g2,
                          ordered=True,
                          use_multiprocessing=True,
                          cores=2)
        g4 = parallel_map(ord, g3,
                          ordered=True,
                          use_multiprocessing=True,
                          cores=2)

        expected = list(map(ord, self.test_string))
        nose.tools.assert_equal(
            list(g4),
            expected
        )
Example 27
def main():
    # Print help and exit if no arguments were passed
    if len(sys.argv) == 1:
        get_cli_parser().print_help()
        sys.exit(1)

    args = get_cli_parser().parse_args()
    llevel = logging.INFO if not args.verbose else logging.DEBUG
    initialize_logging(logging.getLogger('smqtk'), llevel)
    initialize_logging(logging.getLogger('__main__'), llevel)

    log = logging.getLogger(__name__)
    log.debug('Showing debug messages.')

    if args.file_list is not None and not os.path.exists(args.file_list):
        log.error('Invalid file list path: %s', args.file_list)
        exit(103)

    def check_image(image_path):
        if not os.path.exists(image_path):
            log.warning('Invalid image path given (does not exist): %s',
                        image_path)
            return False, False
        else:
            d = DataFileElement(image_path)
            return is_valid_element(d, check_image=True), d

    with open(args.file_list) as infile:
        checked_images = parallel.parallel_map(check_image,
                                               map(str.strip, infile),
                                               name='check-image-validity',
                                               use_multiprocessing=True)

        for is_valid, dfe in checked_images:
            if dfe:  # in the case of a non-existent file
                if (is_valid and not args.invert) or \
                        (not is_valid and args.invert):
                    # We know the callback above is creating DataFileElement
                    # instances.
                    # noinspection PyProtectedMember
                    print('%s,%s' % (dfe._filepath, dfe.uuid()))
Example 28
def iter_valid_elements(dataElementUris, valid_content_types):
    """
    Find the GirderDataElements which are loadable images and
    valid according to valid_content_types.

    :param dataElementUris: A list of Girder Data Element URIs.
    :param valid_content_types: A list of valid content types, generally
        passed by a descriptor generator.
    :returns: A generator over valid GirderDataElements.
    :rtype: generator
    """
    def is_valid(dataElementUri):
        dfe = GirderDataElement.from_uri(dataElementUri)

        if is_valid_element(dfe,
                            valid_content_types=valid_content_types,
                            check_image=True):
            return dfe
        else:
            return False

    # ``filter`` replaces the Python-2-only ``itertools.ifilter``.
    return filter(None, parallel.parallel_map(is_valid,
                                              dataElementUris,
                                              use_multiprocessing=False))
Example 29
                while i < total:
                    b_start = i
                    b_end = i + batch_size
                    for h in q[b_start:b_end].execute():
                        # noinspection PyProtectedMember
                        yield h.meta._d_
                        i += 1
            except elasticsearch.ConnectionTimeout as ex:
                log.warning("ElasticSearch timed out (error = %s)", str(ex))
                restart = True
                log.debug("Restarting query from index %d", i)

    log.info("Initializing image download/record parallel iterator")
    img_dl_records = parallel_map(
        dl_image, iter_scan_meta(),
        name='image_download',
        use_multiprocessing=True,
        cores=cores
    )

    # Write out
    log.info("Starting iteration/file-write")
    rp_state = [0] * 7
    with open(scan_record, 'w') as record_file:
        for r in img_dl_records:
            if r is not None:
                cdr_id, local_path, uuid = r
                record_file.write('%s,%s,%s\n'
                                  % (cdr_id, local_path, uuid))
            report_progress(log.debug, rp_state, 1.0)
        # Final report
        rp_state[1] -= 1
Example 30
def main():
    description = """
    Script for asynchronously computing classifications for DescriptorElements
    in a DescriptorIndex specified via a list of UUIDs. Results are output to a
    CSV file in the format:

        uuid, label1_confidence, label2_confidence, ...

    CSV column labels are output to the given CSV header file path. Label
    columns will be in the order reported by the classifier implementation's
    ``get_labels`` method.

    Due to using an input file-list of UUIDs, we require that the UUIDs of
    indexed descriptors be strings, or equality comparable to the UUIDs' string
    representation.
    """

    args, config = bin_utils.utility_main_helper(
        default_config,
        description,
        extend_parser,
    )
    log = logging.getLogger(__name__)

    # - parallel_map UUIDs to load from the configured index
    # - classify iterated descriptors

    uuids_list_filepath = args.uuids_list
    output_csv_filepath = args.csv_data
    output_csv_header_filepath = args.csv_header
    classify_overwrite = config['utility']['classify_overwrite']

    p_use_multiprocessing = \
        config['utility']['parallel']['use_multiprocessing']
    p_index_extraction_cores = \
        config['utility']['parallel']['index_extraction_cores']
    p_classification_cores = \
        config['utility']['parallel']['classification_cores']

    if not uuids_list_filepath:
        raise ValueError("No uuids_list_filepath specified.")
    elif not os.path.isfile(uuids_list_filepath):
        raise ValueError("Given uuids_list_filepath did not point to a file.")
    if output_csv_header_filepath is None:
        raise ValueError("Need a path to save CSV header labels")
    if output_csv_filepath is None:
        raise ValueError("Need a path to save CSV data.")

    #
    # Initialize configured plugins
    #

    log.info("Initializing descriptor index")
    #: :type: smqtk.representation.DescriptorIndex
    descriptor_index = plugin.from_plugin_config(
        config['plugins']['descriptor_index'],
        get_descriptor_index_impls()
    )

    log.info("Initializing classification factory")
    c_factory = ClassificationElementFactory.from_config(
        config['plugins']['classification_factory']
    )

    log.info("Initializing classifier")
    #: :type: smqtk.algorithms.Classifier
    classifier = plugin.from_plugin_config(
        config['plugins']['classifier'], get_classifier_impls()
    )

    #
    # Setup/Process
    #
    def iter_uuids():
        with open(uuids_list_filepath) as f:
            for l in f:
                yield l.strip()

    def descr_for_uuid(uuid):
        """
        :type uuid: collections.Hashable
        :rtype: smqtk.representation.DescriptorElement
        """
        return descriptor_index.get_descriptor(uuid)

    def classify_descr(d):
        """
        :type d: smqtk.representation.DescriptorElement
        :rtype: smqtk.representation.ClassificationElement
        """
        return classifier.classify(d, c_factory, classify_overwrite)

    log.info("Initializing uuid-to-descriptor parallel map")
    #: :type: collections.Iterable[smqtk.representation.DescriptorElement]
    element_iter = parallel.parallel_map(
        descr_for_uuid, iter_uuids(),
        use_multiprocessing=p_use_multiprocessing,
        cores=p_index_extraction_cores,
        name="descr_for_uuid",
    )

    log.info("Initializing descriptor-to-classification parallel map")
    #: :type: collections.Iterable[smqtk.representation.ClassificationElement]
    classification_iter = parallel.parallel_map(
        classify_descr, element_iter,
        use_multiprocessing=p_use_multiprocessing,
        cores=p_classification_cores,
        name='classify_descr',
    )

    #
    # Write/Output files
    #

    c_labels = classifier.get_labels()

    def make_row(c):
        """
        :type c: smqtk.representation.ClassificationElement
        """
        c_m = c.get_classification()
        return [c.uuid] + [c_m[l] for l in c_labels]

    # column labels file
    log.info("Writing CSV column header file: %s", output_csv_header_filepath)
    file_utils.safe_create_dir(os.path.dirname(output_csv_header_filepath))
    with open(output_csv_header_filepath, 'w', newline='') as f_csv:
        w = csv.writer(f_csv)
        w.writerow(['uuid'] + c_labels)

    # CSV file
    log.info("Writing CSV data file: %s", output_csv_filepath)
    file_utils.safe_create_dir(os.path.dirname(output_csv_filepath))
    r_state = [0] * 7
    with open(output_csv_filepath, 'w', newline='') as f_csv:
        w = csv.writer(f_csv)
        for c in classification_iter:
            w.writerow(make_row(c))
            bin_utils.report_progress(log.info, r_state, 1.0)

    # Final report
    r_state[1] -= 1
    bin_utils.report_progress(log.info, r_state, 0)

    log.info("Done")
Example 31
def main():
    args = cli_parser().parse_args()
    config = bin_utils.utility_main_helper(default_config, args)
    log = logging.getLogger(__name__)

    #
    # Initialize stuff from configuration
    #
    #: :type: smqtk.algorithms.Classifier
    classifier = plugin.from_plugin_config(
        config['plugins']['classifier'],
        get_classifier_impls()
    )
    #: :type: ClassificationElementFactory
    classification_factory = ClassificationElementFactory.from_config(
        config['plugins']['classification_factory']
    )
    #: :type: smqtk.representation.DescriptorIndex
    descriptor_index = plugin.from_plugin_config(
        config['plugins']['descriptor_index'],
        get_descriptor_index_impls()
    )

    uuid2label_filepath = config['utility']['csv_filepath']
    do_train = config['utility']['train']
    output_uuid_cm = config['utility']['output_uuid_confusion_matrix']
    plot_filepath_pr = config['utility']['output_plot_pr']
    plot_filepath_roc = config['utility']['output_plot_roc']
    plot_filepath_cm = config['utility']['output_plot_confusion_matrix']
    plot_ci = config['utility']['curve_confidence_interval']
    plot_ci_alpha = config['utility']['curve_confidence_interval_alpha']

    #
    # Construct a mapping from each truth label to the DescriptorElement
    # instances described by that label.
    #
    log.info("Loading descriptors by UUID")

    def iter_uuid_label():
        """ Iterate through UUIDs in specified file """
        with open(uuid2label_filepath) as uuid2label_file:
            reader = csv.reader(uuid2label_file)
            for r in reader:
                # TODO: This will need to be updated to handle multiple labels
                #       per descriptor.
                yield r[0], r[1]

    def get_descr(r):
        """ Fetch descriptors from configured index """
        uuid, truth_label = r
        return truth_label, descriptor_index.get_descriptor(uuid)

    tlabel_element_iter = parallel.parallel_map(
        get_descr, iter_uuid_label(),
        name="cmv_get_descriptors",
        use_multiprocessing=True,
        cores=config['parallelism']['descriptor_fetch_cores'],
    )

    # Map of truth labels to descriptors of labeled data
    #: :type: dict[str, list[smqtk.representation.DescriptorElement]]
    tlabel2descriptors = {}
    for tlabel, d in tlabel_element_iter:
        tlabel2descriptors.setdefault(tlabel, []).append(d)

    # Train classifier if the one given has a ``train`` method and training
    # was enabled.
    if do_train:
        if isinstance(classifier, SupervisedClassifier):
            log.info("Training classifier model")
            classifier.train(tlabel2descriptors)
            exit(0)
        else:
            ValueError("Configured classifier is not a SupervisedClassifier "
                       "type and does not support training.")

    #
    # Apply classifier to descriptors for predictions
    #

    # Truth label to predicted classification results
    #: :type: dict[str, set[smqtk.representation.ClassificationElement]]
    tlabel2classifications = {}
    for tlabel, descriptors in six.iteritems(tlabel2descriptors):
        tlabel2classifications[tlabel] = \
            set(classifier.classify_async(
                descriptors, classification_factory,
                use_multiprocessing=True,
                procs=config['parallelism']['classification_cores'],
                ri=1.0,
            ).values())
    log.info("Truth label counts:")
    for l in sorted(tlabel2classifications):
        log.info("  %s :: %d", l, len(tlabel2classifications[l]))

    #
    # Confusion Matrix
    #
    conf_mat, labels = gen_confusion_matrix(tlabel2classifications)
    log.info("Confusion_matrix")
    log_cm(log.info, conf_mat, labels)
    if plot_filepath_cm:
        plot_cm(conf_mat, labels, plot_filepath_cm)

    # Confusion Matrix of descriptor UUIDs to output json
    if output_uuid_cm:
        # Top dictionary keys are true labels, inner dictionary keys are UUID
        # predicted labels.
        log.info("Computing UUID Confusion Matrix")
        #: :type: dict[str, dict[collections.Hashable, set]]
        uuid_cm = {}
        for tlabel in tlabel2classifications:
            uuid_cm[tlabel] = collections.defaultdict(set)
            for c in tlabel2classifications[tlabel]:
                uuid_cm[tlabel][c.max_label()].add(c.uuid)
            # convert sets to lists
            for plabel in uuid_cm[tlabel]:
                uuid_cm[tlabel][plabel] = list(uuid_cm[tlabel][plabel])
        with open(output_uuid_cm, 'w') as f:
            log.info("Saving UUID Confusion Matrix: %s", output_uuid_cm)
            json.dump(uuid_cm, f, indent=2, separators=(',', ': '))

    #
    # Create PR/ROC curves via scikit learn tools
    #
    if plot_filepath_pr:
        log.info("Making PR curve")
        make_pr_curves(tlabel2classifications, plot_filepath_pr,
                       plot_ci, plot_ci_alpha)
    if plot_filepath_roc:
        log.info("Making ROC curve")
        make_roc_curves(tlabel2classifications, plot_filepath_roc,
                        plot_ci, plot_ci_alpha)
Example 32
def mb_kmeans_build_apply(index, mbkm, initial_fit_size):
    """
    Build the MiniBatchKMeans centroids based on the descriptors in the given
    index, then predict descriptor clusters with the resulting model.

    If the given index is empty, no fitting or clustering occurs and an empty
    dictionary is returned.

    :param index: Index of descriptors
    :type index: smqtk.representation.DescriptorIndex

    :param mbkm: Scikit-Learn MiniBatchKMeans instance to train and then use
        for prediction.
    :type mbkm: sklearn.cluster.MiniBatchKMeans

    :param initial_fit_size: Number of descriptors to run an initial fit with.
        This has the advantage of choosing the best initialization point from
        multiple candidates.
    :type initial_fit_size: int

    :return: Dictionary of the cluster label (integer) to the set of descriptor
        UUIDs belonging to that cluster.
    :rtype: dict[int, set[collections.Hashable]]

    """
    log = logging.getLogger(__name__)

    ifit_completed = False
    k_deque = collections.deque()
    d_fitted = 0

    log.info("Getting index keys (shuffled)")
    index_keys = sorted(six.iterkeys(index))
    numpy.random.seed(mbkm.random_state)
    numpy.random.shuffle(index_keys)

    def parallel_iter_vectors(descriptors):
        """ Get the vectors for the descriptors given.
        Not caring about order returned.
        """
        return parallel.parallel_map(lambda d: d.vector(), descriptors,
                                     use_multiprocessing=False)

    def get_vectors(k_iter):
        """ Get numpy array of descriptor vectors (2D array returned) """
        return numpy.array(list(
            parallel_iter_vectors(index.get_many_descriptors(k_iter))
        ))

    log.info("Collecting iteratively fitting model")
    pr = cli.ProgressReporter(log.debug, 1.0).start()
    for i, k in enumerate(index_keys):
        k_deque.append(k)
        pr.increment_report()

        if initial_fit_size and not ifit_completed:
            if len(k_deque) == initial_fit_size:
                log.info("Initial fit using %d descriptors", len(k_deque))
                log.info("- collecting vectors")
                vectors = get_vectors(k_deque)
                log.info("- fitting model")
                mbkm.fit(vectors)
                log.info("- cleaning")
                d_fitted += len(vectors)
                k_deque.clear()
                ifit_completed = True
        elif len(k_deque) == mbkm.batch_size:
            log.info("Partial fit with batch size %d", len(k_deque))
            log.info("- collecting vectors")
            vectors = get_vectors(k_deque)
            log.info("- fitting model")
            mbkm.partial_fit(vectors)
            log.info("- cleaning")
            d_fitted += len(k_deque)
            k_deque.clear()
    pr.report()

    # Final fit with any remaining descriptors
    if k_deque:
        log.info("Final partial fit of size %d", len(k_deque))
        log.info('- collecting vectors')
        vectors = get_vectors(k_deque)
        log.info('- fitting model')
        mbkm.partial_fit(vectors)
        log.info('- cleaning')
        d_fitted += len(k_deque)
        k_deque.clear()

    log.info("Computing descriptor classes with final KMeans model")
    mbkm.verbose = False
    d_classes = collections.defaultdict(set)
    d_uv_iter = parallel.parallel_map(lambda d: (d.uuid(), d.vector()),
                                      index,
                                      use_multiprocessing=False,
                                      name="uv-collector")
    # TODO: Batch predict call inputs to something larger than one at a time.
    d_uc_iter = parallel.parallel_map(
        lambda u_v: (u_v[0], mbkm.predict(u_v[1][numpy.newaxis, :])[0]),
        d_uv_iter,
        use_multiprocessing=False,
        name="uc-collector")
    pr = cli.ProgressReporter(log.debug, 1.0).start()
    for uuid, c in d_uc_iter:
        d_classes[c].add(uuid)
        pr.increment_report()
    pr.report()

    return d_classes
Example 33
def compute_hash_codes(uuids, index, functor, report_interval=1.0, use_mp=False,
                       ordered=False):
    """
    Given an iterable of DescriptorElement UUIDs, asynchronously access them
    from the given ``index``, asynchronously compute hash codes via ``functor``
    and convert to an integer, yielding (UUID, hash-int) pairs.

    :param uuids: Sequence of UUIDs to process
    :type uuids: collections.Iterable[collections.Hashable]

    :param index: Descriptor index to pull from.
    :type index: smqtk.representation.descriptor_index.DescriptorIndex

    :param functor: LSH hash code functor instance
    :type functor: smqtk.algorithms.LshFunctor

    :param report_interval: Frequency in seconds at which we report speed and
        completion progress via logging. Reporting is disabled when the logger
        is not at DEBUG level or when this value is not greater than 0.
    :type report_interval: float

    :param use_mp: If multiprocessing should be used for parallel
        computation vs. threading. Reminder: This will copy currently loaded
        objects onto worker processes (e.g. the given index), which could lead
        to dangerously high RAM consumption.
    :type use_mp: bool

    :param ordered: Whether the yielded (UUID, hash-int) pairs are in the
        same order as the input UUID values. This function should be slightly
        faster when ordering is not required.
    :type ordered: bool

    :return: Generator instance yielding (UUID, hash-int) value pairs.

    """
    # TODO: parallel map fetch elements from index?
    #       -> separately from compute

    def get_hash(u):
        v = index.get_descriptor(u).vector()
        return u, bit_utils.bit_vector_to_int_large(functor.get_hash(v))

    # Setup log and reporting function
    log = logging.getLogger(__name__)

    if log.getEffectiveLevel() > logging.DEBUG or report_interval <= 0:
        def log_func(*_, **__):
            return
        log.debug("Not logging progress")
    else:
        log.debug("Logging progress at %f second intervals", report_interval)
        log_func = log.debug

    log.debug("Starting computation")
    reporter = bin_utils.ProgressReporter(log_func, report_interval)
    reporter.start()
    for uuid, hash_int in parallel.parallel_map(get_hash, uuids,
                                                ordered=ordered,
                                                use_multiprocessing=use_mp):
        yield (uuid, hash_int)
        # Progress reporting
        reporter.increment_report()

    # Final report
    reporter.report()
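A hedged consumption sketch for the generator above, mirroring how the dict-returning variants in the following examples accumulate results; ``uuids``, ``index``, and ``functor`` are assumptions standing in for the configured inputs:

# Hypothetical usage: build a hash-int -> UUID-set mapping from the generator.
hash2uuids = {}
for uuid, hash_int in compute_hash_codes(uuids, index, functor,
                                         report_interval=0, use_mp=False):
    hash2uuids.setdefault(hash_int, set()).add(uuid)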
Example 34
def compute_hash_codes(uuids, index, functor, hash2uuids=None,
                       report_interval=1.0, use_mp=False):
    """
    Given an iterable of DescriptorElement UUIDs, asynchronously access them
    from the given ``index``, asynchronously compute hash codes via ``functor``
    and convert to integers, accumulating a mapping of hash codes to UUID sets.

    The dictionary input and returned is of the same format used by the
    ``LSHNearestNeighborIndex`` implementation (mapping pointed to by the
    ``hash2uuid_cache_filepath`` attribute).

    :param uuids: Sequence of UUIDs to process
    :type uuids: collections.Iterable[collections.Hashable]

    :param index: Descriptor index to pull from.
    :type index: smqtk.representation.descriptor_index.DescriptorIndex

    :param functor: LSH hash code functor instance
    :type functor: smqtk.algorithms.LshFunctor

    :param hash2uuids: Hash code to UUID set to update, which is also returned
        from this function. If not provided, we will start a new mapping, which
        is returned instead.
    :type hash2uuids: dict[int|long, set[collections.Hashable]]

    :param report_interval: Frequency in seconds at which we report speed and
        completion progress via logging. Reporting is disabled when the logger
        is not at DEBUG level or when this value is not greater than 0.
    :type report_interval: float

    :param use_mp: If multiprocessing should be used for parallel
        computation vs. threading. Reminder: This will copy currently loaded
        objects onto worker processes (e.g. the given index), which could lead
        to dangerously high RAM consumption.
    :type use_mp: bool

    :return: The ``hash2uuids`` mapping provided or, if None was provided, a
        new mapping.
    :rtype: dict[int|long, set[collections.Hashable]]

    """
    if hash2uuids is None:
        hash2uuids = {}

    # TODO: parallel map fetch elements from index?
    #       -> separately from compute

    def get_hash(u):
        v = index.get_descriptor(u).vector()
        return u, bit_utils.bit_vector_to_int_large(functor.get_hash(v))

    # Setup log and reporting function
    log = logging.getLogger(__name__)
    report_state = [0] * 7

    # noinspection PyGlobalUndefined
    if log.getEffectiveLevel() > logging.DEBUG or report_interval <= 0:
        def report_progress(*_):
            return
        log.debug("Not logging progress")
    else:
        log.debug("Logging progress at %f second intervals", report_interval)
        report_progress = bin_utils.report_progress

    log.debug("Starting computation")
    for uuid, hash_int in parallel.parallel_map(get_hash, uuids,
                                                ordered=False,
                                                use_multiprocessing=use_mp):
        if hash_int not in hash2uuids:
            hash2uuids[hash_int] = set()
        hash2uuids[hash_int].add(uuid)

        # Progress reporting
        report_progress(log.debug, report_state, report_interval)

    # Final report
    report_state[1] -= 1
    report_progress(log.debug, report_state, 0.0)

    return hash2uuids
Example 35
def compute_hash_codes(uuids,
                       index,
                       functor,
                       hash2uuids=None,
                       report_interval=1.0,
                       use_mp=False):
    """
    Given an iterable of DescriptorElement UUIDs, asynchronously access them
    from the given ``index``, asynchronously compute hash codes via ``functor``
    and convert to integers, accumulating a mapping of hash codes to UUID sets.

    The dictionary input and returned is of the same format used by the
    ``LSHNearestNeighborIndex`` implementation (mapping pointed to by the
    ``hash2uuid_cache_filepath`` attribute).

    :param uuids: Sequence of UUIDs to process
    :type uuids: collections.Iterable[collections.Hashable]

    :param index: Descriptor index to pull from.
    :type index: smqtk.representation.descriptor_index.DescriptorIndex

    :param functor: LSH hash code functor instance
    :type functor: smqtk.algorithms.LshFunctor

    :param hash2uuids: Hash code to UUID set to update, which is also returned
        from this function. If not provided, we will start a new mapping, which
        is returned instead.
    :type hash2uuids: dict[int|long, set[collections.Hashable]]

    :param report_interval: Frequency in seconds at which we report speed and
        completion progress via logging. Reporting is disabled when the logger
        is not at DEBUG level or when this value is not greater than 0.
    :type report_interval: float

    :param use_mp: If multiprocessing should be used for parallel
        computation vs. threading. Reminder: This will copy currently loaded
        objects onto worker processes (e.g. the given index), which could lead
        to dangerously high RAM consumption.
    :type use_mp: bool

    :return: The ``hash2uuids`` mapping provided or, if None was provided, a
        new mapping.
    :rtype: dict[int|long, set[collections.Hashable]]

    """
    if hash2uuids is None:
        hash2uuids = {}

    # TODO: parallel map fetch elements from index?
    #       -> separately from compute

    def get_hash(u):
        v = index.get_descriptor(u).vector()
        return u, bit_utils.bit_vector_to_int_large(functor.get_hash(v))

    # Setup log and reporting function
    log = logging.getLogger(__name__)
    report_state = [0] * 7

    # noinspection PyGlobalUndefined
    if log.getEffectiveLevel() > logging.DEBUG or report_interval <= 0:

        def report_progress(*_):
            return

        log.debug("Not logging progress")
    else:
        log.debug("Logging progress at %f second intervals", report_interval)
        report_progress = bin_utils.report_progress

    log.debug("Starting computation")
    for uuid, hash_int in parallel.parallel_map(get_hash,
                                                uuids,
                                                ordered=False,
                                                use_multiprocessing=use_mp):
        if hash_int not in hash2uuids:
            hash2uuids[hash_int] = set()
        hash2uuids[hash_int].add(uuid)

        # Progress reporting
        report_progress(log.debug, report_state, report_interval)

    # Final report
    report_state[1] -= 1
    report_progress(log.debug, report_state, 0.0)

    return hash2uuids
Example 36
log.info("Loading resource files")
with open(ad_image_csv) as f:
    url_ad_label_rows = list(csv.reader(f))
with open(ad_phone_csv) as f:
    ad2phone = dict(csv.reader(f))

# Download unique img_urls, get filepaths + SHA1 checksum
url_set = set(r[0] for r in url_ad_label_rows)
print "%d unique URLs" % len(url_set)
# URL to (filepath, sha1sum) tuple
#: :type: dict[str, (str, str)]
url2fs = {}
for url, save_pth, sha1 in parallel_map(dl_ad_image, url_set,
                                        itertools.repeat(image_output_dir),
                                        name='image_downloader',
                                        use_multiprocessing=True,
                                        # cores=32,
                                        cores=128,
                                        ):
    if url:
        url2fs[url] = (save_pth, sha1)
log.info("Downloaded %d images", len(url2fs))

log.info("Forming relational mappings")
# save mapping of SHA1 to filepath
# save mapping of CDR-ID to set of child image SHA1s
# save mapping of CDR-ID to label
#: :type: dict[str, str]
sha2path = {}
#: :type: dict[str, set[str]]
ad2shas = collections.defaultdict(set)
Example 37
def main():
    args = cli_parser().parse_args()
    config = bin_utils.utility_main_helper(default_config, args)
    log = logging.getLogger(__name__)

    #
    # Initialize stuff from configuration
    #
    #: :type: smqtk.algorithms.Classifier
    classifier = plugin.from_plugin_config(config['plugins']['classifier'],
                                           get_classifier_impls())
    #: :type: ClassificationElementFactory
    classification_factory = ClassificationElementFactory.from_config(
        config['plugins']['classification_factory'])
    #: :type: smqtk.representation.DescriptorIndex
    descriptor_index = plugin.from_plugin_config(
        config['plugins']['descriptor_index'], get_descriptor_index_impls())

    uuid2label_filepath = config['utility']['csv_filepath']
    do_train = config['utility']['train']
    output_uuid_cm = config['utility']['output_uuid_confusion_matrix']
    plot_filepath_pr = config['utility']['output_plot_pr']
    plot_filepath_roc = config['utility']['output_plot_roc']
    plot_filepath_cm = config['utility']['output_plot_confusion_matrix']
    plot_ci = config['utility']['curve_confidence_interval']
    plot_ci_alpha = config['utility']['curve_confidence_interval_alpha']

    #
    # Construct a mapping from each truth label to the DescriptorElement
    # instances described by that label.
    #
    log.info("Loading descriptors by UUID")

    def iter_uuid_label():
        """ Iterate through UUIDs in specified file """
        with open(uuid2label_filepath) as uuid2label_file:
            reader = csv.reader(uuid2label_file)
            for r in reader:
                # TODO: This will need to be updated to handle multiple labels
                #       per descriptor.
                yield r[0], r[1]

    def get_descr(r):
        """ Fetch descriptors from configured index """
        uuid, truth_label = r
        return truth_label, descriptor_index.get_descriptor(uuid)

    tlabel_element_iter = parallel.parallel_map(
        get_descr,
        iter_uuid_label(),
        name="cmv_get_descriptors",
        use_multiprocessing=True,
        cores=config['parallelism']['descriptor_fetch_cores'],
    )

    # Map of truth labels to descriptors of labeled data
    #: :type: dict[str, list[smqtk.representation.DescriptorElement]]
    tlabel2descriptors = {}
    for tlabel, d in tlabel_element_iter:
        tlabel2descriptors.setdefault(tlabel, []).append(d)

    # Train classifier if the one given has a ``train`` method and training
    # was enabled.
    if do_train:
        if isinstance(classifier, SupervisedClassifier):
            log.info("Training classifier model")
            classifier.train(tlabel2descriptors)
            exit(0)
        else:
            ValueError("Configured classifier is not a SupervisedClassifier "
                       "type and does not support training.")

    #
    # Apply classifier to descriptors for predictions
    #

    # Truth label to predicted classification results
    #: :type: dict[str, set[smqtk.representation.ClassificationElement]]
    tlabel2classifications = {}
    for tlabel, descriptors in tlabel2descriptors.items():
        tlabel2classifications[tlabel] = \
            set(classifier.classify_async(
                descriptors, classification_factory,
                use_multiprocessing=True,
                procs=config['parallelism']['classification_cores'],
                ri=1.0,
            ).values())
    log.info("Truth label counts:")
    for l in sorted(tlabel2classifications):
        log.info("  %s :: %d", l, len(tlabel2classifications[l]))

    #
    # Confusion Matrix
    #
    conf_mat, labels = gen_confusion_matrix(tlabel2classifications)
    log.info("Confusion_matrix")
    log_cm(log.info, conf_mat, labels)
    if plot_filepath_cm:
        plot_cm(conf_mat, labels, plot_filepath_cm)

    # CM of descriptor UUIDs to output json
    if output_uuid_cm:
        # Top dictionary keys are true labels, inner dictionary keys are UUID
        # predicted labels.
        log.info("Computing UUID Confusion Matrix")
        #: :type: dict[str, dict[str, set | list]]
        uuid_cm = {}
        for tlabel in tlabel2classifications:
            uuid_cm[tlabel] = collections.defaultdict(set)
            for c in tlabel2classifications[tlabel]:
                uuid_cm[tlabel][c.max_label()].add(c.uuid)
            # convert sets to lists
            for plabel in uuid_cm[tlabel]:
                uuid_cm[tlabel][plabel] = list(uuid_cm[tlabel][plabel])
        with open(output_uuid_cm, 'w') as f:
            log.info("Saving UUID Confusion Matrix: %s", output_uuid_cm)
            json.dump(uuid_cm, f, indent=2, separators=(',', ': '))

    #
    # Create PR/ROC curves via scikit learn tools
    #
    if plot_filepath_pr:
        log.info("Making PR curve")
        make_pr_curves(tlabel2classifications, plot_filepath_pr, plot_ci,
                       plot_ci_alpha)
    if plot_filepath_roc:
        log.info("Making ROC curve")
        make_roc_curves(tlabel2classifications, plot_filepath_roc, plot_ci,
                        plot_ci_alpha)
Example 38
    def classify_async(self, d_iter, factory, overwrite=False, procs=None,
                       use_multiprocessing=False, ri=None):
        """
        Asynchronously classify the DescriptorElements in the given iterable.

        :param d_iter: Iterable of DescriptorElements
        :type d_iter:
            collections.Iterable[smqtk.representation.DescriptorElement]

        :param factory: Classifier element factory to use for element
            generation.
        :type factory: smqtk.representation.ClassificationElementFactory

        :param overwrite: Recompute classification of the input descriptor and
            set the results to the ClassificationElement produced by the
            factory.
        :type overwrite: bool

        :param procs: Explicit number of cores/thread/processes to use.
        :type procs: None | int

        :param use_multiprocessing: Use ``multiprocessing.pool.Pool`` instead of
            ``multiprocessing.pool.ThreadPool``.
        :type use_multiprocessing: bool

        :param ri: Progress reporting interval in seconds. Set to a value > 0 to
            enable. Disabled by default.
        :type ri: float | None

        :return: Mapping of input DescriptorElement instances to the computed
            ClassificationElement. ClassificationElement UUIDs are congruent
            with the UUID of the DescriptorElement.
        :rtype: dict[smqtk.representation.DescriptorElement,
                     smqtk.representation.ClassificationElement]

        """
        self._log.debug("Async classifying descriptors")
        ri = ri and ri > 0 and ri

        def work(d_elem):
            return d_elem, self.classify(d_elem, factory, overwrite)

        classifications = parallel.parallel_map(
            work, d_iter,
            cores=procs,
            ordered=False,
            use_multiprocessing=use_multiprocessing,
        )

        r_state = [0] * 7
        if ri:
            r_progress = bin_utils.report_progress
        else:
            def r_progress(*_):
                return

        d2c_map = {}
        for d, c in classifications:
            d2c_map[d] = c

            r_progress(self._log.debug, r_state, ri)

        return d2c_map
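
The ``work`` helper above pairs each input with its result so that unordered
parallel output can still be collected into a dictionary keyed on the input.
A minimal, self-contained sketch of the same pattern (only the documented
``parallel_map`` API is assumed):

from smqtk.utils import parallel


def work(x):
    # Return the input alongside the result so unordered results can
    # still be keyed correctly on collection.
    return x, x * x


# ordered=False lets results arrive as workers finish; building a dict
# restores the input-to-result association regardless of arrival order.
x2sq = dict(parallel.parallel_map(work, range(10),
                                  cores=2,
                                  ordered=False,
                                  use_multiprocessing=False))
assert x2sq[3] == 9
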
Example No. 42
def main():
    args = cli_parser().parse_args()
    config = bin_utils.utility_main_helper(default_config, args)
    log = logging.getLogger(__name__)

    # - parallel_map UUIDs to load from the configured index
    # - classify iterated descriptors

    uuids_list_filepath = args.uuids_list
    output_csv_filepath = args.csv_data
    output_csv_header_filepath = args.csv_header
    classify_overwrite = config['utility']['classify_overwrite']

    p_use_multiprocessing = \
        config['utility']['parallel']['use_multiprocessing']
    p_index_extraction_cores = \
        config['utility']['parallel']['index_extraction_cores']
    p_classification_cores = \
        config['utility']['parallel']['classification_cores']

    if not uuids_list_filepath:
        raise ValueError("No uuids_list_filepath specified.")
    elif not os.path.isfile(uuids_list_filepath):
        raise ValueError("Given uuids_list_filepath did not point to a file.")
    if output_csv_header_filepath is None:
        raise ValueError("Need a path to save CSV header labels")
    if output_csv_filepath is None:
        raise ValueError("Need a path to save CSV data.")

    #
    # Initialize configured plugins
    #

    log.info("Initializing descriptor index")
    #: :type: smqtk.representation.DescriptorIndex
    descriptor_index = plugin.from_plugin_config(
        config['plugins']['descriptor_index'], get_descriptor_index_impls())

    log.info("Initializing classification factory")
    c_factory = ClassificationElementFactory.from_config(
        config['plugins']['classification_factory'])

    log.info("Initializing classifier")
    #: :type: smqtk.algorithms.Classifier
    classifier = plugin.from_plugin_config(config['plugins']['classifier'],
                                           get_classifier_impls())

    #
    # Setup/Process
    #
    def iter_uuids():
        with open(uuids_list_filepath) as f:
            for l in f:
                yield l.strip()

    def descr_for_uuid(uuid):
        """
        :type uuid: collections.Hashable
        :rtype: smqtk.representation.DescriptorElement
        """
        return descriptor_index.get_descriptor(uuid)

    def classify_descr(d):
        """
        :type d: smqtk.representation.DescriptorElement
        :rtype: smqtk.representation.ClassificationElement
        """
        return classifier.classify(d, c_factory, classify_overwrite)

    log.info("Initializing uuid-to-descriptor parallel map")
    #: :type: collections.Iterable[smqtk.representation.DescriptorElement]
    element_iter = parallel.parallel_map(
        descr_for_uuid,
        iter_uuids(),
        use_multiprocessing=p_use_multiprocessing,
        cores=p_index_extraction_cores,
        name="descr_for_uuid",
    )

    log.info("Initializing descriptor-to-classification parallel map")
    #: :type: collections.Iterable[smqtk.representation.ClassificationElement]
    classification_iter = parallel.parallel_map(
        classify_descr,
        element_iter,
        use_multiprocessing=p_use_multiprocessing,
        cores=p_classification_cores,
        name='classify_descr',
    )

    #
    # Write/Output files
    #

    c_labels = classifier.get_labels()

    def make_row(c):
        """
        :type c: smqtk.representation.ClassificationElement
        """
        c_m = c.get_classification()
        return [c.uuid] + [c_m[l] for l in c_labels]

    # column labels file
    log.info("Writing CSV column header file: %s", output_csv_header_filepath)
    file_utils.safe_create_dir(os.path.dirname(output_csv_header_filepath))
    with open(output_csv_header_filepath, 'wb') as f_csv:
        w = csv.writer(f_csv)
        w.writerow(['uuid'] + c_labels)

    # CSV file
    log.info("Writing CSV data file: %s", output_csv_filepath)
    file_utils.safe_create_dir(os.path.dirname(output_csv_filepath))
    r_state = [0] * 7
    with open(output_csv_filepath, 'wb') as f_csv:
        w = csv.writer(f_csv)
        for c in classification_iter:
            w.writerow(make_row(c))
            bin_utils.report_progress(log.info, r_state, 1.0)

    # Final report
    r_state[1] -= 1
    bin_utils.report_progress(log.info, r_state, 0)

    log.info("Done")
Example No. 43
def main():
    args = cli_parser().parse_args()
    config = bin_utils.utility_main_helper(default_config, args)
    log = logging.getLogger(__name__)

    # - parallel_map UUIDs to load from the configured index
    # - classify iterated descriptors

    uuids_list_filepath = args.uuids_list
    output_csv_filepath = args.csv_data
    output_csv_header_filepath = args.csv_header
    classify_overwrite = config['utility']['classify_overwrite']

    p_use_multiprocessing = \
        config['utility']['parallel']['use_multiprocessing']
    p_index_extraction_cores = \
        config['utility']['parallel']['index_extraction_cores']
    p_classification_cores = \
        config['utility']['parallel']['classification_cores']

    if not uuids_list_filepath:
        raise ValueError("No uuids_list_filepath specified.")
    elif not os.path.isfile(uuids_list_filepath):
        raise ValueError("Given uuids_list_filepath did not point to a file.")
    if output_csv_header_filepath is None:
        raise ValueError("Need a path to save CSV header labels")
    if output_csv_filepath is None:
        raise ValueError("Need a path to save CSV data.")

    #
    # Initialize configured plugins
    #

    log.info("Initializing descriptor index")
    #: :type: smqtk.representation.DescriptorIndex
    descriptor_index = plugin.from_plugin_config(
        config['plugins']['descriptor_index'],
        get_descriptor_index_impls()
    )

    log.info("Initializing classification factory")
    c_factory = ClassificationElementFactory.from_config(
        config['plugins']['classification_factory']
    )

    log.info("Initializing classifier")
    #: :type: smqtk.algorithms.Classifier
    classifier = plugin.from_plugin_config(
        config['plugins']['classifier'], get_classifier_impls()
    )

    #
    # Setup/Process
    #
    def iter_uuids():
        with open(uuids_list_filepath) as f:
            for l in f:
                yield l.strip()

    def descr_for_uuid(uuid):
        """
        :type uuid: collections.Hashable
        :rtype: smqtk.representation.DescriptorElement
        """
        return descriptor_index.get_descriptor(uuid)

    def classify_descr(d):
        """
        :type d: smqtk.representation.DescriptorElement
        :rtype: smqtk.representation.ClassificationElement
        """
        return classifier.classify(d, c_factory, classify_overwrite)

    log.info("Initializing uuid-to-descriptor parallel map")
    #: :type: collections.Iterable[smqtk.representation.DescriptorElement]
    element_iter = parallel.parallel_map(
        descr_for_uuid, iter_uuids(),
        use_multiprocessing=p_use_multiprocessing,
        cores=p_index_extraction_cores,
        name="descr_for_uuid",
    )

    log.info("Initializing descriptor-to-classification parallel map")
    #: :type: collections.Iterable[smqtk.representation.ClassificationElement]
    classification_iter = parallel.parallel_map(
        classify_descr, element_iter,
        use_multiprocessing=p_use_multiprocessing,
        cores=p_classification_cores,
        name='classify_descr',
    )

    #
    # Write/Output files
    #

    c_labels = classifier.get_labels()

    def make_row(e):
        """
        :type e: smqtk.representation.ClassificationElement
        """
        c_m = e.get_classification()
        return [e.uuid] + [c_m[l] for l in c_labels]

    # column labels file
    log.info("Writing CSV column header file: %s", output_csv_header_filepath)
    file_utils.safe_create_dir(os.path.dirname(output_csv_header_filepath))
    with open(output_csv_header_filepath, 'wb') as f_csv:
        w = csv.writer(f_csv)
        w.writerow(['uuid'] + [str(cl) for cl in c_labels])

    # CSV file
    log.info("Writing CSV data file: %s", output_csv_filepath)
    file_utils.safe_create_dir(os.path.dirname(output_csv_filepath))
    r_state = [0] * 7
    with open(output_csv_filepath, 'wb') as f_csv:
        w = csv.writer(f_csv)
        for c in classification_iter:
            w.writerow(make_row(c))
            bin_utils.report_progress(log.info, r_state, 1.0)

    # Final report
    r_state[1] -= 1
    bin_utils.report_progress(log.info, r_state, 0)

    log.info("Done")
Example No. 44
                while i < total:
                    b_start = i
                    b_end = i + batch_size
                    for h in q[b_start:b_end].execute():
                        # noinspection PyProtectedMember
                        yield h.meta._d_
                        i += 1
            except elasticsearch.ConnectionTimeout as ex:
                log.warning("ElasticSearch timed out (error = %s)", str(ex))
                restart = True
                log.debug("Restarting query from index %d", i)

    log.info("Initializing image download/record parallel iterator")
    img_dl_records = parallel_map(dl_image,
                                  iter_scan_meta(),
                                  name='image_download',
                                  use_multiprocessing=True,
                                  cores=cores)

    # Write out
    log.info("Starting iteration/file-write")
    rp_state = [0] * 7
    with open(scan_record, 'w') as record_file:
        for r in img_dl_records:
            if r is not None:
                cdr_id, local_path, uuid = r
                record_file.write('%s,%s,%s\n' % (cdr_id, local_path, uuid))
            report_progress(log.debug, rp_state, 1.0)
        # Final report
        rp_state[1] -= 1
        report_progress(log.debug, rp_state, 0)
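
Note how failed downloads surface as ``None`` results rather than exceptions,
so one bad record cannot abort the whole scan; the consumer simply skips them.
The same guard in miniature (``maybe_work`` is a hypothetical stand-in for
``dl_image``):

from smqtk.utils import parallel


def maybe_work(x):
    # Return None on failure instead of raising, so the pipeline keeps
    # flowing past bad inputs.
    if x % 3 == 0:
        return None
    return x, x * 10


kept = [r for r in parallel.parallel_map(maybe_work, range(10),
                                         use_multiprocessing=False)
        if r is not None]
assert len(kept) == 6  # 10 inputs minus the four multiples of 3
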
Example No. 45
def compute_distance_kernel(m, dist_func, row_wise=False, parallel=True):
    """
    Method for computing the distance kernel of an array of vectors given a
    distance function that works on two supplied 1D arrays.

    For a valid distance function interface, see
    ``smqtk.utils.distance_functions.histogram_intersection_distance2``.

    :param m: An array of vectors to compute the pairwise distance kernel for.
    :type m: numpy.core.multiarray.ndarray

    :param dist_func: Distance function
    :type dist_func: (ndarray, ndarray) -> ndarray[float] | float

    :param row_wise: If the given distance function can take a vector and a
        matrix, and computes pair-wise distances, returning a vector of
        distances between the given vector and each row of the matrix.
    :type row_wise: bool

    :param parallel: If distances should be calculated in parallel. This is true
        by default.
    :type parallel: bool

    :return: Computed symmetric distance kernel
    :rtype: numpy.core.multiarray.ndarray

    """
    if hasattr(dist_func, 'im_func'):
        # noinspection PyUnresolvedReferences
        distance_name = '.'.join([
            dist_func.__module__, dist_func.im_class.__name__,
            dist_func.im_func.func_name
        ])
    elif hasattr(dist_func, 'func_name'):
        # noinspection PyUnresolvedReferences
        distance_name = '.'.join([dist_func.__module__, dist_func.func_name])
    elif hasattr(dist_func, 'py_func') \
            and hasattr(dist_func.py_func, 'func_name'):
        distance_name = '.'.join(
            [dist_func.__module__, dist_func.py_func.func_name])
    else:
        distance_name = "<unknown>"
    log = logging.getLogger('compute_distance_kernel[%s]' % distance_name)

    if m.ndim == 1:
        m = m[np.newaxis]

    log.info("Computing distance kernel")
    side = m.shape[0]
    mat = np.ndarray((side, side), dtype=float)

    if row_wise:
        log.debug("Computing row-wise distances")
        # For all rows except the last one; we'll have computed all distances
        # by the time we reach m[side-1].
        if parallel:

            def work_func(i):
                mat[i, i] = 0.
                if i < (side - 1):
                    mat[i + 1:,
                        i] = mat[i, i + 1:] = dist_func(m[i, :], m[i + 1:, :])

            # Using threading for in-place modification
            s = [0] * 7
            for _ in parallel_map(work_func,
                                  xrange(side),
                                  use_multiprocessing=False):
                report_progress(log.debug, s, 1.)
        else:
            for i in xrange(side):
                # Compute col/row wise distances
                mat[i, i] = 0.
                if i < (side - 1):
                    mat[i + 1:,
                        i] = mat[i, i + 1:] = dist_func(m[i, :], m[i + 1:, :])
    else:
        log.debug("Computing element-wise distances")
        if parallel:

            def work_func(i):
                mat[i, i] = 0
                # cols to the left of diagonal index for this row
                for j in xrange(i):
                    mat[i, j] = mat[j, i] = dist_func(m[i], m[j])

            # Using threading for in-place modification
            s = [0] * 7
            for _ in parallel_map(work_func,
                                  xrange(side),
                                  use_multiprocessing=False):
                report_progress(log.debug, s, 1.)
        else:
            for i in xrange(side):
                mat[i, i] = 0
                # cols to the left of diagonal index for this row
                for j in xrange(i):
                    mat[i, j] = mat[j, i] = dist_func(m[i], m[j])

    return mat
Example No. 46
def compute_distance_kernel(m, dist_func, row_wise=False, parallel=True):
    """
    Method for computing the distance kernel of an array of vectors given a
    distance function that works on two supplied 1D arrays.

    For a valid distance function interface, see
    ``smqtk.utils.distance_functions.histogram_intersection_distance2``.

    :param m: An array of vectors to compute the pairwise distance kernel for.
    :type m: numpy.ndarray

    :param dist_func: Distance function
    :type dist_func: (ndarray, ndarray) -> ndarray[float] | float

    :param row_wise: If the given distance function can take a vector and a
        matrix, and computes pair-wise distances, returning a vector of
        distances between the given vector and each row of the matrix.
    :type row_wise: bool

    :param parallel: If distances should be calculated in parallel. This is true
        by default.
    :type parallel: bool

    :return: Computed symmetric distance kernel
    :rtype: numpy.ndarray

    """
    log = logging.getLogger(__name__)

    if m.ndim == 1:
        m = m[np.newaxis]

    log.info("Computing distance kernel")
    side = m.shape[0]
    mat = np.ndarray((side, side), dtype=float)

    s = [0] * 7
    if row_wise:
        log.debug("Computing row-wise distances")
        # For all rows except the last one; we'll have computed all distances
        # by the time we reach m[side-1].
        if parallel:
            # noinspection PyShadowingNames
            def work_func(i):
                mat[i, i] = dist_func(m[i], m[i])
                if i < (side - 1):
                    mat[i + 1:, i] = mat[i, i + 1:] = dist_func(m[i, :],
                                                                m[i + 1:, :])
            # Using threading for in-place modification
            s = [0] * 7
            for _ in parallel_map(work_func, range(side),
                                  use_multiprocessing=False):
                report_progress(log.debug, s, 1.)
        else:
            for i in range(side):
                # Compute col/row wise distances
                mat[i, i] = dist_func(m[i], m[i])
                if i < (side-1):
                    mat[i+1:, i] = mat[i, i+1:] = dist_func(m[i, :], m[i+1:, :])
                report_progress(log.debug, s, 1.)
    else:
        log.debug("Computing element-wise distances")
        if parallel:
            # noinspection PyShadowingNames
            def work_func(i):
                mat[i, i] = dist_func(m[i], m[i])
                # cols to the left of diagonal index for this row
                for j in range(i):
                    mat[i, j] = mat[j, i] = dist_func(m[i], m[j])
            # Using threading for in-place modification
            for _ in parallel_map(work_func, range(side),
                                  use_multiprocessing=False):
                report_progress(log.debug, s, 1.)
        else:
            for i in range(side):
                mat[i, i] = dist_func(m[i], m[i])
                # cols to the left of diagonal index for this row
                for j in range(i):
                    mat[i, j] = mat[j, i] = dist_func(m[i], m[j])
                report_progress(log.debug, s, 1.)

    return mat
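
A hedged usage sketch for ``compute_distance_kernel`` as defined above (the
Euclidean ``dist_func`` is an illustrative choice; broadcasting makes it
satisfy the row-wise interface the docstring describes):

import numpy as np


def euclidean(a, b):
    # Handles both element-wise (1D vs. 1D -> scalar) and row-wise
    # (1D vs. 2D -> vector) calls via broadcasting.
    return np.sqrt(((a - b) ** 2).sum(axis=-1))


m = np.random.rand(50, 8)
kernel = compute_distance_kernel(m, euclidean, row_wise=True, parallel=True)
assert kernel.shape == (50, 50)
assert np.allclose(kernel, kernel.T)      # symmetric
assert np.allclose(np.diag(kernel), 0.0)  # zero self-distance
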
Example No. 47
    def classify_async(self,
                       d_iter,
                       factory,
                       overwrite=False,
                       procs=None,
                       use_multiprocessing=False,
                       ri=None):
        """
        Asynchronously classify the DescriptorElements in the given iterable.

        :param d_iter: Iterable of DescriptorElements
        :type d_iter:
            collections.Iterable[smqtk.representation.DescriptorElement]

        :param factory: Classifier element factory to use for element
            generation.
        :type factory: smqtk.representation.ClassificationElementFactory

        :param overwrite: Recompute classification of the input descriptor and
            set the results to the ClassificationElement produced by the
            factory.
        :type overwrite: bool

        :param procs: Explicit number of cores/thread/processes to use.
        :type procs: None | int

        :param use_multiprocessing: Use ``multiprocessing.pool.Pool`` instead of
            ``multiprocessing.pool.ThreadPool``.
        :type use_multiprocessing: bool

        :param ri: Progress reporting interval in seconds. Set to a value > 0 to
            enable. Disabled by default.
        :type ri: float | None

        :return: Mapping of input DescriptorElement instances to the computed
            ClassificationElement. ClassificationElement UUIDs are congruent
            with the UUID of the DescriptorElement.
        :rtype: dict[smqtk.representation.DescriptorElement,
                     smqtk.representation.ClassificationElement]

        """
        self._log.debug("Async classifying descriptors")
        ri = ri and ri > 0 and ri

        def work(d_elem):
            return d_elem, self.classify(d_elem, factory, overwrite)

        classifications = parallel.parallel_map(
            work,
            d_iter,
            cores=procs,
            ordered=False,
            use_multiprocessing=use_multiprocessing,
        )

        r_state = [0] * 7
        if ri:
            r_progress = bin_utils.report_progress
        else:

            def r_progress(*_):
                return

        d2c_map = {}
        for d, c in classifications:
            d2c_map[d] = c

            r_progress(self._log.debug, r_state, ri)

        return d2c_map
Example No. 48
    def test_simple_unordered_threaded(self):
        r = list(parallel_map(self.test_func, self.test_string,
                              ordered=False, use_multiprocessing=False))
        nose.tools.assert_equal(set(r), set(self.expected))
Example No. 49
def main():
    description = """
    Utility for validating a given classifier implementation's model against
    some labeled testing data, outputting PR and ROC curve plots with
    area-under-curve score values.

    This utility can optionally be used to train a supervised classifier model
    if the given classifier model configuration does not exist and a second
    CSV file listing labeled training data is provided. Training will be
    attempted if ``train`` is set to true. If training is performed, we exit
    after training completes. A ``SupervisedClassifier`` sub-classing
    implementation must be configured.

    We expect the test and train CSV files in the column format:

        ...
        <UUID>,<label>
        ...

    The UUID is of the descriptor to which the label applies. The label may be
    any arbitrary string value, but all labels must be consistent in
    application.

    Some metrics presented assume the highest confidence class as the single
    predicted class for an element:

        - confusion matrix

    The output UUID confusion matrix is a JSON dictionary where the top-level
    keys are the true labels, and the inner dictionary is the mapping of
    predicted labels to the UUIDs of the classifications/descriptors that
    yielded the prediction. Again, this is based on the maximum probability
    label for a classification result (T=0.5).
    """
    args, config = bin_utils.utility_main_helper(default_config, description)
    log = logging.getLogger(__name__)

    #
    # Initialize stuff from configuration
    #
    #: :type: smqtk.algorithms.Classifier
    classifier = plugin.from_plugin_config(
        config['plugins']['classifier'],
        get_classifier_impls()
    )
    #: :type: ClassificationElementFactory
    classification_factory = ClassificationElementFactory.from_config(
        config['plugins']['classification_factory']
    )
    #: :type: smqtk.representation.DescriptorIndex
    descriptor_index = plugin.from_plugin_config(
        config['plugins']['descriptor_index'],
        get_descriptor_index_impls()
    )

    uuid2label_filepath = config['utility']['csv_filepath']
    do_train = config['utility']['train']
    output_uuid_cm = config['utility']['output_uuid_confusion_matrix']
    plot_filepath_pr = config['utility']['output_plot_pr']
    plot_filepath_roc = config['utility']['output_plot_roc']
    plot_filepath_cm = config['utility']['output_plot_confusion_matrix']
    plot_ci = config['utility']['curve_confidence_interval']
    plot_ci_alpha = config['utility']['curve_confidence_interval_alpha']

    #
    # Construct a mapping of each label to the DescriptorElement instances
    # described by that label.
    #
    log.info("Loading descriptors by UUID")

    def iter_uuid_label():
        """ Iterate through UUIDs in specified file """
        with open(uuid2label_filepath) as uuid2label_file:
            reader = csv.reader(uuid2label_file)
            for r in reader:
                # TODO: This will need to be updated to handle multiple labels
                #       per descriptor.
                yield r[0], r[1]

    def get_descr(r):
        """ Fetch descriptors from configured index """
        uuid, truth_label = r
        return truth_label, descriptor_index.get_descriptor(uuid)

    tlabel_element_iter = parallel.parallel_map(
        get_descr, iter_uuid_label(),
        name="cmv_get_descriptors",
        use_multiprocessing=True,
        cores=config['parallelism']['descriptor_fetch_cores'],
    )

    # Map of truth labels to descriptors of labeled data
    #: :type: dict[str, list[smqtk.representation.DescriptorElement]]
    tlabel2descriptors = {}
    for tlabel, d in tlabel_element_iter:
        tlabel2descriptors.setdefault(tlabel, []).append(d)

    # Train the classifier if the one given has a ``train`` method and
    # training was enabled.
    if do_train:
        if isinstance(classifier, SupervisedClassifier):
            log.info("Training classifier model")
            classifier.train(tlabel2descriptors)
            exit(0)
        else:
            ValueError("Configured classifier is not a SupervisedClassifier "
                       "type and does not support training.")

    #
    # Apply classifier to descriptors for predictions
    #

    # Truth label to predicted classification results
    #: :type: dict[str, set[smqtk.representation.ClassificationElement]]
    tlabel2classifications = {}
    for tlabel, descriptors in tlabel2descriptors.items():
        tlabel2classifications[tlabel] = \
            set(classifier.classify_async(
                descriptors, classification_factory,
                use_multiprocessing=True,
                procs=config['parallelism']['classification_cores'],
                ri=1.0,
            ).values())
    log.info("Truth label counts:")
    for l in sorted(tlabel2classifications):
        log.info("  %s :: %d", l, len(tlabel2classifications[l]))

    #
    # Confusion Matrix
    #
    conf_mat, labels = gen_confusion_matrix(tlabel2classifications)
    log.info("Confusion_matrix")
    log_cm(log.info, conf_mat, labels)
    if plot_filepath_cm:
        plot_cm(conf_mat, labels, plot_filepath_cm)

    # CM of descriptor UUIDs to output json
    if output_uuid_cm:
        # Top-level dictionary keys are true labels; inner dictionary keys
        # are predicted labels mapping to sets of descriptor UUIDs.
        log.info("Computing UUID Confusion Matrix")
        #: :type: dict[str, dict[str, set | list]]
        uuid_cm = {}
        for tlabel in tlabel2classifications:
            uuid_cm[tlabel] = collections.defaultdict(set)
            for c in tlabel2classifications[tlabel]:
                uuid_cm[tlabel][c.max_label()].add(c.uuid)
            # convert sets to lists
            for plabel in uuid_cm[tlabel]:
                uuid_cm[tlabel][plabel] = list(uuid_cm[tlabel][plabel])
        with open(output_uuid_cm, 'w') as f:
            log.info("Saving UUID Confusion Matrix: %s", output_uuid_cm)
            json.dump(uuid_cm, f, indent=2, separators=(',', ': '))


    #
    # Create PR/ROC curves via scikit learn tools
    #
    if plot_filepath_pr:
        log.info("Making PR curve")
        make_pr_curves(tlabel2classifications, plot_filepath_pr,
                       plot_ci, plot_ci_alpha)
    if plot_filepath_roc:
        log.info("Making ROC curve")
        make_roc_curves(tlabel2classifications, plot_filepath_roc,
                        plot_ci, plot_ci_alpha)
Example No. 50
def mb_kmeans_build_apply(index, mbkm, initial_fit_size):
    """
    Build the MiniBatchKMeans centroids based on the descriptors in the given
    index, then predicting descriptor clusters with the final result model.

    If the given index is empty, no fitting or clustering occurs and an empty
    dictionary is returned.

    :param index: Index of descriptors
    :type index: smqtk.representation.DescriptorIndex

    :param mbkm: Scikit-Learn MiniBatchKMeans instance to train and then use
        for prediction.
    :type mbkm: sklearn.cluster.MiniBatchKMeans

    :param initial_fit_size: Number of descriptors to run an initial fit with.
        This lets the model choose the best initialization point from
        multiple candidates.
    :type initial_fit_size: int

    :return: Dictionary of the cluster label (integer) to the set of descriptor
        UUIDs belonging to that cluster.
    :rtype: dict[int, set[collections.Hashable]]

    """
    log = logging.getLogger(__name__)

    ifit_completed = False
    k_deque = collections.deque()
    d_fitted = 0

    log.info("Getting index keys (shuffled)")
    index_keys = sorted(six.iterkeys(index))
    numpy.random.seed(mbkm.random_state)
    numpy.random.shuffle(index_keys)

    def parallel_iter_vectors(descriptors):
        """ Get the vectors for the descriptors given.
        The order of the returned vectors is not guaranteed.
        """
        return parallel.parallel_map(lambda d: d.vector(), descriptors,
                                     use_multiprocessing=False)

    def get_vectors(k_iter):
        """ Get numpy array of descriptor vectors (2D array returned) """
        return numpy.array(list(
            parallel_iter_vectors(index.get_many_descriptors(k_iter))
        ))

    log.info("Collecting iteratively fitting model")
    rps = [0] * 7
    for i, k in enumerate(index_keys):
        k_deque.append(k)
        bin_utils.report_progress(log.debug, rps, 1.)

        if initial_fit_size and not ifit_completed:
            if len(k_deque) == initial_fit_size:
                log.info("Initial fit using %d descriptors", len(k_deque))
                log.info("- collecting vectors")
                vectors = get_vectors(k_deque)
                log.info("- fitting model")
                mbkm.fit(vectors)
                log.info("- cleaning")
                d_fitted += len(vectors)
                k_deque.clear()
                ifit_completed = True
        elif len(k_deque) == mbkm.batch_size:
            log.info("Partial fit with batch size %d", len(k_deque))
            log.info("- collecting vectors")
            vectors = get_vectors(k_deque)
            log.info("- fitting model")
            mbkm.partial_fit(vectors)
            log.info("- cleaning")
            d_fitted += len(k_deque)
            k_deque.clear()

    # Final fit with any remaining descriptors
    if k_deque:
        log.info("Final partial fit of size %d", len(k_deque))
        log.info('- collecting vectors')
        vectors = get_vectors(k_deque)
        log.info('- fitting model')
        mbkm.partial_fit(vectors)
        log.info('- cleaning')
        d_fitted += len(k_deque)
        k_deque.clear()

    log.info("Computing descriptor classes with final KMeans model")
    mbkm.verbose = False
    d_classes = collections.defaultdict(set)
    d_uv_iter = parallel.parallel_map(lambda d: (d.uuid(), d.vector()),
                                      index,
                                      use_multiprocessing=False,
                                      name="uv-collector")
    # TODO: Batch predict call inputs to something larger than one at a time.
    d_uc_iter = parallel.parallel_map(
        lambda u_v: (u_v[0], mbkm.predict(u_v[1][numpy.newaxis, :])[0]),
        d_uv_iter,
        use_multiprocessing=False,
        name="uc-collector")
    rps = [0] * 7
    for uuid, c in d_uc_iter:
        d_classes[c].add(uuid)
        bin_utils.report_progress(log.debug, rps, 1.)
    rps[1] -= 1
    bin_utils.report_progress(log.debug, rps, 0)

    return d_classes
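
The deque bookkeeping above is a streaming batcher: accumulate keys until a
batch fills, flush it into a fit call, then final-flush whatever remains. The
pattern in isolation (a generic sketch, not an SMQTK API):

import collections


def batched(items, batch_size):
    """ Yield lists of up to ``batch_size`` items, preserving order. """
    buf = collections.deque()
    for item in items:
        buf.append(item)
        if len(buf) == batch_size:
            yield list(buf)
            buf.clear()
    if buf:
        # Final partial batch, mirroring the trailing partial_fit above.
        yield list(buf)


assert list(batched(range(7), 3)) == [[0, 1, 2], [3, 4, 5], [6]]
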
Example No. 51
    def _classify_arrays(self, array_iter):
        if not self.has_model():
            raise RuntimeError("No SVM model present for classification")
        assert self.svm_model is not None, (
            "Should have an SVM model at this point."
        )

        # Dump descriptors into a matrix for normalization and use in
        # prediction.
        vec_mat = numpy.array(list(array_iter))
        vec_mat = self._norm_vector(vec_mat)
        n_jobs = self.n_jobs
        if n_jobs is not None:
            n_jobs = min(len(vec_mat), n_jobs)
        # Else: `n_jobs` is `None`, which is OK as it is the default value
        # for parallel_map.

        svm_label_map = self.svm_label_map
        c_base = dict((la, 0.) for la in svm_label_map.values())

        # Effectively reproducing the body of svmutil.svm_predict in order to
        # simplify and get around excessive prints
        svm_type = self.svm_model.get_svm_type()
        nr_class = self.svm_model.get_nr_class()
        # Model internal labels. Parallel to ``prob_estimates`` array.
        svm_model_labels = self.svm_model.get_labels()

        # TODO: Normalize input arrays in batch(es). TEST if current norm
        #       function can just take a matrix?

        if self.svm_model.is_probability_model():
            # noinspection PyUnresolvedReferences
            if svm_type in [svm.NU_SVR, svm.EPSILON_SVR]:
                nr_class = 0

            def single_pred(v):
                prob_estimates = (ctypes.c_double * nr_class)()
                v, idx = svm.gen_svm_nodearray(v.tolist())
                svm.libsvm.svm_predict_probability(self.svm_model, v,
                                                   prob_estimates)
                c = dict(c_base)  # Shallow copy
                c.update({svm_label_map[label]: prob for label, prob
                          in zip(svm_model_labels, prob_estimates[:nr_class])})
                return c
            # If n_jobs == 1, just be serial
            if n_jobs == 1:
                return (single_pred(v) for v in vec_mat)
            else:
                return parallel_map(single_pred, vec_mat,
                                    cores=n_jobs,
                                    use_multiprocessing=True)

        else:
            # noinspection PyUnresolvedReferences
            if svm_type in (svm.ONE_CLASS, svm.EPSILON_SVR, svm.NU_SVC):
                nr_classifier = 1
            else:
                nr_classifier = nr_class * (nr_class - 1) // 2

            def single_label(v):
                dec_values = (ctypes.c_double * nr_classifier)()
                v, idx = svm.gen_svm_nodearray(v.tolist())
                label = svm.libsvm.svm_predict_values(self.svm_model, v,
                                                      dec_values)
                c = dict(c_base)  # Shallow copy
                c[svm_label_map[label]] = 1.
                return c
            # If n_jobs == 1, just be serial
            if n_jobs == 1:
                return (single_label(v) for v in vec_mat)
            else:
                return parallel_map(single_label, vec_mat,
                                    cores=n_jobs,
                                    use_multiprocessing=True)
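
The ``n_jobs == 1`` guard used in both branches above avoids pool startup
overhead when parallelism cannot help. Factored out as a reusable sketch
(``map_maybe_parallel`` is a hypothetical helper, not an SMQTK API):

from smqtk.utils import parallel


def map_maybe_parallel(fn, seq, n_jobs):
    # Plain generator when a single job is requested; otherwise fan out
    # across processes, as the method above does.
    if n_jobs == 1:
        return (fn(v) for v in seq)
    return parallel.parallel_map(fn, seq, cores=n_jobs,
                                 use_multiprocessing=True)


assert sorted(map_maybe_parallel(abs, [-2, -1, 0], 1)) == [0, 1, 2]
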
Example No. 52
    def test_simple_ordered_threaded(self):
        # Make sure results are still in order as requested
        r = list(parallel_map(self.test_func, self.test_string,
                              ordered=True, use_multiprocessing=False))
        self.assertEqual(r, self.expected)
Example No. 53
def compute_hash_codes(uuids, index, functor, report_interval=1.0,
                       use_mp=False, ordered=False):
    """
    Given an iterable of DescriptorElement UUIDs, asynchronously access them
    from the given ``index``, asynchronously compute hash codes via ``functor``
    and convert to an integer, yielding (UUID, hash-int) pairs.

    :param uuids: Sequence of UUIDs to process
    :type uuids: collections.Iterable[collections.Hashable]

    :param index: Descriptor index to pull from.
    :type index: smqtk.representation.descriptor_index.DescriptorIndex

    :param functor: LSH hash code functor instance
    :type functor: smqtk.algorithms.LshFunctor

    :param report_interval: Frequency in seconds at which we report speed and
        completion progress via logging. Reporting is disabled when logging
        is not at debug level or when this value is not greater than 0.
    :type report_interval: float

    :param use_mp: If multiprocessing should be used for parallel
        computation vs. threading. Reminder: This will copy currently loaded
        objects onto worker processes (e.g. the given index), which could lead
        to dangerously high RAM consumption.
    :type use_mp: bool

    :param ordered: If the element-hash value pairs yielded are in the same
        order as element UUID values input. This function should be slightly
        faster when ordering is not required.
    :type ordered: bool

    :return: Generator instance yielding (UUID, hash-int) value pairs.

    """
    # TODO: parallel map fetch elements from index?
    #       -> separately from compute

    def get_hash(u):
        v = index.get_descriptor(u).vector()
        return u, bits.bit_vector_to_int_large(functor.get_hash(v))

    # Setup log and reporting function
    log = logging.getLogger(__name__)

    if log.getEffectiveLevel() > logging.DEBUG or report_interval <= 0:
        def log_func(*_, **__):
            return
        log.debug("Not logging progress")
    else:
        log.debug("Logging progress at %f second intervals", report_interval)
        log_func = log.debug

    log.debug("Starting computation")
    reporter = cli.ProgressReporter(log_func, report_interval)
    reporter.start()
    for uuid, hash_int in parallel.parallel_map(get_hash, uuids,
                                                ordered=ordered,
                                                use_multiprocessing=use_mp):
        yield (uuid, hash_int)
        # Progress reporting
        reporter.increment_report()

    # Final report
    reporter.report()
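
The reporter wiring above, shown standalone: ``start`` begins timing,
``increment_report`` logs at most once per configured interval, and the
trailing ``report`` flushes the final totals. This uses exactly the
``cli.ProgressReporter`` calls from the example (assuming ``smqtk.utils.cli``
is importable as shown there):

import logging

from smqtk.utils import cli, parallel

logging.basicConfig(level=logging.DEBUG)
log = logging.getLogger(__name__)

reporter = cli.ProgressReporter(log.debug, 1.0)
reporter.start()
for _ in parallel.parallel_map(lambda x: x * x, range(10000),
                               use_multiprocessing=False):
    reporter.increment_report()
reporter.report()  # final report
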
Example No. 54
    def test_simple_unordered_multiprocess(self):
        r = list(parallel_map(self.test_func, self.test_string,
                              ordered=False, use_multiprocessing=True))
        self.assertEqual(set(r), set(self.expected))
Example No. 55
    def _generate_arrays(self, data_iter):
        """
        Inner template method that defines the generation of descriptor vectors
        for a given iterable of data elements.

        Pre-conditions:
          - Data elements input to this method have been validated to be of at
            least one of this class's reported ``valid_content_types``.

        :param collections.Iterable[DataElement] data_iter:
            Iterable of data element instances to be described.

        :raises RuntimeError: Descriptor extraction failure of some kind.

        :return: Iterable of numpy arrays in parallel association with the
            input data elements.
        :rtype: collections.Iterable[numpy.ndarray]
        """
        log_debug = self._log.debug

        # Start parallel operation to pre-process imagery before aggregating
        # for network execution.
        # TODO: update ``buffer_factor`` param to account for batch size?
        img_array_iter = \
            parallel_map(_process_load_img_array,
                         zip(
                             data_iter, itertools.repeat(self.transformer),
                             itertools.repeat(self.data_layer),
                             itertools.repeat(self.load_truncated_images),
                             itertools.repeat(self.pixel_rescale),
                         ),
                         ordered=True, cores=self.threads)

        # Aggregate and process batches of input data elements
        #: :type: list[numpy.ndarray]
        batch_img_arrays = \
            list(itertools.islice(img_array_iter, self.batch_size))
        batch_i = 0
        while len(batch_img_arrays) > 0:
            cur_batch_size = len(batch_img_arrays)
            log_debug("Batch {} - size {}".format(batch_i, cur_batch_size))

            log_debug("Loading image numpy array into KWCNN Data object")
            self.data.set_data_list(batch_img_arrays, quiet=True)

            log_debug("Performing forward inference using KWCNN Network")
            test_results = self.network.test(quiet=True)
            descriptor_list = test_results['probability_list']

            for v in descriptor_list:
                if v.ndim > 1:
                    # In case kwcnn generates multidimensional array
                    # (rows, 1, 1)
                    log_debug("- Raveling output array of shape {}".format(
                        v.shape))
                    yield numpy.ravel(v)
                else:
                    yield v

            # Slice out the next batch
            #: :type: list[(collections.Hashable, numpy.ndarray)]
            batch_img_arrays = \
                list(itertools.islice(img_array_iter, self.batch_size))
            batch_i += 1
Example No. 56
    def test_simple_ordered_multiprocess(self):
        r = list(parallel_map(self.test_func, self.test_string,
                              ordered=True, use_multiprocessing=True))
        nose.tools.assert_equal(r, self.expected)
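
Taken together, these small tests pin down the ordering contract of
``parallel_map``: with ``ordered=True`` the output sequence matches the input
sequence, while with ``ordered=False`` only the collection of results is
guaranteed. Restated as a standalone check:

from smqtk.utils import parallel

data = 'abcdefg'
in_order = list(parallel.parallel_map(ord, data, ordered=True,
                                      use_multiprocessing=False))
any_order = list(parallel.parallel_map(ord, data, ordered=False,
                                       use_multiprocessing=False))

assert in_order == [ord(c) for c in data]     # sequence preserved
assert sorted(any_order) == sorted(in_order)  # same results, any order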