Example #1
import glob
import logging
import optparse
import os.path as osp

import bson

# masir_config and IngestManager are assumed to be provided by the
# surrounding MASIR package; their exact import paths are not shown in this
# snippet.


def main():
    """
    Ingest image files matching the given glob patterns, pairing each image
    with a same-named .bson metadata file when one exists.
    """
    usage = "Usage: %prog [options] GLOB1 [ GLOB2 ... ]"
    parser = optparse.OptionParser(usage)
    parser.add_option('-d', '--data-dir',
                      help="Non-standard directory to treat as the base data "
                           "directory.")
    opts, args = parser.parse_args()

    log = logging.getLogger("main")
    data_dir = opts.data_dir or masir_config.DIR_DATA

    source_files = []
    for g in args:
        source_files.extend(glob.glob(g))

    if not source_files:
        raise ValueError("No files found with the supplied globs.")

    im = IngestManager(data_dir)
    for f in source_files:
        # The globs may match BSON metadata files; skip them here (they are
        # picked up below as paired metadata).
        if osp.splitext(f)[1] == '.bson':
            continue

        try:
            # if there's a found paired BSON file, pass that too
            md_filepath = osp.splitext(f)[0] + ".bson"
            if not osp.exists(md_filepath):
                md_filepath = None

            im.ingest_image(f, md_filepath)
        except IOError:
            log.warning("Not an image file: %s", f)
            continue
        except bson.InvalidBSON as ex:
            log.warning("BSON error: %s", str(ex))
        except Exception as ex:
            log.warning("Other exception caught for file '%s':\n"
                        "    %s",
                        f, str(ex))
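
For context, a minimal sketch of how a script built around the main() above might be wired up and invoked. The module name ingest_images.py, the logging configuration, and the example paths are illustrative assumptions, not part of the original source.

import logging

if __name__ == '__main__':
    # Configure root logging before main() starts emitting messages.
    logging.basicConfig(level=logging.INFO,
                        format='%(levelname)s - %(name)s - %(message)s')
    main()

# Example invocation (hypothetical paths):
#   python ingest_images.py -d /data/masir "incoming/*.jpg" "incoming/*.png"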
Example #2
    def __init__(self, base_work_dir, descriptor):
        """ Initialize IQR session

        :param base_work_dir: Base directory to put working files into
        :type base_work_dir: str
        :param descriptor: FeatureDescriptor to use for this IQR session
        :type descriptor: masir.search.FeatureDescriptor.FeatureDescriptor

        """
        self._session_lock = multiprocessing.RLock()

        self.uuid = uuid.uuid1()
        self.positive_ids = set()
        self.negative_ids = set()

        self.work_dir = osp.join(osp.abspath(osp.expanduser(base_work_dir)),
                                 'iqr', 'session-%s' % str(self.uuid))
        if not osp.isdir(self.work_dir):
            os.makedirs(self.work_dir)

        #: :type: masir.search.FeatureDescriptor.FeatureDescriptor
        self.descriptor = descriptor

        # noinspection PyTypeChecker
        self.feature_memory = FeatureMemory.construct_from_files(
            descriptor.ids_file, descriptor.bg_flags_file,
            descriptor.feature_data_file, descriptor.kernel_data_file
        )
        # noinspection PyProtectedMember
        self._original_fm_bgid_set = self.feature_memory._bg_clip_ids

        # Mapping of clip ID to the probability that it is associated with
        # the positive adjudications. This is None before any refinement
        # occurs.
        #: :type: None or dict of (int, float)
        self.results = None

        self.svm_train_params = '-q -t 4 -b 1 -w1 50 -c 100'

        # Ingest where extension images are placed
        extension_ingest_dir = osp.join(self.work_dir, "extension_ingest")
        self.extension_ingest = IngestManager(
            extension_ingest_dir,
            self.feature_memory.get_ids().max() + 1
        )
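
As a side note, the constructor's per-session working-directory pattern can be seen in isolation in the small standard-library-only sketch below; the base path is a placeholder.

import os
import os.path as osp
import uuid

base_work_dir = "~/masir_work"  # placeholder base path
session_uuid = uuid.uuid1()

# Expand and absolutize the base path, then nest a UUID-stamped session
# directory under an 'iqr' subdirectory, creating it on demand.
work_dir = osp.join(osp.abspath(osp.expanduser(base_work_dir)),
                    'iqr', 'session-%s' % session_uuid)
if not osp.isdir(work_dir):
    os.makedirs(work_dir)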
Example #3
class IqrSession (object):

    @property
    def _log(self):
        return logging.getLogger(
            '.'.join((self.__module__, self.__class__.__name__))
            + "[%s]" % self.uuid
        )

    def __init__(self, base_work_dir, descriptor):
        """ Initialize IQR session

        :param base_work_dir: Base directory to put working files into
        :type base_work_dir: str
        :param descriptor: FeatureDescriptor to use for this IQR session
        :type descriptor: masir.search.FeatureDescriptor.FeatureDescriptor

        """
        self._session_lock = multiprocessing.RLock()

        self.uuid = uuid.uuid1()
        self.positive_ids = set()
        self.negative_ids = set()

        self.work_dir = osp.join(osp.abspath(osp.expanduser(base_work_dir)),
                                 'iqr', 'session-%s' % str(self.uuid))
        if not osp.isdir(self.work_dir):
            os.makedirs(self.work_dir)

        #: :type: masir.search.FeatureDescriptor.FeatureDescriptor
        self.descriptor = descriptor

        # noinspection PyTypeChecker
        self.feature_memory = FeatureMemory.construct_from_files(
            descriptor.ids_file, descriptor.bg_flags_file,
            descriptor.feature_data_file, descriptor.kernel_data_file
        )
        # noinspection PyProtectedMember
        self._original_fm_bgid_set = self.feature_memory._bg_clip_ids

        # Mapping of clip ID to the probability that it is associated with
        # the positive adjudications. This is None before any refinement
        # occurs.
        #: :type: None or dict of (int, float)
        self.results = None

        self.svm_train_params = '-q -t 4 -b 1 -w1 50 -c 100'

        # Ingest where extension images are placed
        extension_ingest_dir = osp.join(self.work_dir, "extension_ingest")
        self.extension_ingest = IngestManager(
            extension_ingest_dir,
            self.feature_memory.get_ids().max() + 1
        )

    def __del__(self):
        # Clean up working directory
        shutil.rmtree(self.work_dir)

    @property
    def ordered_results(self):
        """
        Return a tuple of the current (id, probability) result pairs in
        descending order of probability score. If there are no results yet,
        None is returned.

        """
        with self._session_lock:
            if self.results:
                return tuple(sorted(self.results.iteritems(),
                                    key=lambda p: p[1],
                                    reverse=True))
            return None

    def extend_model(self, *image_files):
        """
        Extend our data models given the following image file paths.

        The given image files are added to this session's extension ingest.

        :raises ValueError: If an image file is already ingested.

        :param image_files: Iterable of image file paths
        :type image_files: Iterable of str

        """
        with self._session_lock:
            p_pool = multiprocessing.pool.Pool()

            args = []
            for img in image_files:
                uid, md5, fpath = self.extension_ingest.ingest_image(img)
                args.append((self._log.name, self.descriptor, uid, fpath))

            self._log.info("Feature generation...")
            img_features = \
                p_pool.map_async(_iqr_async_image_feature, args).get()

            p_pool.close()
            p_pool.join()

            self._log.info("Updating FM")
            new_ids = []
            for img_id, img, feat in img_features:
                self._log.info("=== %s", img)
                # TODO: Update this function in FeatureMemory to take multiple
                #       ID, feature pairs (or parallel arrays)
                self.feature_memory.update(img_id, feat)
                new_ids.append(img_id)

            # adding new IDs to positive adjudications set
            self.positive_ids.update(new_ids)

    def adjudicate(self, new_positives=(), new_negatives=(),
                   un_positives=(), un_negatives=()):
        """
        Update current state of user defined positive and negative truths on
        specific image IDs

        :param new_positives: New IDs of items to now be considered positive.
        :type new_positives: tuple of int
        :param new_negatives: New IDs of items to now be considered negative.
        :param un_positives: New item IDs that are now not positive any more.
        :type un_positives: tuple of int
        :param un_negatives: New item IDs that are now not negative any more.
        :type un_negatives: tuple of int

        """
        with self._session_lock:
            self.positive_ids.update(new_positives)
            self.positive_ids.difference_update(un_positives)
            self.positive_ids.difference_update(new_negatives)

            self.negative_ids.update(new_negatives)
            self.negative_ids.difference_update(un_negatives)
            self.negative_ids.difference_update(new_positives)

            # # EXPERIMENT
            # # When we have negative adjudications, remove use of the original
            # # bg IDs set in the feature memory, injecting this session's
            # # negative ID set (both use set objects, so just share the reference).
            # # When we don't have negative adjudications, reinstate the original
            # # set of bg IDs.
            # if self.negative_ids:
            #     self.feature_memory._bg_clip_ids = self.negative_ids
            # else:
            #     self.feature_memory._bg_clip_ids = self._original_fm_bgid_set

            # # Update background flags in our feature_memory
            # # - new positives and un-negatives are now non-background
            # # - new negatives are now background.
            # for uid in set(new_positives).union(un_negatives):
            #     self._log.info("Marking UID %d as non-background", uid)
            #     self.feature_memory.update(uid, is_background=False)
            #     assert uid not in self.feature_memory.get_bg_ids()
            # for uid in new_negatives:
            #     self._log.info("Marking UID %d as background", uid)
            #     self.feature_memory.update(uid, is_background=True)
            #     assert uid in self.feature_memory.get_bg_ids()

    def refine(self, new_positives=(), new_negatives=(),
               un_positives=(), un_negatives=()):
        """ Refine current model results based on current adjudication state

        :raises RuntimeError: There are no adjudications to run on. We must have
            at least one positive adjudication.

        :param new_positives: New IDs of items to now be considered positive.
        :type new_positives: tuple of int
        :param new_negatives: New IDs of items to now be considered negative.
        :param un_positives: New item IDs that are now not positive any more.
        :type un_positives: tuple of int
        :param un_negatives: New item IDs that are now not negative any more.
        :type un_negatives: tuple of int

        """
        with self._session_lock:
            self.adjudicate(new_positives, new_negatives, un_positives,
                            un_negatives)

            if not self.positive_ids:
                raise RuntimeError("Did not find at least one positive "
                                   "adjudication.")

            #
            # Model training
            #
            self._log.info("Starting model training...")
            self._log.debug("-- Positives: %s", self.positive_ids)
            self._log.debug("-- Negatives: %s", self.negative_ids)

            # query submatrix of distance kernel for positive and background
            # IDs.
            self._log.debug("Extracting symmetric submatrix")
            idx2id_map, idx_bg_flags, m = \
                self.feature_memory\
                    .get_distance_kernel()\
                    .symmetric_submatrix(*self.positive_ids)
            self._log.debug("-- num bg: %d", idx_bg_flags.count(True))
            self._log.debug("-- m shape: %s", m.shape)

            # The model training function needs the inverse of idx_bg_flags:
            # True indicates a positively adjudicated index.
            labels_train = numpy.array(tuple(not b for b in idx_bg_flags))

            # # Where to save working models
            # model_filepath = osp.join(self.work_dir,
            #                           "iqr_session.%s.model" % self.uuid)
            # svIDs_filepath = osp.join(self.work_dir,
            #                           "iqr_session.%s.svIDs" % self.uuid)

            # The returned dictionary contains the keys "model" and
            # "clipids_SVs", referring to the trained model and the list of
            # support vector clip IDs, respectively.
            ret_dict = iqr_model_train(m, labels_train, idx2id_map,
                                       self.svm_train_params)
            svm_model = ret_dict['model']
            svm_svIDs = ret_dict['clipids_SVs']

            #
            # Model Testing/Application
            #
            self._log.info("Starting model application...")

            # As we are only extracting rows, the full set of IDs is preserved
            # along the x-axis (column IDs). The list of IDs along the x-axis
            # is thus effectively the ordered list of all IDs.
            idx2id_row, idx2id_col, kernel_test = \
                self.feature_memory.get_distance_kernel()\
                                   .extract_rows(svm_svIDs)

            # Testing/Ranking call
            #   Passing the array version of the kernel sub-matrix. The
            #   returned output['probs'] type matches the type passed in
            #   here, and using an array makes syntax cleaner.
            self._log.debug("Ranking IDs")
            output = iqr_model_test(svm_model, kernel_test.A, idx2id_col)

            probability_map = dict(zip(output['clipids'], output['probs']))
            if self.results is None:
                self.results = IqrResultsDict()
            self.results.update(probability_map)

            # Force adjudicated negatives to probability 0.0 so they do not
            # pollute further adjudication views.
            for uid in self.negative_ids:
                self.results[uid] = 0.0

    def reset(self):
        """ Reset the IQR Search state

        Clears all adjudications and reloads the original feature data.

        """
        with self._session_lock:
            self.positive_ids.clear()
            self.negative_ids.clear()
            # noinspection PyUnresolvedReferences
            self.feature_memory = FeatureMemory.construct_from_files(
                self.descriptor.ids_file, self.descriptor.bg_flags_file,
                self.descriptor.feature_data_file,
                self.descriptor.kernel_data_file
            )
            self.results = None

            # clear contents of working directory
            shutil.rmtree(self.work_dir)
            os.makedirs(self.work_dir)
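
Finally, a hedged end-to-end sketch of the IqrSession lifecycle defined above (construct, extend, refine, adjudicate, reset). The descriptor factory, image paths, and example IDs are placeholders; only methods and attributes visible in the class are used, and the exact FeatureDescriptor construction is not shown in this source.

# get_feature_descriptor() is a hypothetical factory; a real
# masir.search.FeatureDescriptor.FeatureDescriptor supplies ids_file,
# bg_flags_file, feature_data_file and kernel_data_file.
descriptor = get_feature_descriptor()
session = IqrSession("~/masir_work", descriptor)

# Extend the model with user-supplied exemplar images; the new IDs are
# automatically added to the positive adjudication set.
session.extend_model("/path/to/query1.jpg", "/path/to/query2.jpg")

# Refine against the current adjudication state and inspect ranked results.
session.refine()
for uid, prob in session.ordered_results[:10]:
    print(uid, prob)

# Adjudicate some returned IDs and refine again (IDs are illustrative).
session.refine(new_positives=(42,), new_negatives=(7,))

# Clear all adjudications and restore the original feature data.
session.reset()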