Esempio n. 1
0
    def runImageSimilaritySearch(self, params):
        """Run an SMQTK nearest-neighbor search for the image at params['url'].

        :param params: dict of request parameters. Requires 'url'. Optional
            keys: 'classifications' (JSON-encoded list), 'n' (page size as a
            string, defaults to DEFAULT_PAGE_SIZE) and 'near_duplicates'
            ('1' restricts results to near duplicates).
        :returns: dict with 'numFound' (int) and 'docs' (Solr documents, each
            annotated with a 'smqtk_distance' key), sorted by ascending
            distance and truncated to at most n documents.
        """
        assert hasattr(self, 'search_url')

        classifications = json.loads(params.get('classifications', '[]'))
        params['n'] = params.get('n', str(DEFAULT_PAGE_SIZE))

        # Inline the image as base64 so SMQTK does not have to fetch the URL.
        image, _type = base64FromUrl(params['url'])
        smqtk_r = requests.get(self.search_url + '/n=' + params['n'] +
                               '/base64://' + image + '?content_type=' + _type)
        assert smqtk_r.ok
        smqtk_r = smqtk_r.json()

        # SMQTK returns parallel lists of SHA-1 neighbors and their distances.
        neighbors_to_distances = dict(
            zip(smqtk_r['neighbors'], smqtk_r['distances']))

        documents = solr_documents_from_field('sha1sum_s_md',
                                              neighbors_to_distances.keys(),
                                              classifications)

        # Annotate each Solr document with its SMQTK distance for ranking.
        for document in documents:
            document['smqtk_distance'] = neighbors_to_distances[
                document['sha1sum_s_md']]

        if int(params.get('near_duplicates', 0)) == 1:
            documents = [
                x for x in documents
                if x['smqtk_distance'] <= NEAR_DUPLICATES_THRESHOLD
            ]

        # Shas can map to multiple documents, so re-sort by distance and cap
        # the result set at the requested page size.
        documents = sorted(
            documents, key=lambda x: x['smqtk_distance'])[:int(params['n'])]

        return {'numFound': len(documents), 'docs': documents}
Esempio n. 2
0
    def results(self, params):
        """Fetch a page of IQR results from SMQTK and join them to Solr docs.

        :param params: dict with required 'sid' (IQR session id) and optional
            'offset' / 'limit' paging values (defaults 0 / 20).
        :returns: dict with 'numFound' (total result count reported by SMQTK)
            and 'docs' sorted by descending confidence, then sha.
        """
        offset = int(params.get('offset', 0))
        limit = int(params.get('limit', 20))

        resp = requests.get(self.search_url + '/get_results', params={
            'sid': params['sid'],
            'i': offset,
            'j': offset + limit
        }).json()  # @todo handle errors

        documents = solr_documents_from_field(
            'sha1sum_s_md', [sha for sha, _ in resp['results']])

        # Mapping of sha -> confidence value. Shas map to >= 1 Solr document,
        # so the order Solr returns may not match IQR's confidence order.
        confidenceValues = dict(resp['results'])

        if len(documents) < len(resp['results']):
            logger.error('SID %s: There are SMQTK descriptors that have no '
                         'corresponding Solr document(s).' % params['sid'])

        for document in documents:
            document['smqtk_iqr_confidence'] = \
                confidenceValues[document['sha1sum_s_md']]

        # Sort by confidence first, then sha checksum, so duplicate images
        # are grouped together.
        return {
            'numFound': resp['total_results'],
            'docs': sorted(documents,
                           key=lambda x: (x['smqtk_iqr_confidence'],
                                          x['sha1sum_s_md']),
                           reverse=True)
        }
Esempio n. 3
0
    def _search(self, params):
        """POST the query URL to the CMU search service and join with Solr.

        :param params: dict with required 'url' (query image URL) and
            optional 'classifications' (JSON-encoded list).
        :returns: dict with 'numFound' and 'docs'; each document gains an
            'im_score' key holding the score returned by the CMU service.
        """
        assert hasattr(self, 'search_url')

        classifications = json.loads(params.get('classifications', '[]'))
        # NOTE(review): verify=False disables TLS certificate validation --
        # acceptable only for a trusted internal endpoint; confirm.
        cmu_images = requests.post(self.search_url,
                                   data=params['url'],
                                   headers={
                                       'Content-type': 'text',
                                       'Content-length':
                                       str(len(params['url']))
                                   },
                                   verify=False).json()

        # Rewrite CMU-side path prefixes to the Solr-side prefix so they can
        # be matched against the 'resourcename_t_md' field.
        cmu_images = [[
            image.replace(setting.get('IMAGE_SPACE_CMU_PREFIX'),
                          setting.get('IMAGE_SPACE_SOLR_PREFIX')), score
        ] for (image, score) in cmu_images]
        cmu_scores = {image.lower(): score for image, score in cmu_images}

        documents = solr_documents_from_field('resourcename_t_md',
                                              [x[0] for x in cmu_images],
                                              classifications)

        # Augment original scores from the response into the Solr documents.
        for document in documents:
            document['im_score'] = cmu_scores[document['id'].lower()]

        return {'numFound': len(documents), 'docs': documents}
Esempio n. 4
0
    def _search(self, params):
        """POST the query URL to the CMU search service and join with Solr.

        Variant that reads the path prefixes from environment variables
        (IMAGE_SPACE_CMU_PREFIX / IMAGE_SPACE_SOLR_PREFIX).

        :param params: dict with required 'url' (query image URL) and
            optional 'classifications' (JSON-encoded list).
        :returns: dict with 'numFound' and 'docs'; each document gains an
            'im_score' key holding the score returned by the CMU service.
        """
        assert hasattr(self, 'search_url')

        classifications = json.loads(params.get('classifications', '[]'))
        # NOTE(review): verify=False disables TLS certificate validation --
        # acceptable only for a trusted internal endpoint; confirm.
        cmu_images = requests.post(self.search_url,
                                   data=params['url'],
                                   headers={
                                       'Content-type': 'text',
                                       'Content-length': str(len(params['url']))
                                   },
                                   verify=False).json()

        # Rewrite CMU-side path prefixes to the Solr-side prefix so they can
        # be matched against the 'resourcename_t_md' field.
        cmu_images = [[image.replace(os.environ['IMAGE_SPACE_CMU_PREFIX'],
                                     os.environ['IMAGE_SPACE_SOLR_PREFIX']), score]
                      for (image, score) in cmu_images]
        cmu_scores = {image.lower(): score for image, score in cmu_images}

        documents = solr_documents_from_field('resourcename_t_md',
                                              [x[0] for x in cmu_images],
                                              classifications)

        # Augment original scores from the response into the Solr documents.
        for document in documents:
            document['im_score'] = cmu_scores[document['id'].lower()]

        return {
            'numFound': len(documents),
            'docs': documents
        }
Esempio n. 5
0
    def runImageSimilaritySearch(self, params):
        """Run an SMQTK nearest-neighbor search for the image at params['url'].

        :param params: dict of request parameters. Requires 'url' (passed
            through verbatim to SMQTK). Optional keys: 'classifications'
            (JSON-encoded list), 'n' (page size as a string, defaults to
            DEFAULT_PAGE_SIZE) and 'near_duplicates' ('1' restricts results
            to near duplicates).
        :returns: dict with 'numFound' (int) and 'docs' (Solr documents, each
            annotated with a 'smqtk_distance' key).
        """
        assert hasattr(self, 'search_url')

        classifications = json.loads(params.get('classifications', '[]'))
        params['n'] = params.get('n', str(DEFAULT_PAGE_SIZE))

        # TODO(review): no HTTP status check before .json() -- a failed
        # request surfaces as a decode/KeyError rather than a clear error.
        smqtk_r = requests.get(
            self.search_url + '/n=' + params['n'] + '/' + params['url']).json()

        # SMQTK returns parallel lists of SHA-1 neighbors and their distances.
        neighbors_to_distances = dict(
            zip(smqtk_r['neighbors'], smqtk_r['distances']))

        documents = solr_documents_from_field('sha1sum_s_md',
                                              neighbors_to_distances.keys(),
                                              classifications)

        # Annotate each Solr document with its SMQTK distance for ranking.
        for document in documents:
            document['smqtk_distance'] = \
                neighbors_to_distances[document['sha1sum_s_md']]

        if int(params.get('near_duplicates', 0)) == 1:
            documents = [x for x in documents
                         if x['smqtk_distance'] <= NEAR_DUPLICATES_THRESHOLD]

        return {
            'numFound': len(documents),
            'docs': documents
        }
Esempio n. 6
0
    def results(self, params):
        """Fetch a page of IQR results from SMQTK and join them to Solr docs.

        :param params: dict with required 'sid' (IQR session id) and optional
            'offset' / 'limit' paging values (defaults 0 / 20).
        :returns: dict with 'numFound' (total result count reported by SMQTK)
            and 'docs' sorted by descending confidence, then sha.
        """
        offset = int(params.get('offset', 0))
        limit = int(params.get('limit', 20))

        resp = requests.get(self.search_url + '/get_results',
                            params={
                                'sid': params['sid'],
                                'i': offset,
                                'j': offset + limit
                            }).json()  # @todo handle errors

        documents = solr_documents_from_field(
            'sha1sum_s_md', [sha for sha, _ in resp['results']])

        # Mapping of sha -> confidence value. Shas map to >= 1 Solr document,
        # so the order Solr returns may not match IQR's confidence order.
        confidenceValues = dict(resp['results'])

        if len(documents) < len(resp['results']):
            logger.error(
                'SID %s: There are SMQTK descriptors that have no corresponding Solr document(s).'
                % params['sid'])

        for document in documents:
            document['smqtk_iqr_confidence'] = \
                confidenceValues[document['sha1sum_s_md']]

        # Sort by confidence first, then sha checksum, so duplicate images
        # are grouped together.
        return {
            'numFound': resp['total_results'],
            'docs': sorted(documents,
                           key=lambda x: (x['smqtk_iqr_confidence'],
                                          x['sha1sum_s_md']),
                           reverse=True)
        }
Esempio n. 7
0
    def results(self, params):
        """Fetch a page of IQR results, re-creating the session if needed.

        If SMQTK has no session for the given sid (e.g. after a restart),
        re-seed it from the pos/neg uuids persisted on the session item and
        re-run refinement before fetching results.

        :param params: dict with required 'sid' (IQR session id) and optional
            'offset' / 'limit' paging values (defaults 0 / 20).
        :returns: dict with 'numFound' and 'docs' sorted by descending
            confidence, then sha; an empty result set if SMQTK returned no
            'results' key.
        """
        def sid_exists(sid):
            """
            Return True if the session ID already exists in SMQTK.

            Side effect: the POST creates the session when it does not
            already exist (a failing POST means it was already there).
            """
            return not requests.post(self.search_url + '/session',
                                     data={
                                         'sid': params['sid']
                                     }).ok

        offset = int(params.get('offset', 0))
        limit = int(params.get('limit', 20))

        if not sid_exists(params['sid']):
            # Get pos/neg uuids from the persisted session and replay them
            # so SMQTK can rebuild its in-memory IQR state.
            session = self.model('item').findOne({'meta.sid': params['sid']})

            if session:
                self._refine({
                    'sid': params['sid'],
                    'pos_uuids': session['meta']['pos_uuids'],
                    'neg_uuids': session['meta']['neg_uuids']
                })

        resp = requests.get(self.search_url + '/get_results',
                            params={
                                'sid': params['sid'],
                                'i': offset,
                                'j': offset + limit
                            }).json()  # @todo handle errors

        try:
            documents = solr_documents_from_field(
                'sha1sum_s_md', [sha for sha, _ in resp['results']])
        except KeyError:
            # SMQTK response had no 'results' key -- treat as empty page.
            return {'numFound': 0, 'docs': []}

        # Mapping of sha -> confidence value. Shas map to >= 1 Solr document,
        # so the order Solr returns may not match IQR's confidence order.
        confidenceValues = dict(resp['results'])

        if len(documents) < len(resp['results']):
            logger.error(
                'SID %s: There are SMQTK descriptors that have no corresponding Solr document(s).'
                % params['sid'])

        for document in documents:
            document['smqtk_iqr_confidence'] = \
                confidenceValues[document['sha1sum_s_md']]

        # Sort by confidence first, then sha checksum, so duplicate images
        # are grouped together.
        return {
            'numFound': resp['total_results'],
            'docs': sorted(documents,
                           key=lambda x: (x['smqtk_iqr_confidence'],
                                          x['sha1sum_s_md']),
                           reverse=True)
        }