import json
import os

import requests

# `logger`, `setting`, `DEFAULT_PAGE_SIZE`, `NEAR_DUPLICATES_THRESHOLD`,
# `solr_documents_from_field`, and `base64FromUrl` are used below but not
# defined here; they are assumed to be provided by the surrounding plugin.


def runImageSimilaritySearch(self, params):
    assert hasattr(self, 'search_url')
    classifications = (json.loads(params['classifications'])
                       if 'classifications' in params else [])
    params['n'] = params['n'] if 'n' in params else str(DEFAULT_PAGE_SIZE)

    # Fetch the query image and hand it to SMQTK inline as a base64 data URI
    image, _type = base64FromUrl(params['url'])
    smqtk_r = requests.get(self.search_url + '/n=' + params['n'] +
                           '/base64://' + image + '?content_type=' + _type)
    assert smqtk_r.ok
    smqtk_r = smqtk_r.json()

    neighbors_to_distances = dict(zip(smqtk_r['neighbors'], smqtk_r['distances']))

    documents = solr_documents_from_field('sha1sum_s_md',
                                          neighbors_to_distances.keys(),
                                          classifications)

    # Attach each document's SMQTK distance so clients can rank by similarity
    for document in documents:
        document['smqtk_distance'] = neighbors_to_distances[document['sha1sum_s_md']]

    if 'near_duplicates' in params and int(params['near_duplicates']) == 1:
        documents = [x for x in documents
                     if x['smqtk_distance'] <= NEAR_DUPLICATES_THRESHOLD]

    # Closest matches first, truncated to the requested page size
    documents = sorted(documents, key=lambda x: x['smqtk_distance'])[:int(params['n'])]

    return {'numFound': len(documents), 'docs': documents}
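
# `base64FromUrl` is not defined in this file. Below is a minimal sketch of
# what it plausibly does, inferred from the call site above (it must return
# the base64-encoded image bytes plus the content type); this is an
# assumption, not the plugin's actual helper:
import base64


def base64FromUrl(url):
    """Fetch an image and return (base64 string, content type) -- sketch only."""
    r = requests.get(url)
    r.raise_for_status()
    content_type = r.headers.get('Content-Type', 'application/octet-stream')
    return base64.b64encode(r.content).decode('ascii'), content_type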

def results(self, params):
    offset = int(params['offset'] if 'offset' in params else 0)
    limit = int(params['limit'] if 'limit' in params else 20)

    resp = requests.get(self.search_url + '/get_results', params={
        'sid': params['sid'],
        'i': offset,
        'j': offset + limit
    }).json()  # @todo handle errors

    documents = solr_documents_from_field('sha1sum_s_md',
                                          [sha for (sha, _) in resp['results']])

    # The documents from Solr (since shas map to >= 1 document) may not be in
    # the order of confidence returned by IQR; sort the documents to match the
    # confidence values. Sort by confidence first, then by sha checksum, so
    # duplicate images are grouped together.
    confidenceValues = dict(resp['results'])  # Mapping of sha -> confidence value

    if len(documents) < len(resp['results']):
        logger.error('SID %s: There are SMQTK descriptors that have no '
                     'corresponding Solr document(s).' % params['sid'])

    for document in documents:
        document['smqtk_iqr_confidence'] = confidenceValues[document['sha1sum_s_md']]

    return {
        'numFound': resp['total_results'],
        'docs': sorted(documents,
                       key=lambda x: (x['smqtk_iqr_confidence'], x['sha1sum_s_md']),
                       reverse=True)
    }
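
# `solr_documents_from_field` is shared by every handler in this section but
# is defined elsewhere. A sketch of its apparent contract, inferred from the
# call sites: given a Solr field and a list of values, return all documents
# whose field matches any of the values. SOLR_URL is a placeholder, and the
# classification filter is a guess at how the real helper might behave:
SOLR_URL = 'http://localhost:8983/solr/imagespace'  # hypothetical


def solr_documents_from_field(field, values, classifications=None):
    values = list(values)  # may arrive as dict_keys
    params = {
        'q': '%s:(%s)' % (field, ' OR '.join('"%s"' % v for v in values)),
        'rows': '10000',  # a sha/name can map to more than one document
        'wt': 'json'
    }
    if classifications:
        # How classifications constrain the query is not shown in this
        # section; a filter query is one plausible mechanism.
        params['fq'] = ' OR '.join(classifications)
    return requests.get(SOLR_URL + '/select', params=params).json()['response']['docs']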

def _search(self, params):
    assert hasattr(self, 'search_url')
    classifications = (json.loads(params['classifications'])
                       if 'classifications' in params else [])

    cmu_images = requests.post(self.search_url,
                               data=params['url'],
                               headers={
                                   'Content-type': 'text',
                                   'Content-length': str(len(params['url']))
                               },
                               verify=False).json()

    # Rewrite the CMU path prefixes so they match the paths indexed in Solr
    cmu_images = [[image.replace(setting.get('IMAGE_SPACE_CMU_PREFIX'),
                                 setting.get('IMAGE_SPACE_SOLR_PREFIX')), score]
                  for (image, score) in cmu_images]
    cmu_scores = {image.lower(): score for image, score in cmu_images}

    documents = solr_documents_from_field('resourcename_t_md',
                                          [x[0] for x in cmu_images],
                                          classifications)

    # Augment original scores from the response into the Solr documents
    for document in documents:
        document['im_score'] = cmu_scores[document['id'].lower()]

    return {'numFound': len(documents), 'docs': documents}

def _search(self, params):
    assert hasattr(self, 'search_url')
    classifications = (json.loads(params['classifications'])
                       if 'classifications' in params else [])

    cmu_images = requests.post(self.search_url,
                               data=params['url'],
                               headers={
                                   'Content-type': 'text',
                                   'Content-length': str(len(params['url']))
                               },
                               verify=False).json()

    # Same prefix rewrite as the variant above, but the prefixes are read
    # from environment variables rather than the settings store
    cmu_images = [[image.replace(os.environ['IMAGE_SPACE_CMU_PREFIX'],
                                 os.environ['IMAGE_SPACE_SOLR_PREFIX']), score]
                  for (image, score) in cmu_images]
    cmu_scores = {image.lower(): score for image, score in cmu_images}

    documents = solr_documents_from_field('resourcename_t_md',
                                          [x[0] for x in cmu_images],
                                          classifications)

    # Augment original scores from the response into the Solr documents
    for document in documents:
        document['im_score'] = cmu_scores[document['id'].lower()]

    return {'numFound': len(documents), 'docs': documents}
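
# A concrete illustration of the prefix rewrite above, with made-up prefixes
# (the real values come from the settings store or the environment):
#
#   IMAGE_SPACE_CMU_PREFIX  = 'hdfs://cmu-cluster/images'
#   IMAGE_SPACE_SOLR_PREFIX = 'file:///data/images'
#
#   'hdfs://cmu-cluster/images/a/b.jpg' -> 'file:///data/images/a/b.jpg'
#
# so that the paths returned by the CMU service line up with the
# resourcename_t_md values indexed in Solr.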

def runImageSimilaritySearch(self, params):
    assert hasattr(self, 'search_url')
    classifications = (json.loads(params['classifications'])
                       if 'classifications' in params else [])
    params['n'] = params['n'] if 'n' in params else str(DEFAULT_PAGE_SIZE)

    # This variant passes the image URL straight through to SMQTK rather than
    # base64-encoding the image contents first (compare the version at the
    # top of this section)
    smqtk_r = requests.get(self.search_url + '/n=' + params['n'] +
                           '/' + params['url']).json()

    neighbors_to_distances = dict(zip(smqtk_r['neighbors'], smqtk_r['distances']))

    documents = solr_documents_from_field('sha1sum_s_md',
                                          neighbors_to_distances.keys(),
                                          classifications)

    for document in documents:
        document['smqtk_distance'] = neighbors_to_distances[document['sha1sum_s_md']]

    if 'near_duplicates' in params and int(params['near_duplicates']) == 1:
        documents = [x for x in documents
                     if x['smqtk_distance'] <= NEAR_DUPLICATES_THRESHOLD]

    return {'numFound': len(documents), 'docs': documents}
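
# For illustration only, with made-up values: if search_url were
# 'http://localhost:12345/nn', n '20', and url 'http://host/img.jpg', the GET
# above would request
#
#   http://localhost:12345/nn/n=20/http://host/img.jpg
#
# i.e. SMQTK is given a URL it must fetch itself, whereas the base64 variant
# inlines the image bytes so SMQTK needs no network access back to the image
# host.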

def results(self, params):
    def sid_exists(sid):
        """
        Determine if a session ID already exists in SMQTK.

        This currently creates the session if it doesn't already exist: the
        POST succeeds for a new session (so this returns False) and fails for
        an existing one (so this returns True).
        """
        return not requests.post(self.search_url + '/session', data={
            'sid': sid
        }).ok

    offset = int(params['offset'] if 'offset' in params else 0)
    limit = int(params['limit'] if 'limit' in params else 20)

    if not sid_exists(params['sid']):
        # Get pos/neg uuids from the stored session and replay the refinement
        # so SMQTK rebuilds its state for this sid
        session = self.model('item').findOne({'meta.sid': params['sid']})

        if session:
            self._refine({
                'sid': params['sid'],
                'pos_uuids': session['meta']['pos_uuids'],
                'neg_uuids': session['meta']['neg_uuids']
            })

    resp = requests.get(self.search_url + '/get_results', params={
        'sid': params['sid'],
        'i': offset,
        'j': offset + limit
    }).json()  # @todo handle errors

    try:
        documents = solr_documents_from_field('sha1sum_s_md',
                                              [sha for (sha, _) in resp['results']])
    except KeyError:
        return {'numFound': 0, 'docs': []}

    # The documents from Solr (since shas map to >= 1 document) may not be in
    # the order of confidence returned by IQR; sort the documents to match the
    # confidence values. Sort by confidence first, then by sha checksum, so
    # duplicate images are grouped together.
    confidenceValues = dict(resp['results'])  # Mapping of sha -> confidence value

    if len(documents) < len(resp['results']):
        logger.error('SID %s: There are SMQTK descriptors that have no '
                     'corresponding Solr document(s).' % params['sid'])

    for document in documents:
        document['smqtk_iqr_confidence'] = confidenceValues[document['sha1sum_s_md']]

    return {
        'numFound': resp['total_results'],
        'docs': sorted(documents,
                       key=lambda x: (x['smqtk_iqr_confidence'], x['sha1sum_s_md']),
                       reverse=True)
    }
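
# The session lookup in results() assumes a stored item carrying metadata
# shaped roughly like the following (field names taken from the code above;
# the values are invented for illustration):
#
#   {
#       'meta': {
#           'sid': '1c0c3d32-example',
#           'pos_uuids': ['ab12...', 'cd34...'],  # positively adjudicated descriptors
#           'neg_uuids': ['ef56...']              # negatively adjudicated descriptors
#       }
#   }
#
# Replaying these through self._refine() rebuilds the IQR session on the
# SMQTK side before results are fetched.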