def search_results(request):
    """Run a faceted Solr search for ``q`` and build the response context.

    Returns an ``HTTPFound`` redirect when the request carries no ``q``
    parameter.  Routes whose name ends in ``json`` get only the search data;
    every other route additionally gets the template helpers and the
    trusted-IP flag.
    """
    registry_settings = request.registry.settings
    conn = Solr(registry_settings['solr_base_url'], decoder=decoder)

    params = request.GET.copy()
    q = params.pop('q', None)
    if q is None:
        return HTTPFound('http://2012.haip.cc/')

    params.update({
        'facet': 'true',
        'facet.limit': 20,
        'facet.mincount': 1,
        'facet.sort': 'count',
        'facet.field': ['language', 'author_exact', 'year'],
        'fl': '*',
    })

    # TODO: get cover data, description from https://developers.google.com/books/docs/v1/reference/volumes
    # TODO: refactor logic from template to view
    # TODO: tests

    # Query once without any filter so that all facet values are returned.
    unfiltered = params.copy()
    if 'fq' in unfiltered:
        del unfiltered['fq']
    facet_fields = conn.search(q, **unfiltered).facets['facet_fields']

    # **kwargs cannot carry a multidict, so collapse repeated fq values
    # into a single AND-ed filter.
    if 'fq' in params:
        params['fq'] = ' AND '.join(params.getall('fq'))

    log.debug(params)
    results = conn.search(q, **params)
    log.debug(results)

    allowed_networks = registry_settings['allowed_networks'].split(',')
    is_trusted_ip = request.client_addr in iptools.IpRangeList(*allowed_networks)

    out = {
        'results': list(results),
        'q': q,
        'facet_fields': facet_fields,
        'facets': params.get('fq', []),
    }
    if request.matched_route.name.endswith('json'):
        return out

    out.update({
        'with_facet': with_facet,
        'without_facet': without_facet,
        'format_byte_size': format_byte_size,
        'format_facet': format_facet,
        'is_trusted_ip': is_trusted_ip,
    })
    return out
def check_solr(using='solr'):
    """Raise ``SkipTest`` unless pysolr is importable and Solr answers a query.

    ``using`` selects the entry of ``settings.HAYSTACK_CONNECTIONS`` whose
    ``URL`` is probed with a ``*:*`` search.
    """
    try:
        from pysolr import Solr, SolrError
    except ImportError:
        raise SkipTest("pysolr not installed.")

    url = settings.HAYSTACK_CONNECTIONS[using]['URL']
    client = Solr(url)
    try:
        client.search('*:*')
    except SolrError as e:
        raise SkipTest("solr not running on %r" % url, e)
def test_custom_results_class(self):
    """With ``results_cls=dict`` a search yields a dict holding the raw sections."""
    client = Solr('http://localhost:8983/solr/core0', results_cls=dict)
    response = client.search(q='*:*')

    assert isinstance(response, dict)
    for section in ('responseHeader', 'response'):
        assert section in response
def clear_solr(buid):
    """Delete all jobs for a given business unit/job source.

    Args:
        buid: Business unit id; every document matching ``buid:<buid>`` is
            deleted from the default Haystack Solr connection.
    """
    conn = Solr(settings.HAYSTACK_CONNECTIONS["default"]["URL"])
    buid_query = "buid:%s" % buid
    # Count with the same query used for the delete so the logged number is
    # the number of jobs being removed, not the size of the whole index.
    hits = conn.search(q=buid_query, rows=1, mlt="false", facet="false").hits
    logging.info("BUID:%s - SOLR - Deleting all %s jobs", buid, hits)
    conn.delete(q=buid_query)
    logging.info("BUID:%s - SOLR - All jobs deleted.", buid)
def test_dismax_loc(self): """docstring for test_dismax""" conn = Solr(SOLR_URL) loc_name = 'downing st' loc = TEST_LOCS[loc_name] print '\n\n*** %s ' % loc_name, loc kwords = 'heart' kw = { 'rows': settings.SOLR_ROWS, 'fl': '*,score', 'qt': 'resources', 'sfield': 'pt_location', 'pt': loc, 'bf': 'recip(geodist(),2,200,20)^20', 'sort': 'score desc', } results = conn.search(kwords, **kw) print '\n--\nsearch on [%s] : ' % (kwords) for result in results: print '-', result['score'], result['title'], result.get('pt_location', '')
def handle(self, port="15672", queues="solr,priority", *args, **options): # Determine the number of tasks still in Rabbit msg_count = 0 for queue in queues.split(','): # Get the queue data from the rabbit Management API uri = "http://%(broker)s:%(port)s/api/queues/dseo-vhost/%(queue)s" % {'broker': settings.BROKER_HOST, 'port': port, 'queue': queue} resp = requests.get(uri, auth=(settings.BROKER_USER, settings.BROKER_PASSWORD)) data = json.loads(resp.content) msg_count += data["messages_ready"] msg_count += data["messages_unacknowledged"] # If we find that having a couple long running messages can lead to false positives, # we can change the cutoff here. if msg_count <= self.msg_cutoff: print "No messages in the queue(s), do not raise an alert." return # Determine the number of recently updated jobs in solr. conn = Solr(settings.HAYSTACK_CONNECTIONS['default']['URL']) start_date = datetime.now() - self.look_behind solr_count = conn.search("date_added:[%s TO NOW]" % (start_date.iso_format() + "Z")).hits if solr_count <= self.solr_cutoff: msg = "Found %s messages in rabbit, but saw %s updated job in the last hour. Perhaps the workers are not responding?" % (msg_count, solr_count) send_mail("Rabbit & Solr Monitoring", msg, "*****@*****.**", "*****@*****.**")
def clear_missing(self, verbose=False): conn = Solr(settings.SOLR_URL) start = 0 to_delete = [] pb = None if verbose: print "Checking for indexed records no longer in database" while True: if verbose and pb: pb.update(start) result = conn.search('*:*', sort='id asc', start=start, rows=500, fields=['id']) if not result: break if verbose and not pb: pb = ProgressBar(result.hits) ids = [int(r['id']) for r in result] records = Record.objects.filter(id__in=ids).values_list('id', flat=True) for r in records: ids.remove(r) to_delete.extend(ids) start += 500 if verbose and pb: pb.done() pb = None if verbose and to_delete: print "Removing unneeded records from index" pb = ProgressBar(len(to_delete)) while to_delete: if verbose and pb: pb.update(pb.total - len(to_delete)) conn.delete(q='id:(%s)' % ' '.join(map(str, to_delete[:500]))) to_delete = to_delete[500:] if verbose and pb: pb.done()
def search(request):
    """Search Solr for the ``q`` GET parameter and return the view context.

    An empty query returns only ``{'query': query}``.  Otherwise each hit is
    passed through ``extract_response`` and hits that map to ``None`` are
    dropped.
    """
    query = request.GET.get('q', '')
    if not query:
        return {'query': query}

    conn = Solr(settings.SOLR_BASE)
    responses = []
    for hit in conn.search(query):
        extracted = extract_response(hit)
        if extracted is not None:
            responses.append(extracted)
    return {'query': query, 'responses': responses}
def loadsolr(request):
    """Replace all ``PollenSample`` rows with data fetched from Solr.

    Three data sets are queried in turn (pollen types, raw counts and
    percentages).  The created/updated totals are stored in the cache under
    ``solr_created`` / ``solr_updated``; if anything raises, the message is
    stored under ``solr_error`` instead.
    """
    collection_id = request.POST.get('collection_id', '')
    import_classification = request.POST.get('import_classification', '')
    solr = Solr(settings.CDRS_SOLR_URL)
    created_count = 0
    updated_count = 0

    PollenSample.objects.all().delete()

    options = {
        'qt': 'forest-data',
        'collection_id': collection_id,
        'rows': '1000',
        'json.nl': 'map',
    }
    try:
        # (data set label, processing function, extra positional arguments)
        steps = (
            ("Pollen Types", process_pollen_types, ()),
            ('Raw Counts of 65 Pollen Types', _process_samples, ("count",)),
            ('Percentages of 15 Pollen Types', _process_samples, ("percentage",)),
        )
        for label, handler, extra in steps:
            query = ('import_classifications:("' + import_classification
                     + '" AND "' + label + '")')
            results = solr.search(query, **options)
            created, updated = handler(results, *extra)
            created_count += created
            updated_count += updated

        cache.set('solr_created', created_count)
        cache.set('solr_updated', updated_count)
    except Exception as e:
        cache.set('solr_error', str(e))
def _solr_results_chunk(tup, buid, step):
    """Return the set of ``uid`` values for one page of a business unit's docs.

    ``tup`` is a (start_index, stop_index) tuple; only its first element is
    used, as the Solr ``start`` offset.  ``step`` is the page size and
    ``buid`` restricts the results through a filter query.
    """
    conn = Solr(settings.HAYSTACK_CONNECTIONS["default"]["URL"])
    response = conn.search(
        "*:*",
        fq="buid:%s" % buid,
        fl="uid",
        rows=step,
        start=tup[0],
        facet="false",
        mlt="false",
    )
    return set(doc["uid"] for doc in response.docs)
def admin_cdrs_import(request):
    """Import dataset metadata from Solr for the posted collections.

    Non-POST requests render the admin page.  On POST every collection id is
    paged through 1000 records at a time; each result carrying a
    ``dataset_id`` is passed to ``process_metadata`` and counted as created
    or updated.  Totals and the new import date go to the cache; any
    exception is stored under ``solr_error`` instead.
    """
    if request.method != 'POST':
        return render_to_response('portal/admin_cdrs.html', {})

    page_size = 1000
    created = 0
    updated = 0
    solr = Solr(settings.CDRS_SOLR_URL)

    post = request.POST
    application = post.get('application', '')
    collection_id = post.get('collection_id', '')
    import_classification = post.get('import_classification', '')
    dt = post.get('last_import_date', '')
    tm = urllib.unquote(post.get('last_import_time', '00:00'))

    q = 'import_classifications:"' + import_classification + '"'
    last_import_date = LastImportDate.get_last_import_date(dt, tm, application)
    if last_import_date:
        # Only fetch records modified since the previous import.
        utc = last_import_date.astimezone(FixedOffset(0))
        q += ' AND last_modified:[' + utc.strftime(
            '%Y-%m-%dT%H:%M:%SZ') + ' TO NOW]'

    try:
        for c in urllib.unquote(collection_id).split(","):
            # Get list of datasets in each collection id
            record_count = SolrUtilities().get_count_by_lastmodified(
                c, import_classification, last_import_date)
            retrieved = 0
            while retrieved < record_count:
                to_retrieve = min(page_size, record_count - retrieved)
                results = solr.search(
                    q,
                    qt='forest-data',
                    collection_id=c,
                    start=str(retrieved),
                    rows=str(to_retrieve),
                )
                for result in results:
                    if 'dataset_id' not in result:
                        continue
                    if process_metadata(result):
                        created += 1
                    else:
                        updated += 1
                retrieved += to_retrieve

        # Update the last import date
        lid = LastImportDate.update_last_import_date(application)
        cache.set('solr_import_date', lid.strftime('%Y-%m-%d'))
        cache.set('solr_import_time', lid.strftime('%H:%M:%S'))
        cache.set('solr_created', created)
        cache.set('solr_updated', updated)
    except Exception as e:
        cache.set('solr_error', str(e))
class InventoryTest(unittest.TestCase):
    """Tests Solr through ``Inventory``, using the ``vm`` core."""

    @classmethod
    def setUpClass(cls):
        """Read the Solr base URL from ``config/config.ini`` once per class."""
        #TODO: all config parsing - to module level setup
        config = ConfigParser()
        config.read("config/config.ini")
        cls.solr_url = config['DEFAULT']['solr_url'] + 'vm'
        #TODO: figure why logUtils.setup_logging(config)
        # NOTE(review): in the source this TODO and the setup_logging call
        # share one line, so it is unclear whether the call was meant to run;
        # it is left disabled here - confirm against the original file.

    def setUp(self):
        """Empty the core and create a fresh ``Inventory`` before each test."""
        # Chain to the parent's setUp (the original called tearDown here).
        super(InventoryTest, self).setUp()
        self.solr = Solr(InventoryTest.solr_url, timeout=2)
        self.solr.delete(q='*:*')
        self.inventory = Inventory(InventoryTest.solr_url)

    def testAdd(self):
        """Posting VMDATA makes two documents searchable, one of them by name."""
        self.assertEqual(len(self.solr.search('*:*')), 0)
        self.inventory.post(VMDATA)
        self.assertEqual(len(self.solr.search('*:*')), 2)
        self.assertEqual(len(self.solr.search(VMDATA[0]['name'])), 1)

    def testVmUpdate(self):
        """A ``set`` update of ``memory`` becomes visible to a field search."""
        self.inventory.post(VMDATA)
        self.assertEqual(len(self.solr.search('memory:513')), 0)
        vm_update_data = [{
            'id': VMDATA[0]['id'],
            'fields': [
                {'name': 'power', 'value': 'off', 'command': 'set'},
                {'name': 'memory', 'value': 513, 'command': 'set'},
            ],
        }]
        self.inventory.put(vm_update_data)
        self.assertEqual(len(self.solr.search('memory:513')), 1)
def task_check_solr_count(buid, count):
    """Warn and send a mail when Solr's job count for ``buid`` differs from ``count``.

    Args:
        buid: Business unit id (converted with ``int``).
        count: Expected number of jobs (converted with ``int``).
    """
    buid = int(buid)
    conn = Solr(settings.HAYSTACK_CONNECTIONS['default']['URL'])
    hits = conn.search(q="buid:%s" % buid, rows=1, mlt="false",
                       facet="false").hits
    if int(count) == int(hits):
        return

    # Logger.warn is a deprecated alias; warning() is the supported spelling.
    logger.warning("For BUID: %s, we expected %s jobs, but have %s jobs",
                   buid, count, hits)
    send_mail(recipient_list=["*****@*****.**"],
              from_email="*****@*****.**",
              subject="Buid Count for %s is incorrect." % buid,
              message="For BUID: %s, we expected %s jobs, but have %s jobs. "
                      "Check imports for this buid." % (buid, count, hits),
              fail_silently=False)
def find_by_place_or_kwords(name, kwords, loc_boost=None, start=0, max=None, accounts=None):
    """Search by place when ``name`` is given, otherwise by keywords only.

    Args:
        name: Place name; when truthy the call is delegated to
            ``find_by_place`` with the remaining arguments.
        kwords: Keyword string; ``None`` or blank searches for ``*:*``.
        start: Offset of the first result.
        max: Requested row count, passed through ``minmax`` together with
            ``settings.SOLR_ROWS``.

    Returns:
        Whatever ``find_by_place`` returns when ``name`` is given, else the
        tuple ``(None, results)``.
    """
    if name:
        return find_by_place(name, kwords, loc_boost, start, max, accounts)

    # keywords only
    kw = {
        'start': start,
        'rows': minmax(0, settings.SOLR_ROWS, max, settings.SOLR_ROWS),
        'fl': '*,score',
        'qt': 'resources',
    }
    # Tolerate kwords=None instead of failing on None.strip().
    query = (kwords or '').strip() or '*:*'
    # Connect only on the path that actually queries Solr.
    conn = Solr(settings.SOLR_URL)
    return None, conn.search(query, **kw)
def search(request):
    """Search topics through Solr and render a page of results.

    The ``phrase`` GET parameter is normalised (leading wildcard, a few
    special characters and unknown ``field:`` prefixes are stripped) before
    it is sent to Solr.  Matching ids are loaded as ``Topic`` objects and
    paginated using the ``page`` GET parameter.

    Raises:
        Http404: when ``phrase`` is missing/empty or the page is invalid.
    """
    import re
    from pysolr import Solr
    from stats.models import DailySearch
    from settings import SOLR_URL

    def _fail(query):
        # phrase is not changed, query is normalized phrase
        return render_to_response('search_results.html', {
            'result': [],
            'query': query,
            'phrase': query,
        }, context_instance=RequestContext(request))

    phrase = request.GET.get('phrase')
    try:
        conn = Solr(SOLR_URL)
    except Exception:
        # Best effort: show an empty result page if the client cannot be
        # built, without swallowing SystemExit/KeyboardInterrupt as a bare
        # ``except`` would.
        return _fail(phrase)

    if not phrase:
        raise Http404("Malformed request.")

    q = phrase
    if phrase.startswith(('*', '?')):
        q = phrase[1:]
    q = q.strip()
    q = re.sub(r'[\[<>@\]]', '', q)
    q = q.replace('`', '"')
    q = re.sub(r'\s*:', ':', q)
    # Turn ':' into a space unless it directly follows a known field name.
    q = re.sub('(?<!author)(?<!title)(?<!text)(?<!file)(?<!tag)(?<!artist)(?<!album)(?<!year)(?<!company)(?<!created):', ' ', q)
    if not q:
        return _fail(phrase)

    results = conn.search(q)
    if not results:
        return _fail(q)

    ids = [hit['id'] for hit in results]
    result = QuerySetPaginator(Topic.objects.filter(pk__in=ids),
                               RESULTS_ON_PAGE, orphans=5)
    if result.num_pages == 0:
        return _fail(q)

    # Record the normalised phrase for the daily search statistics.
    DailySearch.objects.create(phrase=q.strip())

    page = request.GET.get('page', 1)
    try:
        page = int(page)
        r = result.page(page)
    except (InvalidPage, ValueError):
        raise Http404("No such page")

    return render_to_response('search_results.html', {
        'result': r,
        'query': q,
        'phrase': phrase,
        'page': page,
        'title': phrase,
    }, context_instance=RequestContext(request))
def find_by_solr(self, q):
    """Search solr index for q and return the matching model instances.

    Returns an empty list when ``settings.SOLR['running']`` is falsy.
    """
    if not settings.SOLR['running']:
        return []

    con = Solr(settings.SOLR_URL)
    solr_results = con.search("(%s) AND class: %s" % (q, 'Item'))

    # Convert hits back into models and add to the results list
    results = []
    for doc in solr_results.docs:
        # NOTE(review): eval() runs text read from the index; it relies on
        # the ``class`` field only ever holding a trusted model name.
        model_cls = eval(doc['class'])
        matches = model_cls.objects.filter(id=doc['id'])
        if len(matches) > 0:
            results.append(matches[0])
    return results
def test_dismax(self): """docstring for test_dismax""" conn = Solr(SOLR_URL) kwords = 'citizens advice' kw = { 'rows': settings.SOLR_ROWS, 'fl': '*,score', 'qt': 'resources', } results = conn.search(kwords, **kw) print '\n--\nsearch on [%s] : ' % (kwords) for result in results: print '-', result['score'], result['title'] #, result['pt_location']
def find_by_place_or_kwords(name, kwords, loc_boost=None, start=0, max=None, accounts=None, event=None):
    """Search by place when ``name`` is given, otherwise by keywords only.

    Args:
        name: Place name; when truthy the call is delegated to
            ``find_by_place`` with the remaining arguments.
        kwords: Keyword string; ``None`` or blank searches for ``*:*``.
        start: Offset of the first result.
        max: Requested row count, passed through ``minmax`` together with
            ``settings.SOLR_ROWS``.
        accounts, event: Passed to ``_make_fq`` to build an optional filter
            query.

    Returns:
        Whatever ``find_by_place`` returns when ``name`` is given, else the
        tuple ``(None, results)``.
    """
    if name:
        return find_by_place(name, kwords, loc_boost, start, max, accounts, event)

    # keywords only
    kw = {
        'start': start,
        'rows': minmax(0, settings.SOLR_ROWS, max, settings.SOLR_ROWS),
        'fl': '*,score',
        'qt': 'resources',
    }
    # example 'fq': '(event_start:[NOW/DAY TO *] OR event_end:[NOW/DAY TO *]) AND accounts:4d9b99d889cb16665c000000'
    fq = _make_fq(event, accounts)
    if fq:
        kw['fq'] = fq

    # Tolerate kwords=None instead of failing on None.strip().
    query = (kwords or '').strip() or '*:*'
    # Connect only on the path that actually queries Solr.
    conn = Solr(settings.SOLR_URL)
    return None, conn.search(query, **kw)
def search(self, q, sort=None, start=None, rows=None, facets=None, facet_limit=-1, facet_mincount=0, fields=None):
    """Query Solr and return the matching ``Record`` objects.

    Args:
        q: Solr query string.
        sort, start, rows, facets, facet_limit, facet_mincount: Passed
            through to the Solr search call unchanged.
        fields: Optional list of fields to request; ``id`` and
            ``presentations`` are always added.  The caller's list is not
            modified.

    Returns:
        A tuple ``(hits, records, facets)`` where ``records`` is a list of
        the ``Record`` objects found for the returned ids, in result order.
        Records with a truthy ``presentations`` value in their Solr
        document get it assigned as ``solr_presentation_ids``.
    """
    # Work on a copy so the required fields are not appended to the
    # caller's list.
    fields = list(fields) if fields else []
    for required in ('id', 'presentations'):
        if required not in fields:
            fields.append(required)

    conn = Solr(settings.SOLR_URL)
    result = conn.search(q, sort=sort, start=start, rows=rows,
                         facets=facets, facet_limit=facet_limit,
                         facet_mincount=facet_mincount, fields=fields)

    ids = [int(r['id']) for r in result]
    records = Record.objects.in_bulk(ids)
    for r in result:
        record = records.get(int(r['id']))
        presentations = r.get('presentations')
        if record and presentations:
            record.solr_presentation_ids = presentations

    # Keep Solr's ordering and drop ids that have no database record.
    found = [rec for rec in (records.get(i) for i in ids) if rec]
    return (result.hits, found, result.facets)
def test_postcode(self): conn = Solr(SOLR_URL) aberdeen = 'Ab10 1AX' # peterheid = '57.584806, -1.875630' # keith = '57.7036280142534, -2.85720247750133' print '\n\n*** aberdeen', aberdeen loc = get_place_for_postcode(aberdeen, DB_NAME) print loc srch = '"mental health"' # search(self, q, **kwargs) kw = { 'sfield': 'pt_location', 'pt': lat_lon_to_str(loc['lat_lon']), 'sort': 'geodist() asc', 'fl': '*,score' } # kw = { 'fq':'{!geofilt pt=55.8,-3.10 sfield=store d=50}' } results = conn.search(srch, **kw) print '\n--\nsearch on [%s] : %s' % (srch, loc['lat_lon']) for result in results: print '-', result['score'], result['title'], result['pt_location']
def test_dismax_events(self): """docstring for test_dismax""" conn = Solr(SOLR_URL) kwords = 'dance, music' kw = { 'rows': SOLR_ROWS, 'fl': '*,score', 'qt': 'resources', # 'fq': '(event_start:[NOW/DAY TO *] OR event_end:[NOW/DAY TO *]) AND accounts:4d9b99d889cb16665c000000' 'fq': '(event_start:[NOW/DAY TO *] OR event_end:[NOW/DAY TO *])' } # a_type:2 AND a_begin_date:[1990-01-01T00:00:00.000Z TO 1999-12-31T24:59:99.999Z] # regex to check date formats # and check 1 < 2 results = conn.search(kwords, **kw) print '\n--\nsearch on [%s] : ' % (kwords) for result in results: print '-', result['score'], result['title'], ', ', result.get('event_start', '-'), result.get('event_end', '-')
def find_by_place(name, kwords, loc_boost=None, start=0, max=None, accounts=None, collections=None, event=None, res_type=settings.SOLR_RES):
    """Search resources boosted by proximity to the location named ``name``.

    Returns ``(loc, results)``, or ``(None, None)`` when
    ``get_or_create_location`` yields nothing for ``name``.  A filter query
    is added when ``_make_fq`` produces one.
    """
    loc = get_or_create_location(name)
    if not loc:
        return None, None

    boost = loc_boost or settings.SOLR_LOC_BOOST_DEFAULT
    kw = {
        'start': start,
        'rows': minmax(0, settings.SOLR_ROWS, max, settings.SOLR_ROWS),
        'fl': '*,score',
        # 'fq': 'accounts:(4d9c3ced89cb162e5e000000 OR 4d9b99d889cb16665c000000) ',
        'qt': 'resources',
        'sfield': 'pt_location',
        'pt': lat_lon_to_str(loc['lat_lon']),
        'bf': 'recip(geodist(),2,200,20)^%s' % boost,
        'sort': 'score desc',
    }
    fq = _make_fq(event, accounts, collections, res_type)
    if fq:
        kw['fq'] = fq

    conn = Solr(settings.SOLR_URL)
    query = kwords.strip() if kwords else ''
    return loc, conn.search(query, **kw)
def find_by_place(name, kwords, loc_boost=None, start=0, max=None, accounts=None):
    """Search resources boosted by proximity to a postcode or place name.

    Args:
        name: Postcode or place name; looked up as a postcode first, then
            as a place name.
        kwords: Keyword string; ``None`` or blank searches for ``*:*``.
        loc_boost: Boost applied to the distance function; defaults to
            ``settings.SOLR_LOC_BOOST_DEFAULT``.
        start: Offset of the first result.
        max: Requested row count, passed through ``minmax`` together with
            ``settings.SOLR_ROWS``.
        accounts: Optional iterable of account ids, OR-ed into a filter
            query.

    Returns:
        ``(lat_lon, results)``, or ``(None, None)`` when the place cannot
        be resolved.
    """
    loc = get_place_for_postcode(name) or get_place_for_placename(name)
    if not loc:
        return None, None

    kw = {
        'start': start,
        'rows': minmax(0, settings.SOLR_ROWS, max, settings.SOLR_ROWS),
        'fl': '*,score',
        # 'fq': 'accounts:(4d9c3ced89cb162e5e000000 OR 4d9b99d889cb16665c000000) ',
        'qt': 'resources',
        'sfield': 'pt_location',
        'pt': lat_lon_to_str(loc['lat_lon']),
        'bf': 'recip(geodist(),2,200,20)^%s' % (loc_boost or settings.SOLR_LOC_BOOST_DEFAULT),
        'sort': 'score desc',
    }
    if accounts:
        kw['fq'] = 'accounts:(%s)' % ' OR '.join(accounts)

    # Tolerate kwords=None instead of failing on None.strip().
    query = (kwords or '').strip() or '*:*'
    # Connect only once the place is known to resolve.
    conn = Solr(settings.SOLR_URL)
    return loc['lat_lon'], conn.search(query, **kw)
def test_test(self): # print 'starting solr test' conn = Solr(SOLR_URL) # self._rebuild_index(conn) ellon = '57.365287, -2.070642' peterheid = '57.584806, -1.875630' keith = '57.7036280142534, -2.85720247750133' loc = keith print '\n\n*** keith ', loc srch = '"mental health"' # search(self, q, **kwargs) kw = { 'sfield': 'pt_location', 'pt': loc, 'sort': 'geodist() asc', 'fl': '*,score' } # kw = { 'fq':'{!geofilt pt=55.8,-3.10 sfield=store d=50}' } results = conn.search(srch, **kw) print '\n--\nsearch on [%s] : %s' % (srch, loc) for result in results: print '-', result['res_id'], result['score'], result['title'], result['pt_location']
def get_solr_tagcloud(language='ru'):
    """Return tag counts for ``language`` taken from Solr's ``tags`` facet.

    Args:
        language: Value matched against the ``language`` field.

    Returns:
        A dict mapping each tag to ``{'count': <count>, 'font': None}``.
        The lookup is best effort: on any error the problem is logged and
        whatever was collected so far (possibly nothing) is returned.
    """
    import logging

    tag_info = {}
    try:
        solr = Solr(get_config('HAYSTACK_SOLR_URL', 'http://127.0.0.1:8983/solr'))
        kwargs = {
            'facet': 'on',
            'facet.field': 'tags',
            'rows': 0,
            'start': 1,
        }
        result = solr.search(q='language:%s' % language, **kwargs)
        tags = result.facets['facet_fields']['tags']
        # The facet arrives as a flat [tag, count, tag, count, ...] list;
        # step through it in pairs, ignoring a trailing unpaired item.
        for i in range(0, len(tags) - 1, 2):
            tag_info[tags[i]] = {'count': tags[i + 1], 'font': None}
    except Exception:
        # Stay best effort, but leave a trace instead of hiding the failure.
        logging.getLogger(__name__).warning(
            "Could not build Solr tag cloud for language %r", language,
            exc_info=True)
    return tag_info
def get_count_by_lastmodified(self, collection_id, import_classification, last_import_date):
    """Count records of one import classification across the given collections.

    ``collection_id`` is a URL-quoted, comma-separated list of collection
    ids.  For each one a zero-row faceted query is run and the facet count
    of ``import_classification`` is added to the total.  When
    ``last_import_date`` is set only records modified since then are counted.
    """
    solr_conn = Solr(settings.CDRS_SOLR_URL)

    filter_query = 'import_classifications:"' + import_classification + '"'
    if last_import_date:
        utc = last_import_date.astimezone(FixedOffset(0))
        filter_query += ' AND last_modified:[' + \
            utc.strftime('%Y-%m-%dT%H:%M:%SZ') + ' TO NOW]'

    options = {
        'qt': 'forest-data',
        'facet': 'true',
        'facet.field': 'import_classifications',
        'facet.mincount': '1',
        'rows': '0',
        'fq': filter_query,
        'json.nl': 'map',
    }

    # Facet keys are compared against the unquoted classification name.
    import_classification = unquote(import_classification)

    record_count = 0
    for c in unquote(collection_id).split(","):
        # Get list of datasets in each collection id
        options['collection_id'] = c
        results = solr_conn.search('*:*', **options)
        facets = results.facets["facet_fields"]["import_classifications"]
        if import_classification in facets:
            record_count += facets[import_classification]
    return record_count
class JobFeedTestCase(TestCase):
    """Tests for parsing job feed files and loading them into Solr and the database."""

    def setUp(self):
        super(JobFeedTestCase, self).setUp()
        self.businessunit = BusinessUnitFactory.build()
        self.businessunit.save()
        self.buid_id = self.businessunit.id
        self.numjobs = 4
        self.testdir = os.path.abspath(os.path.dirname(__file__))
        self.conn = Solr("http://127.0.0.1:8983/solr/")
        self.emptyfeed = os.path.join(self.testdir, "dseo_feed_0.no_jobs.xml")

        # Ensures DATA_DIR used by import_jobs.download_feed_file exists
        if not os.path.exists(settings.DATA_DIR):
            os.mkdir(settings.DATA_DIR)

    def test_dev2_feed(self):
        """The parsed feed reports the business unit and yields the expected jobs."""
        feed = xmlparse.DEv2JobFeed(import_jobs.download_feed_file(self.buid_id))
        jobs = feed.jobparse()
        self.assertEqual(feed.jsid, self.buid_id)
        self.assertEqual(feed.company, self.businessunit.title)
        self.assertEqual(len(jobs), self.numjobs)

        # Test for the presence of every non-calculated field on the
        # jobListing model. (That is, all slugfields and 'location' are left
        # out.)
        expected_fields = set([
            'buid_id', 'city', 'country', 'country_short', 'date_new',
            'date_updated', 'description', 'hitkey', 'link', 'onet_id',
            'reqid', 'state', 'state_short', 'title', 'uid', 'zipcode',
        ])
        self.assertEqual(set(jobs[0].keys()), expected_fields)

    def test_mocids(self):
        """
        Tests that mocid fields exist when jobs are imported from a feed and
        added to a solr connection.

        """
        feed = xmlparse.DEv2JobFeed(import_jobs.download_feed_file(self.buid_id))
        jobs = feed.solr_jobs()

        # Since we're going to be adding/updating data in the Solr index,
        # we're hardcoding in the local Solr instance so that we don't
        # accidentally alter production data.
        self.conn.add(jobs)
        found = self.conn.search(q="*:*",
                                 fq="buid:%s -mocid:[* TO *]" % self.buid_id)
        self.assertEqual(found.hits, self.numjobs)
        for job in jobs:
            self.assertTrue('mocid' in job)

    def test_empty_feed(self):
        """
        Test that the v2 DirectEmployers feed file schema allows for empty
        feed files.

        """
        feed = xmlparse.DEv2JobFeed(self.emptyfeed)
        # If the schema is such that empty feed files are considered invalid,
        # trying to run jobparse() will throw an exception.
        self.assertEqual(len(feed.jobparse()), 0)

    def test_empty_solr(self):
        """
        Tests for the proper behavior when encountering a job-less, but
        otherwise valid, feed file. The proper behavior is to delete any jobs
        associated with that BusinessUnit from the Solr index.

        """
        buid_filter = "buid:%s" % self.buid_id

        # Normal download-and-parse operation on a feed file with jobs.
        import_jobs.update_solr(self.buid_id)
        before = self.conn.search(q="*:*", fq=buid_filter)
        self.assertEqual(before.hits, self.numjobs)

        # Download-and-parse operation on a feed file with no jobs. Expected
        # behavior is to delete all jobs.
        self._get_feedfile()
        import_jobs.update_solr(self.buid_id, download=False)
        after = self.conn.search(q="*:*", fq=buid_filter)
        self.assertEqual(after.hits, 0)

    def test_empty_db(self):
        """
        Tests for the proper behavior when encountering a job-less, but
        otherwise valid, feed file. The proper behavior is to delete any jobs
        associated with that BusinessUnit from the database.

        """
        # Normal download-and-parse operation on a feed file with jobs.
        import_jobs.refresh_bunit_jobs(self.buid_id)
        self.assertEqual(
            jobListing.objects.filter(buid=self.buid_id).count(),
            self.numjobs)

        # Download-and-parse operation on a feed file with no jobs. Expected
        # behavior is to delete all jobs.
        self._get_feedfile()
        import_jobs.refresh_bunit_jobs(self.buid_id, download=False)
        self.assertEqual(
            jobListing.objects.filter(buid=self.buid_id).count(), 0)

    def test_zipcode(self):
        """
        Tests to ensure proper behavior of zipcode field in being entered both
        in the database and Solr.

        """
        feed = xmlparse.DEv2JobFeed(import_jobs.download_feed_file(self.buid_id))
        solr_jobs = feed.solr_jobs()
        zips_from_feedfile = ["28243", "10095", "90212", "30309"]

        solr_zips = [job['zipcode'] for job in solr_jobs]
        db_zips = [job['zipcode'] for job in feed.jobparse()]
        for zips in (solr_zips, db_zips):
            self.assertItemsEqual(zips_from_feedfile, zips)

    def test_salt_date(self):
        """
        Test to ensure that job postings show up in a quasi-random fashion by
        sorting by the `salted_date` attribute in the index vice strictly by
        `date_new`.

        """
        feed = xmlparse.DEv2JobFeed(import_jobs.download_feed_file(self.buid_id))
        self.conn.add(feed.solr_jobs())

        by_salted = self.conn.search(q="*:*", sort="salted_date asc")
        self.assertEqual(self.numjobs, by_salted.hits)

        # We can't really test for inequality between the two result sets,
        # since sometimes the two orderings will coincide.
        by_date_new = self.conn.search(q="*:*", sort="date_new asc")
        self.assertItemsEqual(by_date_new.docs, by_salted.docs)

    def test_date_updated(self):
        """
        Test to ensure proper behavior of date updated field when added to
        Solr.

        """
        feed = xmlparse.DEv2JobFeed(import_jobs.download_feed_file(self.buid_id))
        solr_jobs = feed.solr_jobs()
        self.conn.add(solr_jobs)

        expected = datetime.datetime.strptime("5/17/2012 12:01:05 PM",
                                              "%m/%d/%Y %I:%M:%S %p")
        for job in solr_jobs:
            self.assertEqual(job['date_updated'], expected)

    def _get_feedfile(self):
        # Download the 'real' feed file, back it up as "<name>.bak", then
        # copy the empty feed file in its place.
        real_feed = import_jobs.download_feed_file(self.buid_id)
        shutil.copyfile(real_feed, "%s.bak" % real_feed)
        shutil.copyfile(self.emptyfeed, real_feed)
class SearchBackend(BaseSearchBackend):
    """Search backend that talks to Solr through a ``Solr`` connection object."""

    # Word reserved by Solr for special use.
    RESERVED_WORDS = (
        'AND',
        'NOT',
        'OR',
        'TO',
    )

    # Characters reserved by Solr for special use.
    # The '\\' must come first, so as not to overwrite the other slash
    # replacements.
    RESERVED_CHARACTERS = (
        '\\', '+', '-', '&&', '||', '!', '(', ')', '{', '}',
        '[', ']', '^', '"', '~', '*', '?', ':',
    )

    def __init__(self, site=None):
        super(SearchBackend, self).__init__(site)

        if not hasattr(settings, 'HAYSTACK_SOLR_URL'):
            raise ImproperlyConfigured('You must specify a HAYSTACK_SOLR_URL in your settings.')

        self.conn = Solr(
            settings.HAYSTACK_SOLR_URL,
            timeout=getattr(settings, 'HAYSTACK_SOLR_TIMEOUT', 10))

    def update(self, index, iterable, commit=True):
        """Build one document per object in ``iterable`` and add them to Solr.

        A ``UnicodeDecodeError`` stops the preparation; the documents built
        up to that point are still sent.
        """
        docs = []
        try:
            for obj in iterable:
                doc = {
                    'id': self.get_identifier(obj),
                    'django_ct': "%s.%s" % (obj._meta.app_label, obj._meta.module_name),
                    'django_id': force_unicode(obj.pk),
                }
                doc.update(index.prepare(obj))
                docs.append(doc)
        except UnicodeDecodeError:
            sys.stderr.write("Chunk failed.\n")

        self.conn.add(docs, commit=commit)

    def remove(self, obj_or_string, commit=True):
        """Delete the document identified by ``obj_or_string``."""
        self.conn.delete(id=self.get_identifier(obj_or_string), commit=commit)

    def clear(self, models=[], commit=True):
        """Delete all documents, or only those of ``models``, then optimize."""
        if models:
            per_model = [
                "django_ct:%s.%s" % (model._meta.app_label, model._meta.module_name)
                for model in models
            ]
            self.conn.delete(q=" OR ".join(per_model), commit=commit)
        else:
            # *:* matches all docs in Solr
            self.conn.delete(q='*:*', commit=commit)

        # Run an optimize post-clear.
        # http://wiki.apache.org/solr/FAQ#head-9aafb5d8dff5308e8ea4fcf4b71f19f029c4bb99
        self.conn.optimize()

    def search(self, query_string, sort_by=None, start_offset=0, end_offset=None,
               fields='', highlight=False, facets=None, date_facets=None,
               query_facets=None, narrow_queries=None, **kwargs):
        """Translate the arguments into Solr parameters and run the query.

        Returns ``[]`` for an empty query string, otherwise the dict built
        by ``_process_results``.  Extra keyword arguments are accepted but
        are not forwarded to Solr.
        """
        if len(query_string) == 0:
            return []

        params = {'fl': fields if fields else '* score'}

        if sort_by is not None:
            params['sort'] = sort_by
        if start_offset is not None:
            params['start'] = start_offset
        if end_offset is not None:
            params['rows'] = end_offset

        if highlight is True:
            params['hl'] = 'true'
            params['hl.fragsize'] = '200'

        if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False) is True:
            params['spellcheck'] = 'true'
            params['spellcheck.collate'] = 'true'
            params['spellcheck.count'] = 1

        if facets is not None:
            params['facet'] = 'on'
            params['facet.field'] = facets

        if date_facets is not None:
            params['facet'] = 'on'
            params['facet.date'] = date_facets.keys()
            for name, spec in date_facets.items():
                # Date-based facets in Solr kinda suck.
                prefix = "f.%s.facet.date." % name
                params[prefix + "start"] = self.conn._from_python(spec.get('start_date'))
                params[prefix + "end"] = self.conn._from_python(spec.get('end_date'))
                params[prefix + "gap"] = spec.get('gap')

        if query_facets is not None:
            params['facet'] = 'on'
            params['facet.query'] = [
                "%s:%s" % (field, value) for field, value in query_facets.items()
            ]

        if narrow_queries is not None:
            params['fq'] = list(narrow_queries)

        raw_results = self.conn.search(query_string, **params)
        return self._process_results(raw_results, highlight=highlight)

    def more_like_this(self, model_instance, additional_query_string=None,
                       start_offset=0, end_offset=None, **kwargs):
        """Run a more-like-this query on the content field of ``model_instance``'s index."""
        index = self.site.get_index(model_instance.__class__)
        field_name = index.get_content_field()

        params = {'fl': '*,score'}
        if start_offset is not None:
            params['start'] = start_offset
        if end_offset is not None:
            params['rows'] = end_offset
        if additional_query_string:
            params['fq'] = additional_query_string

        query = "id:%s" % self.get_identifier(model_instance)
        raw_results = self.conn.more_like_this(query, field_name, **params)
        return self._process_results(raw_results)

    def _process_results(self, raw_results, highlight=False):
        """Convert a raw Solr response into results, hit count, facets and spelling.

        Documents whose model cannot be resolved or is not indexed are left
        out and subtracted from the hit count.
        """
        from haystack import site

        results = []
        hits = raw_results.hits
        facets = {}
        spelling_suggestion = None

        if hasattr(raw_results, 'facets'):
            facets = {
                'fields': raw_results.facets.get('facet_fields', {}),
                'dates': raw_results.facets.get('facet_dates', {}),
                'queries': raw_results.facets.get('facet_queries', {}),
            }
            field_facets = facets['fields']
            for facet_field in field_facets:
                # Convert to a two-tuple, as Solr's json format returns a
                # list of pairs.
                flat = field_facets[facet_field]
                field_facets[facet_field] = zip(flat[::2], flat[1::2])

        if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False) is True:
            if hasattr(raw_results, 'spellcheck'):
                if len(raw_results.spellcheck.get('suggestions', [])):
                    # For some reason, it's an array of pairs. Pull off the
                    # collated result from the end.
                    spelling_suggestion = raw_results.spellcheck.get('suggestions')[-1]

        indexed_models = site.get_indexed_models()

        for raw_result in raw_results.docs:
            app_label, model_name = raw_result['django_ct'].split('.')

            additional_fields = {}
            for key, value in raw_result.items():
                additional_fields[str(key)] = self.conn._to_python(value)
            # These are passed to SearchResult positionally below.
            for positional in ('django_ct', 'django_id', 'score'):
                del additional_fields[positional]

            if raw_result['id'] in getattr(raw_results, 'highlighting', {}):
                additional_fields['highlighted'] = raw_results.highlighting[raw_result['id']]

            model = get_model(app_label, model_name)
            if model and model in indexed_models:
                results.append(SearchResult(app_label, model_name,
                                            raw_result['django_id'],
                                            raw_result['score'],
                                            **additional_fields))
            else:
                hits -= 1

        return {
            'results': results,
            'hits': hits,
            'facets': facets,
            'spelling_suggestion': spelling_suggestion,
        }
class Indexer(object):
    """Indexer for PRL.

    Keeps a Solr index, an S3 thumbnail bucket and two local LevelDB
    databases (harvester settings and per-record collection membership) in
    step with harvested metadata record files.
    """

    def __init__(self, args: Dict[str, Any]):
        """Stores the run-time arguments; no connection is opened here.

        Args:
            args: run-time options. The ``'dry_run'`` key is read by several
                methods to decide whether external services are touched.
        """
        self.solr = None
        self.s3 = None
        self.harvester_settings = None
        self.record_sets = None
        self.args = args
        # In-memory cache of OAI-PMH repository metadata, keyed by base URL.
        self.oai_pmh_cache = {}

    def connect(self):
        """Initializes the interfaces for all third-party services.

        Solr and S3 are skipped on a dry run.
        """
        self._connect_internal_services()
        if not self.args['dry_run']:
            self._connect_external_services()

    def _connect_internal_services(self):
        """Initializes the interfaces for all third-party services
        instantiated by this module (the two LevelDB databases).

        Raises:
            IndexerError: if a LevelDB database cannot be opened.
        """
        try:
            self.harvester_settings = plyvel.DB(
                os.path.expanduser(
                    os.environ.get('LEVELDB_HARVESTER_SETTINGS_DIRECTORY')),
                create_if_missing=True)
            self.record_sets = plyvel.DB(
                os.path.expanduser(
                    os.environ.get('LEVELDB_RECORD_SETS_DIRECTORY')),
                create_if_missing=True)
            self.set_harvester_settings()
        except plyvel.IOError as e:
            raise IndexerError(
                'Failed to instantiate LevelDB instance: {}'.format(
                    repr(e))) from e

    def _connect_external_services(self):
        """Initializes the interfaces for all third-party services NOT
        instantiated by this module (Solr and S3).

        Raises:
            IndexerError: if the Solr ping fails or the S3 session cannot be
                created.
        """
        try:
            solr_base_url = 'http://{}:{}/solr/{}'.format(
                os.environ.get('SOLR_HOST'), os.environ.get('SOLR_PORT'),
                os.environ.get('SOLR_CORE_NAME'))

            # Make sure we can connect to Solr.
            def solr_ping(base_url):
                """Raises an error if we can't connect to Solr."""
                o = urllib.parse.urlsplit(base_url)
                # URL paths always use forward slashes, so build the path
                # with string operations instead of os.path.join.
                ping_path = o.path.rstrip('/') + '/admin/ping'
                ping_url = urllib.parse.urlunsplit(
                    o[:2] + (ping_path, ) + o[3:])
                requests.get(ping_url).raise_for_status()

            solr_ping(solr_base_url)

            self.solr = Solr(solr_base_url, always_commit=True)
            self.s3 = boto3.Session(
                aws_access_key_id=os.environ.get('AWS_ACCESS_KEY_ID'),
                aws_secret_access_key=os.environ.get(
                    'AWS_SECRET_ACCESS_KEY'),
                region_name=os.environ.get('AWS_DEFAULT_REGION')).client(
                    's3')
        except requests.exceptions.RequestException as e:
            raise IndexerError('Connection failed: {}'.format(e)) from e
        except BotoCoreError as e:
            raise IndexerError('Failed to initialize S3 session: {}'.format(
                repr(e))) from e

    def disconnect(self):
        """Closes connections with all third-party services."""
        self._disconnect_internal_services()
        if not self.args['dry_run']:
            self._disconnect_external_services()

    def _disconnect_internal_services(self):
        """Closes connections with all third-party services instantiated by
        this module.

        Raises:
            IndexerError: if closing a LevelDB database fails.
        """
        try:
            self.harvester_settings.close()
            self.record_sets.close()
        except plyvel.Error as e:
            raise IndexerError(
                'Failed to close the connection to LevelDB: {}'.format(
                    e)) from e

    def _disconnect_external_services(self):
        """Closes connections with all third-party services NOT instantiated
        by this module (drops the Solr and S3 client references)."""
        self.solr = None
        self.s3 = None

    def get_harvester_settings_path(self) -> str:
        """Gets the full path of the file containing jOAI harvester
        settings."""
        return os.path.join(
            os.path.expanduser(
                os.environ.get('JOAI_HARVESTER_SETTINGS_DIRECTORY')),
            JOAI_SCHEDULED_HARVESTS_FILENAME)

    def get_harvester_settings_key(self, path: str) -> str:
        """Returns ``path`` relative to the JOAI_DATA_DIRECTORY directory.

        Intended to be called ONLY on paths representing
        institution/repository or collection/set directories, for which the
        result has either one or two components.
        """
        harvest_dir_prefix = os.environ.get('JOAI_DATA_DIRECTORY')
        return os.path.relpath(path, harvest_dir_prefix)

    def read_harvester_settings_file(self,
                                     path: str) -> Dict[str, Dict[str, str]]:
        """Returns a dictionary representing the harvester settings.

        First, tries reading the settings as if the source file is UTF-8
        encoded JSON of the following form (used for testing):

        {
            "harvester_settings_key_1": {
                "repository_name": "repository_name_1",
                "base_url": "http://example.edu/oai2",
                "set_spec": "set_spec_1",
                "split_by_set": false
            },
            ...
        }

        If the file cannot be decoded as text (UnicodeDecodeError), reads it
        as a serialized java.util.Hashtable instance from jOAI (used for
        production). A missing file yields an empty dictionary.

        Raises:
            IndexerError: if the file is text but not valid JSON, or on any
                other failure while reading it as JSON.
        """
        try:
            # See if it's in JSON already.
            with open(path, 'r') as harvester_settings_file:
                # Make sure we transform the key before storing.
                return {
                    self.get_harvester_settings_key(key): metadata
                    for key, metadata in json.load(
                        harvester_settings_file).items()
                }
        except JSONDecodeError as e:
            # Invalid JSON.
            raise IndexerError(
                'Cannot load scheduled harvests settings: {}'.format(
                    e)) from e
        except FileNotFoundError:
            # This file won't exist when no harvests have been scheduled, so
            # it's probably fine.
            logging.debug(
                'Scheduled harvests settings file does not exist: {}'.format(
                    path))
            return {}
        except UnicodeDecodeError as e:
            logging.debug('Config file is not JSON: {}'.format(e))

            # Open the file in binary mode and try to parse it with javaobj.
            with open(path, 'rb') as harvester_settings_file:
                pobj = javaobj.loads(harvester_settings_file.read())

            # Only annotations whose string form names the scheduled-harvest
            # class describe a harvest.
            return {
                self.get_harvester_settings_key(pobj_harvest.harvestDir.path):
                {
                    'repository_name': pobj_harvest.repositoryName,
                    'base_url': pobj_harvest.baseURL,
                    'set_spec': pobj_harvest.setSpec,
                    'split_by_set': pobj_harvest.splitBySet
                }
                for pobj_harvest in pobj.annotations
                if JOAI_SCHEDULED_HARVEST_CLASSNAME in str(pobj_harvest)
            }
        except Exception as e:
            # Something else went wrong.
            raise IndexerError(
                'Cannot load scheduled harvests settings: {}'.format(
                    e)) from e

    def set_harvester_settings(self):
        """Updates the harvester_settings LevelDB instance with the data
        stored in the source file.

        Responds to filesystem event on that file.
        """
        harvester_settings_path = self.get_harvester_settings_path()
        new_harvester_settings = self.read_harvester_settings_file(
            harvester_settings_path)
        deleted_keys = []
        updated_keys = []

        # Remove all keys from LevelDB that aren't in the harvester settings
        # file.
        harvester_settings_iterator = self.harvester_settings.iterator()
        for key, _value in harvester_settings_iterator:
            if key.decode() not in new_harvester_settings:
                self.harvester_settings.delete(key)
                deleted_keys.append(key)
        if deleted_keys:
            logging.info('Deleted harvester settings for %s', deleted_keys)

        # Add all keys in the harvester settings file to LevelDB, since some
        # of their values may have changed.
        for harvest_key, harvest_metadata in new_harvester_settings.items():
            value = json.dumps(harvest_metadata)
            self.harvester_settings.put(harvest_key.encode(), value.encode())
            updated_keys.append(harvest_key)
        if updated_keys:
            logging.info('Updated harvester settings for %s', updated_keys)

    def update_record(self, path: str):
        """Updates a metadata record in PRL.

        Responds to IndexerEventHandler.on_modified filesystem event.

        On a dry run nothing is read or written; only a log line is emitted.

        Raises:
            IndexerError: if the Solr add or the LevelDB put fails.
        """
        if self.args['dry_run']:
            # No record metadata has been looked up on this path, so derive
            # the identifier from the filename the same way
            # get_key_record_metadata does.
            record_identifier = urllib.parse.unquote(
                os.path.splitext(os.path.basename(path))[0])
            logging.info('DRY-RUN: %s updated in PRL', record_identifier)
            return

        record_metadata = self.get_key_record_metadata(path)
        record_identifier = record_metadata[0]
        record_sets_serialized_encoded = self.record_sets.get(
            record_identifier.encode())

        # Generate a Solr document from the metadata record.
        with open(path, 'r', encoding='utf-8') as record_file:
            prl_solr_document = self.get_solr_document(record_file)

        # If there is a thumbnail, save it to the system.
        if prl_solr_document.original_thumbnail_metadata():
            self.save_thumbnail(prl_solr_document)

        record_identifier = prl_solr_document.id

        # Determine whether or not this is a create or an update.
        if record_sets_serialized_encoded is None:
            action = 'create'
        else:
            action = 'update'
            # If we've processed this record in the past, make sure we don't
            # completely overwrite the collectionKey or collectionName
            # fields. We save these locally in LevelDB.
            record_sets = json.loads(record_sets_serialized_encoded.decode())
            prl_solr_document.complete_collection_list(
                record_sets['collectionKey'], record_sets['collectionName'])

        pysolr_doc = prl_solr_document.get_pysolr_doc()
        collection_key = pysolr_doc['collectionKey']
        collection_name = pysolr_doc['collectionName']

        try:
            self.solr.add([pysolr_doc], overwrite=True)
            logging.debug('%s %sd in Solr', record_identifier, action)

            self.record_sets.put(
                record_identifier.encode(),
                json.dumps({
                    'collectionKey': collection_key,
                    'collectionName': collection_name
                }).encode())
            logging.info('%s %sd in PRL', record_identifier, action)
        except plyvel.Error as e:
            # Roll back the Solr write so Solr and LevelDB stay in step.
            self.solr.delete(id=record_identifier)
            raise IndexerError(
                'Failed to PUT on LevelDB: {}'.format(e)) from e
        except Exception as e:
            raise IndexerError(
                'Failed to update Solr document: {}'.format(e)) from e

    def remove_record(self, path: str):
        """Removes a metadata record from PRL.

        Responds to IndexerEventHandler.on_deleted filesystem event.

        A record that belongs to a single collection is deleted outright
        (thumbnail, Solr document and LevelDB entry). A record that belongs
        to several collections only has the collection derived from ``path``
        removed from its collection lists.

        Raises:
            IndexerError: if a Solr or LevelDB operation fails.
        """
        if self.args['dry_run']:
            logging.info('DRY-RUN: Removed %s', path)
            return

        try:
            record_metadata = self.get_key_record_metadata(path)
            record_identifier = record_metadata[0]
            # We're certain that our serialized JSON is valid.
            record_sets = json.loads(
                self.record_sets.get(record_identifier.encode()).decode())
        except plyvel.Error as e:
            raise IndexerError(
                'Failed to GET on LevelDB: {}'.format(e)) from e

        # Either remove the record from the system, or update it.
        if len(record_sets['collectionKey']) == 1:
            # Remove the thumbnail if there is one.
            try:
                pysolr_doc = self.solr.search(
                    'id:"{0}"'.format(record_identifier)).docs[0]
            except Exception as e:
                raise IndexerError('Failed to GET {} from Solr: {}'.format(
                    record_identifier, e)) from e
            if 'thumbnail_url' in pysolr_doc:
                self.unsave_thumbnail(pysolr_doc['thumbnail_url'],
                                      record_identifier)

            # Remove the document from Solr.
            try:
                self.solr.delete(id=record_identifier)
            except Exception as e:
                raise IndexerError(
                    'Failed to DELETE {} from Solr: {}'.format(
                        record_identifier, e)) from e
            logging.debug('%s removed from Solr', record_identifier)

            try:
                self.record_sets.delete(record_identifier.encode())
            except plyvel.Error as e:
                raise IndexerError(
                    'Failed to DELETE on LevelDB: {}'.format(e)) from e
            logging.info('%s removed from PRL', record_identifier)
        else:
            # Update the list of collections that the record belongs to.
            # This is the case when a record belongs to more than one
            # OAI-PMH set. record_metadata[3] / [4] are the collection key
            # and collection name derived from the path.
            collection_key = [
                key for key in record_sets['collectionKey']
                if key != record_metadata[3]
            ]
            collection_name = [
                name for name in record_sets['collectionName']
                if name != record_metadata[4]
            ]

            pysolr_doc = {
                'id': record_identifier,
                'collectionKey': collection_key,
                'collectionName': collection_name
            }

            try:
                self.solr.add([pysolr_doc],
                              fieldUpdates={
                                  'collectionKey': 'set',
                                  'collectionName': 'set'
                              },
                              overwrite=True)
            except Exception as e:
                raise IndexerError('Failed to POST {} on Solr: {}'.format(
                    record_identifier, e)) from e
            logging.debug('%s updated in Solr (removed from collection %s)',
                          record_identifier, record_metadata[3])

            try:
                self.record_sets.put(
                    record_identifier.encode(),
                    json.dumps({
                        'collectionKey': collection_key,
                        'collectionName': collection_name
                    }).encode())
            except plyvel.Error as e:
                raise IndexerError(
                    'Failed to PUT on LevelDB: {}'.format(e)) from e
            logging.info('%s updated in PRL (removed from collection %s)',
                         record_identifier, record_metadata[3])

    def get_oai_pmh_metadata(self, base_url: str) -> Dict[str, str]:
        """Returns a dictionary containing top-level metadata and set
        metadata of an OAI-PMH repository.

        The result may contain ``'repository_identifier'``,
        ``'repository_name'`` and ``'sets'`` (a setSpec -> setName mapping);
        each key is present only if the repository provides it.

        Raises:
            IndexerError: if a request to the repository fails.
        """
        logging.debug(
            'Retrieving repository and set metadata from OAI-PMH repository %s',
            base_url)
        try:
            metadata = {}

            # All repositories should have this metadata.
            repository_metadata = Sickle(base_url, timeout=60).Identify()
            if hasattr(repository_metadata, 'repositoryIdentifier'):
                metadata['repository_identifier'] = (
                    repository_metadata.repositoryIdentifier)
            if hasattr(repository_metadata, 'repositoryName'):
                metadata['repository_name'] = (
                    repository_metadata.repositoryName)

            # Not all repositories will support sets.
            try:
                set_metadata = Sickle(base_url, timeout=60).ListSets()
                metadata.update(
                    {'sets': {s.setSpec: s.setName
                              for s in set_metadata}})
            except sickle.oaiexceptions.NoSetHierarchy as e:
                logging.debug(
                    'Failed to list sets from OAI-PMH repository %s: %s',
                    base_url, e)

            return metadata

        except requests.RequestException as e:
            raise IndexerError(
                'Failed to get repository metadata from OAI-PMH repository {}: {}'
                .format(base_url, e)) from e

    def get_solr_document(self,
                          file_object: TextIOWrapper) -> PRLSolrDocument:
        """Builds a Solr document for PRL.

        On a dry run a placeholder S3 domain name is used instead of the
        AWS_S3_BUCKET_DOMAIN_NAME environment variable.
        """
        (identifier, institution_key, institution_name, collection_key,
         collection_name) = self.get_key_record_metadata(file_object.name)

        if self.args['dry_run']:
            s3_domain_name = 'example.com'
        else:
            s3_domain_name = os.environ.get('AWS_S3_BUCKET_DOMAIN_NAME')

        return PRLSolrDocument(file_object, identifier, institution_key,
                               institution_name, collection_key,
                               collection_name, s3_domain_name)

    def get_key_record_metadata(self, file_path: str):
        """Determines collection and institution metadata from the filepath
        of the record.

        Returns a 5-tuple containing the following elements:
            - an identifier for the record
            - an identifier for the institution
            - a human-readable string for the institution
            - an identifier for the collection
            - a human-readable string for the collection

        Side effects:
            - updates the in-memory cache (``self.oai_pmh_cache``) with
              OAI-PMH repository metadata

        Raises:
            IndexerError: if no harvester settings exist for the record's
                directory (or its parent), if they cannot be read or parsed,
                or if the harvest configuration is not one of the three
                supported combinations of set_spec and split_by_set.
        """

        # ---------------------------------------- #
        # --- Gather all the data we can find. --- #
        # ---------------------------------------- #

        # Get the record identifier from the filename.
        identifier = urllib.parse.unquote(
            os.path.splitext(os.path.basename(file_path))[0])

        try:
            # The harvester settings will tell us how to get the other
            # metadata. They are keyed by either the record's directory or
            # that directory's parent.
            harvester_settings_key = None
            harvester_settings_serialized_encoded = None
            candidate_directories = (
                os.path.dirname(file_path),
                os.path.dirname(os.path.dirname(file_path)),
            )

            # Keep track of keys that we tried, but failed.
            tried_keys = []
            for directory in candidate_directories:
                candidate_key = self.get_harvester_settings_key(directory)
                candidate_value = self.harvester_settings.get(
                    candidate_key.encode())
                if candidate_value:
                    # Found it!
                    harvester_settings_key = candidate_key
                    harvester_settings_serialized_encoded = candidate_value
                    break
                tried_keys.append(candidate_key)

            if harvester_settings_key is None:
                # This should never happen. Harvester settings should
                # represent all harvested files.
                raise IndexerError(
                    'Cannot find harvester settings in LevelDB for {}'.format(
                        tried_keys))

            harvester_settings = json.loads(
                harvester_settings_serialized_encoded.decode())

        except plyvel.Error as e:
            # We can't go on without LevelDB.
            raise IndexerError(
                'Failed to GET on LevelDB: {}'.format(e)) from e
        except AttributeError as e:
            # This should never happen. Harvester settings should represent
            # all harvested files.
            raise IndexerError(
                'Cannot find harvester settings in LevelDB for {}'.format(
                    harvester_settings_key)) from e
        except JSONDecodeError as e:
            # This should never happen.
            raise IndexerError(
                'Harvester settings are not valid JSON: {}'.format(e)) from e

        base_url = harvester_settings['base_url']
        institution_name = harvester_settings['repository_name']
        set_spec = harvester_settings['set_spec']
        split_by_set = harvester_settings['split_by_set']

        # Fetch repository metadata, and write to the in-memory cache if
        # necessary.
        if base_url in self.oai_pmh_cache:
            oai_pmh_metadata = self.oai_pmh_cache[base_url]
        else:
            oai_pmh_metadata = self.get_oai_pmh_metadata(base_url)
            self.oai_pmh_cache[base_url] = oai_pmh_metadata

        # ----------------------------------------- #
        # --- Determine which values to return. --- #
        # ----------------------------------------- #

        # This is the most common case: an institution specifies a specific
        # set for us to harvest.
        individual_set_harvest = set_spec != '' and not split_by_set

        # This is the case when an institution wants us to harvest all sets
        # from their repository.
        full_repository_harvest = set_spec == '' and split_by_set

        # This is the case when an institution wants us to treat their entire
        # repository as a PRL "collection".
        single_collection_repository = set_spec == '' and not split_by_set

        # Set the return values.
        if individual_set_harvest:
            institution_key = os.path.dirname(harvester_settings_key)
            collection_key = set_spec
            collection_name = oai_pmh_metadata['sets'][set_spec]

        elif full_repository_harvest:
            institution_key = harvester_settings_key
            collection_key = os.path.basename(os.path.dirname(file_path))
            collection_name = oai_pmh_metadata['sets'][collection_key]

        elif single_collection_repository:
            institution_key = os.path.dirname(harvester_settings_key)
            collection_key = os.path.basename(harvester_settings_key)
            collection_name = oai_pmh_metadata['repository_name']
        else:
            # set_spec given together with split_by_set.
            raise IndexerError(
                'Unable to handle harvest configuration: {}'.format(
                    harvester_settings_key))

        return (identifier, institution_key, institution_name,
                collection_key, collection_name)

    def save_thumbnail(self, prl_solr_document: PRLSolrDocument):
        """Puts thumbnail on the local filesystem and on S3.

        Returns the Boolean value of whether or not a thumbnail was saved."""
        thumbnail_path = self.download_thumbnail(prl_solr_document)
        if not thumbnail_path:
            return False
        self.upload_thumbnail(prl_solr_document, thumbnail_path)
        logging.debug('%s thumbnail saved',
                      prl_solr_document.get_record_identifier())
        return True

    def download_thumbnail(self, prl_solr_document: PRLSolrDocument):
        """Puts the thumbnail file in its place on the file system.

        Retries up to three times on a timeout. A response that is not an
        image (including one without a Content-Type header) is treated as
        "no thumbnail".

        Returns its path, or None if no thumbnail could be fetched.

        Raises:
            IndexerError: on any failure other than a request or I/O error
                while fetching and writing the image.
        """
        # TODO: need better exception handling here
        thumbnail_s3_key = prl_solr_document.get_thumbnail_s3_key()
        try:
            # NOTE(review): this path is not passed through expanduser,
            # while the renamed path below is; the two differ if
            # THUMBNAILS_DIRECTORY contains '~'. Confirm which is intended.
            filepath = os.path.join(
                os.path.abspath(os.environ.get('THUMBNAILS_DIRECTORY')),
                thumbnail_s3_key)
            os.makedirs(os.path.dirname(filepath), exist_ok=True)
            original_thumbnail_url = (
                prl_solr_document.original_thumbnail_metadata()['url'])
            n_tries = 3
            for try_i in range(1, n_tries + 1):
                try:
                    response = requests.get(original_thumbnail_url,
                                            timeout=30,
                                            stream=True)
                    # Fail on 4xx or 5xx
                    response.raise_for_status()

                    # Make sure the Content-Type is what we expect. A
                    # missing header becomes '' so it is rejected here
                    # instead of making re.match raise TypeError.
                    response_content_type = (
                        response.headers.get('Content-Type') or '')
                    if not re.match('image/.+', response_content_type):
                        logging.debug('Robots cannot access %s',
                                      original_thumbnail_url)
                        return None

                    with open(filepath, 'wb') as image_file:
                        for chunk in response.iter_content(chunk_size=1024):
                            image_file.write(chunk)
                    logging.debug(
                        '%s thumbnail put on local filesystem at %s',
                        thumbnail_s3_key, filepath)

                    if prl_solr_document.has_thumbnail_format():
                        return filepath

                    # Determine the format and rename the image file to use
                    # the newly-determined filetype ext
                    prl_solr_document.set_thumbnail_format(
                        response_content_type)
                    new_filepath = os.path.join(
                        os.path.abspath(
                            os.path.expanduser(
                                os.environ.get('THUMBNAILS_DIRECTORY'))),
                        prl_solr_document.get_thumbnail_s3_key())
                    logging.debug('renaming %s -> %s', filepath,
                                  new_filepath)
                    os.rename(filepath, new_filepath)
                    return new_filepath
                except requests.Timeout as e:
                    if try_i < n_tries:
                        msg = 'Thumbnail download timed out, retrying...'
                        logging.info(msg)
                        # Continue loop
                    else:
                        # No more tries left, so fail
                        msg = 'Failed to download thumbnail after {} tries: {}'.format(
                            n_tries, str(e))
                        logging.debug(msg)
                        return None
                except (requests.RequestException, IOError) as e:
                    msg = 'Failed to download thumbnail: {}'.format(e)
                    logging.debug(msg)
                    return None
        except Exception as e:
            raise IndexerError(
                'Failed to put thumbnail on local filesystem: {}'.format(
                    e)) from e

    def upload_thumbnail(self, prl_solr_document: PRLSolrDocument,
                         filepath: str):
        """Puts the thumbnail on S3.

        Raises:
            IndexerError: if the S3 upload raises a BotoCoreError.
        """
        # Determine a URL for the thumbnail now that we've downloaded it and
        # know the image format
        prl_solr_document.add_thumbnail_url()

        try:
            # Open the file in a context manager so the handle is closed
            # once the upload finishes or fails.
            with open(filepath, 'rb') as thumbnail_file:
                self.s3.put_object(
                    Bucket=os.environ.get('AWS_S3_BUCKET_NAME'),
                    Key=prl_solr_document.get_thumbnail_s3_key(),
                    Body=thumbnail_file,
                    ContentType=prl_solr_document.
                    original_thumbnail_metadata()['content-type'])
            logging.debug('%s thumbnail put on S3',
                          prl_solr_document.get_record_identifier())
        except BotoCoreError as e:
            # Format the exception itself: BotoCoreError is not guaranteed
            # to carry a ``msg`` attribute.
            raise IndexerError(
                'Failed to put thumbnail on S3: {}'.format(e)) from e

    def unsave_thumbnail(self, thumbnail_url: str, record_identifier: str):
        """Removes thumbnail from the local filesystem and from S3.

        The S3 key is the path component of ``thumbnail_url`` without its
        leading slash; the local file lives at that key under
        THUMBNAILS_DIRECTORY.

        Raises:
            IndexerError: if either removal fails.
        """
        try:
            thumbnail_s3_key = os.path.relpath(
                urllib.parse.urlparse(
                    urllib.parse.unquote(thumbnail_url)).path, '/')
            filepath = os.path.join(
                os.path.abspath(os.environ.get('THUMBNAILS_DIRECTORY')),
                thumbnail_s3_key)
            os.remove(filepath)
            logging.debug('%s thumbnail removed from local filesystem at %s',
                          record_identifier, filepath)

            # TODO: clean up empty parent directories
            self.s3.delete_object(Bucket=os.environ.get('AWS_S3_BUCKET_NAME'),
                                  Key=thumbnail_s3_key)
            logging.debug('%s thumbnail removed from S3', record_identifier)
        except BotoCoreError as e:
            # Format the exception itself: BotoCoreError is not guaranteed
            # to carry a ``msg`` attribute.
            raise IndexerError(
                'Failed to remove thumbnail from S3: {}'.format(e)) from e
        except Exception as e:
            raise IndexerError(
                'Failed to remove thumbnail from local filesystem: {}'.format(
                    e)) from e
def search_results(request):
    """Run a faceted Solr search for the ``q`` query parameter.

    Two Solr requests are made: one without any ``fq`` filter, to collect
    the complete facet values, and one with the ``fq`` filters joined by
    ``AND`` to get the actual results.

    Returns an ``HTTPFound`` pointing at the landing page when ``q`` is
    missing. Otherwise returns a dict with the results, the query, the
    facet fields and the active facets; for routes whose name does not end
    in ``json`` the dict also carries the template helpers and the
    trusted-IP flag.
    """
    settings = request.registry.settings
    conn = Solr(settings['solr_base_url'], decoder=decoder)

    params = request.GET.copy()
    q = params.pop('q', None)
    if q is None:
        return HTTPFound('http://2012.haip.cc/')

    facet_options = {
        'facet': 'true',
        'facet.limit': 20,
        'facet.mincount': 1,
        'facet.sort': 'count',
        'facet.field': ['language', 'author_exact', 'year'],
        'fl': '*',
    }
    params.update(facet_options)

    # TODO: get cover data, description from
    # https://developers.google.com/books/docs/v1/reference/volumes
    # TODO: refactor logic from template to view
    # TODO: tests

    # Query once with every filter stripped so that all facet values are
    # reported, not only those matching the current selection.
    unfiltered_params = params.copy()
    if 'fq' in unfiltered_params:
        del unfiltered_params['fq']
    unfiltered = conn.search(q, **unfiltered_params)
    facet_fields = unfiltered.facets['facet_fields']

    # Keyword arguments cannot carry a multidict's repeated keys, so fold
    # all filter queries into a single expression.
    if 'fq' in params:
        all_filters = params.getall('fq')
        params['fq'] = ' AND '.join(all_filters)

    log.debug(params)
    results = conn.search(q, **params)
    log.debug(results)

    allowed_networks = settings['allowed_networks'].split(',')
    mapped_prefix = '::ffff:'
    client_addr = request.client_addr
    # Strip the prefix of an address starting with '::ffff:' before the
    # range check.
    ip = (client_addr[len(mapped_prefix):]
          if client_addr.startswith(mapped_prefix) else client_addr)
    is_trusted_ip = ip in iptools.IpRangeList(*allowed_networks)

    out = {
        'results': list(results),
        'q': q,
        'facet_fields': facet_fields,
        'facets': params.get('fq', []),
    }
    if not request.matched_route.name.endswith('json'):
        # Non-JSON routes additionally receive the template helpers.
        out.update({
            'with_facet': with_facet,
            'without_facet': without_facet,
            'format_byte_size': format_byte_size,
            'format_facet': format_facet,
            'is_trusted_ip': is_trusted_ip,
        })
    return out
class DocManager:
    """The DocManager class creates a connection to the backend engine and
    adds/removes documents, and in the case of rollback, searches for them.

    The reason for storing id/doc pairs as opposed to doc's is so that
    multiple updates to the same doc reflect the most up to date version as
    opposed to multiple, slightly different versions of a doc.
    """

    def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
                 unique_key="_id", **kwargs):
        """Verify Solr URL and establish a connection.

        ``auto_commit_interval`` is multiplied by 1000 before being stored
        (pysolr works in milliseconds); ``None`` disables commitWithin.
        ``unique_key`` names the field that receives the value of ``_id``.
        Extra keyword arguments are accepted and ignored.
        """
        self.solr = Solr(url)
        self.unique_key = unique_key
        # pysolr does things in milliseconds
        if auto_commit_interval is not None:
            self.auto_commit_interval = auto_commit_interval * 1000
        else:
            self.auto_commit_interval = None
        self.field_list = []
        self._build_fields()

    def _parse_fields(self, result, field_name):
        """If Schema access, parse fields and build respective lists.

        Returns the keys of ``result["schema"][field_name]``, or an empty
        list when either level is missing.
        """
        fields = result.get("schema", {}).get(field_name, {})
        # Mapping keys are already unique, so no de-duplication is needed.
        return list(fields.keys())

    def _build_fields(self):
        """Builds a list of valid fields, plus one compiled regular
        expression per dynamic field pattern."""
        declared_fields = self.solr._send_request("get", ADMIN_URL)
        result = decoder.decode(declared_fields)
        self.field_list = self._parse_fields(result, "fields")

        # Build regular expressions to match dynamic fields.
        # dynamic field names may have exactly one wildcard, either at
        # the beginning or the end of the name. The wildcard becomes \w*;
        # the rest of the name is escaped so it is matched literally.
        self._dynamic_field_regexes = []
        for wc_pattern in self._parse_fields(result, "dynamicFields"):
            if wc_pattern.startswith("*"):
                self._dynamic_field_regexes.append(
                    re.compile(r"\w*%s\Z" % re.escape(wc_pattern[1:])))
            elif wc_pattern.endswith("*"):
                self._dynamic_field_regexes.append(
                    re.compile(r"\A%s\w*" % re.escape(wc_pattern[:-1])))

    def _clean_doc(self, doc):
        """Reformats the given document before insertion into Solr.

        This method reformats the document in the following ways:
          - removes extraneous fields that aren't defined in schema.xml
          - unwinds arrays in order to find and later flatten sub-documents
          - flattens the document so that there are no sub-documents, and
            every value is associated with its dot-separated path of keys

        An example:
          {"a": 2,
           "b": {
             "c": {
               "d": 5
             }
           },
           "e": [6, 7, 8]
          }

        becomes:
          {"a": 2, "b.c.d": 5, "e.0": 6, "e.1": 7, "e.2": 8}

        Note that ``doc`` itself is modified: the value of ``_id`` is copied
        to the ``unique_key`` field before flattening.
        """

        # SOLR cannot index fields within sub-documents, so flatten documents
        # with the dot-separated path to each value as the respective key
        def flattened(doc):
            def flattened_kernel(doc, path):
                # ``path`` is shared across the recursion: push a key before
                # descending and pop it afterwards.
                for k, v in doc.items():
                    path.append(k)
                    if isinstance(v, dict):
                        for inner_k, inner_v in flattened_kernel(v, path):
                            yield inner_k, inner_v
                    elif isinstance(v, list):
                        for li, lv in enumerate(v):
                            path.append(str(li))
                            if isinstance(lv, dict):
                                for dk, dv in flattened_kernel(lv, path):
                                    yield dk, dv
                            else:
                                yield ".".join(path), lv
                            path.pop()
                    else:
                        yield ".".join(path), v
                    path.pop()
            return dict(flattened_kernel(doc, []))

        # Translate the _id field to whatever unique key we're using
        doc[self.unique_key] = doc["_id"]
        flat_doc = flattened(doc)

        # Only include fields that are explicitly provided in the
        # schema or match one of the dynamic field patterns, if
        # we were able to retrieve the schema
        if len(self.field_list) + len(self._dynamic_field_regexes) > 0:
            def include_field(field):
                return field in self.field_list or any(
                    regex.match(field)
                    for regex in self._dynamic_field_regexes)
            return dict((k, v) for k, v in flat_doc.items()
                        if include_field(k))
        return flat_doc

    def stop(self):
        """ Stops the instance
        """
        pass

    def upsert(self, doc):
        """Update or insert a document into Solr

        This method should call whatever add/insert/update method exists for
        the backend engine and add the document in there. The input will
        always be one mongo document, represented as a Python dictionary.

        Raises errors.OperationFailed if Solr rejects the document.
        """
        try:
            if self.auto_commit_interval is not None:
                # An interval of 0 means commit immediately.
                self.solr.add(
                    [self._clean_doc(doc)],
                    commit=(self.auto_commit_interval == 0),
                    commitWithin=str(self.auto_commit_interval),
                )
            else:
                self.solr.add([self._clean_doc(doc)], commit=False)
        except SolrError:
            raise errors.OperationFailed(
                "Could not insert %r into Solr" % bsjson.dumps(doc))

    def bulk_upsert(self, docs):
        """Update or insert multiple documents into Solr

        docs may be any iterable

        Raises errors.OperationFailed if Solr rejects the documents.
        """
        try:
            cleaned = (self._clean_doc(d) for d in docs)
            if self.auto_commit_interval is not None:
                self.solr.add(
                    cleaned,
                    commit=(self.auto_commit_interval == 0),
                    commitWithin=str(self.auto_commit_interval)
                )
            else:
                self.solr.add(cleaned, commit=False)
        except SolrError:
            raise errors.OperationFailed(
                "Could not bulk-insert documents into Solr")

    def remove(self, doc):
        """Removes documents from Solr

        The input is a python dictionary that represents a mongo document.
        The document is looked up by the value of its ``unique_key`` field.
        """
        self.solr.delete(id=str(doc[self.unique_key]),
                         commit=(self.auto_commit_interval == 0))

    def _remove(self):
        """Removes everything
        """
        self.solr.delete(q="*:*", commit=(self.auto_commit_interval == 0))

    def search(self, start_ts, end_ts):
        """Called to query Solr for documents in a time range.
        """
        query = "_ts: [%s TO %s]" % (start_ts, end_ts)
        return self.solr.search(query, rows=100000000)

    def _search(self, query):
        """For test purposes only. Performs search on Solr with given query
            Does not have to be implemented.
        """
        return self.solr.search(query, rows=200)

    def commit(self):
        """This function is used to force a commit.
        """
        retry_until_ok(self.solr.commit)

    def get_last_doc(self):
        """Returns the last document stored in the Solr engine.

        Returns None if the search raises ValueError or finds nothing.
        """
        # search everything, sort by descending timestamp, return 1 row
        try:
            result = self.solr.search("*:*", sort="_ts desc", rows=1)
        except ValueError:
            return None

        if len(result) == 0:
            return None

        return result.docs[0]
class SolrTestCase(unittest.TestCase):
    """Integration tests for the ``Solr`` client.

    Every test talks to the Solr core at ``http://localhost:8983/solr/core0``.
    ``setUp`` wipes the index and loads the five fixture documents stored in
    ``self.docs``; ``tearDown`` wipes the index again.
    """

    def setUp(self):
        super(SolrTestCase, self).setUp()
        self.default_solr = Solr('http://localhost:8983/solr/core0')
        # Short timeouts.
        self.solr = Solr('http://localhost:8983/solr/core0', timeout=2)
        self.docs = [
            {
                'id': 'doc_1',
                'title': 'Example doc 1',
                'price': 12.59,
                'popularity': 10,
            },
            {
                'id': 'doc_2',
                'title': 'Another example ☃ doc 2',
                'price': 13.69,
                'popularity': 7,
            },
            {
                'id': 'doc_3',
                'title': 'Another thing',
                'price': 2.35,
                'popularity': 8,
            },
            {
                'id': 'doc_4',
                'title': 'doc rock',
                'price': 99.99,
                'popularity': 10,
            },
            {
                'id': 'doc_5',
                'title': 'Boring',
                'price': 1.12,
                'popularity': 2,
            },
        ]

        # Clear it.
        self.solr.delete(q='*:*')

        # Index our docs. Yes, this leans on functionality we're going to test
        # later & if it's broken, everything will catastrophically fail.
        # Such is life.
        self.solr.add(self.docs)

    def tearDown(self):
        self.solr.delete(q='*:*')
        super(SolrTestCase, self).tearDown()

    def test_init(self):
        self.assertEqual(self.default_solr.url,
                         'http://localhost:8983/solr/core0')
        self.assertIsInstance(self.default_solr.decoder, json.JSONDecoder)
        self.assertEqual(self.default_solr.timeout, 60)

        self.assertEqual(self.solr.url, 'http://localhost:8983/solr/core0')
        self.assertIsInstance(self.solr.decoder, json.JSONDecoder)
        self.assertEqual(self.solr.timeout, 2)

    def test__create_full_url(self):
        # Nada.
        self.assertEqual(self.solr._create_full_url(path=''),
                         'http://localhost:8983/solr/core0')
        # Basic path.
        self.assertEqual(self.solr._create_full_url(path='pysolr_tests'),
                         'http://localhost:8983/solr/core0/pysolr_tests')
        # Leading slash (& making sure we don't touch the trailing slash).
        self.assertEqual(
            self.solr._create_full_url(
                path='/pysolr_tests/select/?whatever=/'),
            'http://localhost:8983/solr/core0/pysolr_tests/select/?whatever=/')

    def test__send_request(self):
        # Test a valid request.
        resp_body = self.solr._send_request('GET', 'select/?q=doc&wt=json')
        self.assertIn('"numFound":3', resp_body)

        # Test a POST with a (non-ASCII) body.
        xml_body = '<add><doc><field name="id">doc_12</field><field name="title">Whee! ☃</field></doc></add>'
        resp_body = self.solr._send_request(
            'POST',
            'update/?commit=true',
            body=xml_body,
            headers={
                'Content-type': 'text/xml; charset=utf-8',
            })
        self.assertIn('<int name="status">0</int>', resp_body)

        # Test a non-existent URL (using a lowercase method name).
        old_url = self.solr.url
        self.solr.url = 'http://127.0.0.1:567898/wahtever'
        try:
            self.assertRaises(SolrError, self.solr._send_request, 'get',
                              'select/?q=doc&wt=json')
        finally:
            # Restore the URL even when the assertion fails, so the failure
            # reported is the assertion itself and tearDown can still reach
            # the real core.
            self.solr.url = old_url

    def test__select(self):
        # Short params.
        resp_body = self.solr._select({'q': 'doc'})
        resp_data = json.loads(resp_body)
        self.assertEqual(resp_data['response']['numFound'], 3)

        # Long params.
        resp_body = self.solr._select({'q': 'doc' * 1024})
        resp_data = json.loads(resp_body)
        self.assertEqual(resp_data['response']['numFound'], 0)
        self.assertEqual(len(resp_data['responseHeader']['params']['q']),
                         3 * 1024)

    def test__mlt(self):
        resp_body = self.solr._mlt({'q': 'id:doc_1', 'mlt.fl': 'title'})
        resp_data = json.loads(resp_body)
        self.assertEqual(resp_data['response']['numFound'], 0)

    def test__suggest_terms(self):
        # NOTE(review): despite the test name this goes through ``_select``,
        # not ``_suggest_terms`` - confirm whether that is intended.
        resp_body = self.solr._select({'terms.fl': 'title'})
        resp_data = json.loads(resp_body)
        self.assertEqual(resp_data['response']['numFound'], 0)

    def test__update(self):
        xml_body = '<add><doc><field name="id">doc_12</field><field name="title">Whee!</field></doc></add>'
        resp_body = self.solr._update(xml_body)
        self.assertIn('<int name="status">0</int>', resp_body)

    def test__soft_commit(self):
        xml_body = '<add><doc><field name="id">doc_12</field><field name="title">Whee!</field></doc></add>'
        resp_body = self.solr._update(xml_body, softCommit=True)
        self.assertIn('<int name="status">0</int>', resp_body)

    def test__extract_error(self):
        class RubbishResponse(object):
            """Minimal stand-in exposing ``content``, ``headers`` and ``json()``."""

            def __init__(self, content, headers=None):
                if isinstance(content, bytes):
                    content = content.decode('utf-8')
                self.content = content
                self.headers = headers
                if self.headers is None:
                    self.headers = {}

            def json(self):
                return json.loads(self.content)

        # Just the reason.
        resp_1 = RubbishResponse("We don't care.",
                                 {'reason': 'Something went wrong.'})
        self.assertEqual(self.solr._extract_error(resp_1),
                         "[Reason: Something went wrong.]")

        # Empty reason.
        resp_2 = RubbishResponse("We don't care.", {'reason': None})
        self.assertEqual(self.solr._extract_error(resp_2),
                         "[Reason: None]\nWe don't care.")

        # No reason. Time to scrape.
        resp_3 = RubbishResponse(
            '<html><body><pre>Something is broke.</pre></body></html>',
            {'server': 'jetty'})
        self.assertEqual(self.solr._extract_error(resp_3),
                         "[Reason: Something is broke.]")

        # No reason. JSON response.
        resp_4 = RubbishResponse(b'\n {"error": {"msg": "It happens"}}',
                                 {'server': 'tomcat'})
        self.assertEqual(self.solr._extract_error(resp_4),
                         "[Reason: It happens]")

        # No reason. Weird JSON response.
        resp_5 = RubbishResponse(b'{"kinda": "weird"}', {'server': 'jetty'})
        self.assertEqual(self.solr._extract_error(resp_5),
                         '[Reason: None]\n{"kinda": "weird"}')

    def test__scrape_response(self):
        # Jetty.
        resp_1 = self.solr._scrape_response(
            {'server': 'jetty'},
            '<html><body><pre>Something is broke.</pre></body></html>')
        self.assertEqual(resp_1, ('Something is broke.', u''))

        # Other.
        resp_2 = self.solr._scrape_response(
            {'server': 'crapzilla'},
            '<html><head><title>Wow. Seriously weird.</title></head><body><pre>Something is broke.</pre></body></html>'
        )
        self.assertEqual(resp_2, ('Wow. Seriously weird.', u''))

    @unittest.skipUnless(HAS_LXML,
                         "Cannot test Tomcat error extraction without lxml")
    def test__scrape_response_tomcat(self):
        """Tests for Tomcat error responses, which currently require lxml.html to parse"""
        # Tomcat.
        resp_1 = self.solr._scrape_response(
            {'server': 'coyote'},
            '<html><body><p><span>Error message</span><span>messed up.</span></p></body></html>'
        )
        self.assertEqual(resp_1, ('messed up.', ''))

        # Broken Tomcat.
        resp_2 = self.solr._scrape_response(
            {'server': 'coyote'},
            '<html><body><p>Really broken. Scraping Java-generated HTML sucks.</pre></body></html>'
        )
        self.assertEqual(resp_2, (
            None,
            u'<div><body><p>Really broken. Scraping Java-generated HTML sucks.</p></body></div>'
        ))

    def test__from_python(self):
        self.assertEqual(self.solr._from_python(datetime.date(2013, 1, 18)),
                         '2013-01-18T00:00:00Z')
        self.assertEqual(
            self.solr._from_python(datetime.datetime(2013, 1, 18, 0, 30, 28)),
            '2013-01-18T00:30:28Z')
        self.assertEqual(self.solr._from_python(True), 'true')
        self.assertEqual(self.solr._from_python(False), 'false')
        self.assertEqual(self.solr._from_python(1), '1')
        self.assertEqual(self.solr._from_python(1.2), '1.2')
        self.assertEqual(self.solr._from_python(b'hello'), 'hello')
        self.assertEqual(self.solr._from_python('hello ☃'), 'hello ☃')
        self.assertEqual(self.solr._from_python('\x01test\x02'), 'test')

    def test__to_python(self):
        self.assertEqual(self.solr._to_python('2013-01-18T00:00:00Z'),
                         datetime.datetime(2013, 1, 18))
        self.assertEqual(self.solr._to_python('2013-01-18T00:30:28Z'),
                         datetime.datetime(2013, 1, 18, 0, 30, 28))
        self.assertEqual(self.solr._to_python('true'), True)
        self.assertEqual(self.solr._to_python('false'), False)
        self.assertEqual(self.solr._to_python(1), 1)
        self.assertEqual(self.solr._to_python(1.2), 1.2)
        self.assertEqual(self.solr._to_python(b'hello'), 'hello')
        self.assertEqual(self.solr._to_python('hello ☃'), 'hello ☃')
        self.assertEqual(self.solr._to_python(['foo', 'bar']), 'foo')
        self.assertEqual(self.solr._to_python(('foo', 'bar')), 'foo')
        self.assertEqual(self.solr._to_python('tuple("foo", "bar")'),
                         'tuple("foo", "bar")')

    def test__is_null_value(self):
        self.assertTrue(self.solr._is_null_value(None))
        self.assertTrue(self.solr._is_null_value(''))

        self.assertFalse(self.solr._is_null_value('Hello'))
        self.assertFalse(self.solr._is_null_value(1))

    def test_search(self):
        results = self.solr.search('doc')
        self.assertEqual(len(results), 3)

        results = self.solr.search('example')
        self.assertEqual(len(results), 2)

        results = self.solr.search('nothing')
        self.assertEqual(len(results), 0)

        # Advanced options.
        results = self.solr.search(
            'doc', **{
                'debug': 'true',
                'hl': 'true',
                'hl.fragsize': 8,
                'facet': 'on',
                'facet.field': 'popularity',
                'spellcheck': 'true',
                'spellcheck.collate': 'true',
                'spellcheck.count': 1,
                # TODO: Can't get these working in my test setup.
                # 'group': 'true',
                # 'group.field': 'id',
            })
        self.assertEqual(len(results), 3)
        self.assertIn('explain', results.debug)
        self.assertEqual(results.highlighting, {
            u'doc_4': {},
            u'doc_2': {},
            u'doc_1': {}
        })
        self.assertEqual(results.spellcheck, {})
        self.assertEqual(results.facets['facet_fields']['popularity'],
                         ['10', 2, '7', 1, '2', 0, '8', 0])
        self.assertIsNotNone(results.qtime)
        # TODO: Can't get these working in my test setup.
        # self.assertEqual(results.grouped, '')

    def test_more_like_this(self):
        results = self.solr.more_like_this('id:doc_1', 'text')
        self.assertEqual(len(results), 0)

    def test_suggest_terms(self):
        results = self.solr.suggest_terms('title', '')
        self.assertEqual(len(results), 1)
        self.assertEqual(
            results, {
                'title': [('doc', 3), ('another', 2), ('example', 2),
                          ('1', 1), ('2', 1), ('boring', 1), ('rock', 1),
                          ('thing', 1)]
            })

    def test__build_doc(self):
        doc = {
            'id': 'doc_1',
            'title': 'Example doc ☃ 1',
            'price': 12.59,
            'popularity': 10,
        }
        doc_xml = force_unicode(
            ET.tostring(self.solr._build_doc(doc), encoding='utf-8'))
        self.assertIn('<field name="title">Example doc ☃ 1</field>', doc_xml)
        self.assertIn('<field name="id">doc_1</field>', doc_xml)
        self.assertEqual(len(doc_xml), 152)

    def test_add(self):
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.assertEqual(len(self.solr.search('example')), 2)

        self.solr.add([
            {
                'id': 'doc_6',
                'title': 'Newly added doc',
            },
            {
                'id': 'doc_7',
                'title': 'Another example doc',
            },
        ])

        self.assertEqual(len(self.solr.search('doc')), 5)
        self.assertEqual(len(self.solr.search('example')), 3)

    def test_add_with_boost(self):
        self.assertEqual(len(self.solr.search('doc')), 3)

        self.solr.add([{
            'id': 'doc_6',
            'title': 'Important doc'
        }], boost={'title': 10.0})

        self.solr.add([{
            'id': 'doc_7',
            'title': 'Spam doc doc'
        }], boost={'title': 0})

        res = self.solr.search('doc')
        self.assertEqual(len(res), 5)
        self.assertEqual('doc_6', res.docs[0]['id'])

    def test_field_update(self):
        # Atomic "inc" update on a single-valued numeric field.
        original_docs = self.solr.search('doc')
        self.assertEqual(len(original_docs), 3)
        update_list = [{'id': doc['id'], 'popularity': 5}
                       for doc in original_docs]
        self.solr.add(update_list, fieldUpdates={'popularity': 'inc'})

        updated_docs = self.solr.search('doc')
        self.assertEqual(len(updated_docs), 3)
        for original_doc, updated_doc in zip(original_docs, updated_docs):
            self.assertEqual(len(updated_doc.keys()),
                             len(original_doc.keys()))
            self.assertEqual(updated_doc['popularity'],
                             original_doc['popularity'] + 5)
            # Every other field (bar Solr's own version stamp) is untouched.
            self.assertTrue(
                all(updated_doc[k] == original_doc[k]
                    for k in updated_doc.keys()
                    if k not in ['_version_', 'popularity']))

        # Atomic "add" update on a multi-valued field.
        self.solr.add([
            {
                'id': 'multivalued_1',
                'title': 'Multivalued doc 1',
                'word_ss': ['alpha', 'beta'],
            },
            {
                'id': 'multivalued_2',
                'title': 'Multivalued doc 2',
                'word_ss': ['charlie', 'delta'],
            },
        ])

        original_docs = self.solr.search('multivalued')
        self.assertEqual(len(original_docs), 2)
        update_list = [{'id': doc['id'], 'word_ss': ['epsilon', 'gamma']}
                       for doc in original_docs]
        self.solr.add(update_list, fieldUpdates={'word_ss': 'add'})

        updated_docs = self.solr.search('multivalued')
        self.assertEqual(len(updated_docs), 2)
        for original_doc, updated_doc in zip(original_docs, updated_docs):
            self.assertEqual(len(updated_doc.keys()),
                             len(original_doc.keys()))
            self.assertEqual(updated_doc['word_ss'],
                             original_doc['word_ss'] + ['epsilon', 'gamma'])
            self.assertTrue(
                all(updated_doc[k] == original_doc[k]
                    for k in updated_doc.keys()
                    if k not in ['_version_', 'word_ss']))

    def test_delete(self):
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.solr.delete(id='doc_1')
        self.assertEqual(len(self.solr.search('doc')), 2)
        self.solr.delete(q='price:[0 TO 15]')
        self.assertEqual(len(self.solr.search('doc')), 1)

        self.assertEqual(len(self.solr.search('*:*')), 1)
        self.solr.delete(q='*:*')
        self.assertEqual(len(self.solr.search('*:*')), 0)

        # Need at least one.
        self.assertRaises(ValueError, self.solr.delete)
        # Can't have both.
        self.assertRaises(ValueError, self.solr.delete, id='foo', q='bar')

    def test_commit(self):
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.solr.add([{
            'id': 'doc_6',
            'title': 'Newly added doc',
        }], commit=False)
        # Not visible until an explicit commit.
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.solr.commit()
        self.assertEqual(len(self.solr.search('doc')), 4)

    def test_optimize(self):
        # Make sure it doesn't blow up. Side effects are hard to measure. :/
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.solr.add([{
            'id': 'doc_6',
            'title': 'Newly added doc',
        }], commit=False)
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.solr.optimize()
        self.assertEqual(len(self.solr.search('doc')), 4)

    def test_extract(self):
        fake_f = StringIO("""
            <html>
                <head>
                    <meta charset="utf-8">
                    <meta name="haystack-test" content="test 1234">
                    <title>Test Title ☃☃</title>
                </head>
                    <body>foobar</body>
            </html>
        """)
        fake_f.name = "test.html"
        extracted = self.solr.extract(fake_f)

        # Verify documented response structure:
        self.assertIn('contents', extracted)
        self.assertIn('metadata', extracted)

        self.assertIn('foobar', extracted['contents'])

        m = extracted['metadata']

        self.assertEqual([fake_f.name], m['stream_name'])

        self.assertIn('haystack-test', m,
                      "HTML metadata should have been extracted!")
        self.assertEqual(['test 1234'], m['haystack-test'])

        # Note the underhanded use of a double snowman to verify both that Tika
        # correctly decoded entities and that our UTF-8 characters survived the
        # round-trip:
        self.assertEqual(['Test Title ☃☃'], m['title'])

    def test_full_url(self):
        self.solr.url = 'http://localhost:8983/solr/core0'
        full_url = self.solr._create_full_url(path='/update')

        # Make sure trailing and leading slashes do not collide:
        self.assertEqual(full_url, 'http://localhost:8983/solr/core0/update')
class SearchEngine(BaseSearchEngine):
    """Search engine implementation that indexes and queries through Solr.

    Each indexed object becomes one Solr document carrying ``id``,
    ``django_ct_s`` ("app_label.module_name"), ``django_id_s`` (the primary
    key), ``text`` (the flattened object) and whatever extra fields the
    indexer yields.
    """

    def __init__(self):
        self.conn = Solr(settings.SOLR_URL)

    def _models_query(self, models):
        """Return a Solr query matching documents of any of ``models``."""
        def qt(model):
            return 'django_ct_s:"%s.%s"' % (model._meta.app_label,
                                            model._meta.module_name)
        return ' OR '.join([qt(model) for model in models])

    def update(self, indexer, iterable, commit=True):
        """Build a Solr document for each object in ``iterable`` and add them.

        A ``UnicodeDecodeError`` raised while preparing an object stops the
        preparation of the remaining objects; the documents built up to that
        point are still sent to Solr.
        """
        docs = []
        try:
            for obj in iterable:
                doc = {
                    'id': self.get_identifier(obj),
                    'django_ct_s': "%s.%s" % (obj._meta.app_label,
                                              obj._meta.module_name),
                    'django_id_s': force_unicode(obj.pk),
                    'text': indexer.flatten(obj),
                }
                for name, value in indexer.get_indexed_fields(obj):
                    doc[name] = value
                docs.append(doc)
        except UnicodeDecodeError:
            # Parenthesised form prints the same text on Python 2 and is
            # valid syntax on Python 3 (the bare statement form is not).
            print("Chunk failed.")
        self.conn.add(docs, commit=commit)

    def remove(self, obj, commit=True):
        """Delete the Solr document that corresponds to ``obj``."""
        solr_id = self.get_identifier(obj)
        self.conn.delete(id=solr_id, commit=commit)

    def clear(self, models, commit=True):
        """Delete the documents of ``models`` from the index.

        When ``models`` is empty or ``None`` the whole index is cleared.
        """
        if models:
            # Restrict the delete to the requested models rather than wiping
            # documents that belong to other models as well.
            query = self._models_query(models)
        else:
            # *:* matches all docs in Solr
            query = '*:*'
        self.conn.delete(q=query, commit=commit)

    def _result_callback(self, result):
        """Turn a raw Solr doc into ``(app_label, model_name, pk, None)``."""
        app_label, model_name = result['django_ct_s'].split('.')
        return (app_label, model_name, result['django_id_s'], None)

    def search(self, q, models=None, order_by=RELEVANCE, limit=None,
               offset=None):
        """Run query ``q`` against Solr and wrap the hits in ``SearchResults``.

        ``models`` optionally restricts the hits to those model classes.
        ``order_by`` is a field name, prefixed with ``-`` for descending
        order; ``RELEVANCE`` leaves Solr's default ordering in place.
        ``limit`` and ``offset`` map to Solr's ``rows`` and ``start``.
        """
        if not q:
            # Nothing to search for (empty or missing query).
            return SearchResults(q, [], 0, lambda x: x)

        original_query = q
        q = convert_query(original_query, SolrQueryConverter)

        if models is not None:
            models_clause = self._models_query(models)
            final_q = '(%s) AND (%s)' % (q, models_clause)
        else:
            final_q = q

        kwargs = {}
        if order_by != RELEVANCE:
            if order_by[0] == '-':
                kwargs['sort'] = '%s desc' % order_by[1:]
            else:
                kwargs['sort'] = '%s asc' % order_by

        if limit is not None:
            kwargs['rows'] = limit
        if offset is not None:
            kwargs['start'] = offset

        results = self.conn.search(final_q, **kwargs)
        return SearchResults(final_q, iter(results.docs), results.hits,
                             self._result_callback)
class SolrSearchBackend(BaseSearchBackend):
    """Haystack search backend that talks to Solr through ``pysolr``."""

    # Word reserved by Solr for special use.
    RESERVED_WORDS = (
        'AND',
        'NOT',
        'OR',
        'TO',
    )

    # Characters reserved by Solr for special use.
    # The '\\' must come first, so as not to overwrite the other slash replacements.
    RESERVED_CHARACTERS = (
        '\\', '+', '-', '&&', '||', '!', '(', ')', '{', '}',
        '[', ']', '^', '"', '~', '*', '?', ':', '/',
    )

    def __init__(self, connection_alias, **connection_options):
        """Open the Solr connection described by ``connection_options``.

        Raises ``ImproperlyConfigured`` when no ``'URL'`` option is given.
        Extra keyword arguments for ``Solr`` may be supplied under ``'KWARGS'``.
        """
        super(SolrSearchBackend, self).__init__(connection_alias, **connection_options)

        if 'URL' not in connection_options:
            raise ImproperlyConfigured("You must specify a 'URL' in your settings for connection '%s'." % connection_alias)

        self.conn = Solr(connection_options['URL'], timeout=self.timeout,
                         **connection_options.get('KWARGS', {}))
        self.log = logging.getLogger('haystack')

    def update(self, index, iterable, commit=True):
        """Prepare every object in ``iterable`` with ``index`` and add it to Solr.

        Objects raising ``SkipDocument`` are skipped. ``UnicodeDecodeError``
        and Solr/IO failures are re-raised unless ``silently_fail`` is set, in
        which case they are logged.
        """
        docs = []

        for obj in iterable:
            try:
                docs.append(index.full_prepare(obj))
            except SkipDocument:
                self.log.debug(u"Indexing for object `%s` skipped", obj)
            except UnicodeDecodeError:
                if not self.silently_fail:
                    raise

                # We'll log the object identifier but won't include the actual object
                # to avoid the possibility of that generating encoding errors while
                # processing the log message:
                self.log.error(u"UnicodeDecodeError while preparing object for update", exc_info=True,
                               extra={"data": {"index": index,
                                               "object": get_identifier(obj)}})

        if docs:
            try:
                self.conn.add(docs, commit=commit, boost=index.get_field_weights())
            except (IOError, SolrError) as e:
                if not self.silently_fail:
                    raise

                self.log.error("Failed to add documents to Solr: %s", e, exc_info=True)

    def remove(self, obj_or_string, commit=True):
        """Delete the document identified by ``obj_or_string`` from Solr."""
        solr_id = get_identifier(obj_or_string)

        try:
            kwargs = {
                'commit': commit,
                'id': solr_id
            }
            self.conn.delete(**kwargs)
        except (IOError, SolrError) as e:
            if not self.silently_fail:
                raise

            self.log.error("Failed to remove document '%s' from Solr: %s", solr_id, e, exc_info=True)

    def clear(self, models=None, commit=True):
        """Delete the documents of ``models`` (or everything when ``None``).

        ``models`` must be a list or tuple when given. When ``commit`` is
        true an optimize is run after the delete.
        """
        if models is not None:
            assert isinstance(models, (list, tuple))

        try:
            if models is None:
                # *:* matches all docs in Solr
                self.conn.delete(q='*:*', commit=commit)
            else:
                models_to_delete = []

                for model in models:
                    models_to_delete.append("%s:%s" % (DJANGO_CT, get_model_ct(model)))

                self.conn.delete(q=" OR ".join(models_to_delete), commit=commit)

            if commit:
                # Run an optimize post-clear. http://wiki.apache.org/solr/FAQ#head-9aafb5d8dff5308e8ea4fcf4b71f19f029c4bb99
                self.conn.optimize()
        except (IOError, SolrError) as e:
            if not self.silently_fail:
                raise

            if models is not None:
                self.log.error("Failed to clear Solr index of models '%s': %s", ','.join(models_to_delete), e,
                               exc_info=True)
            else:
                self.log.error("Failed to clear Solr index: %s", e, exc_info=True)

    @log_query
    def search(self, query_string, **kwargs):
        """Run ``query_string`` against Solr and return the processed results.

        An empty query short-circuits to an empty result. Solr/IO errors are
        re-raised unless ``silently_fail`` is set, in which case they are
        logged and an empty result set is processed instead.
        """
        if len(query_string) == 0:
            return {
                'results': [],
                'hits': 0,
            }

        search_kwargs = self.build_search_kwargs(query_string, **kwargs)

        try:
            raw_results = self.conn.search(query_string, **search_kwargs)
        except (IOError, SolrError) as e:
            if not self.silently_fail:
                raise

            self.log.error("Failed to query Solr using '%s': %s", query_string, e, exc_info=True)
            raw_results = EmptyResults()

        return self._process_results(raw_results,
                                     highlight=kwargs.get('highlight'),
                                     result_class=kwargs.get('result_class', SearchResult),
                                     distance_point=kwargs.get('distance_point'))

    def build_search_kwargs(self, query_string, sort_by=None, start_offset=0, end_offset=None,
                            fields='', highlight=False, facets=None,
                            date_facets=None, query_facets=None,
                            narrow_queries=None, spelling_query=None,
                            within=None, dwithin=None, distance_point=None,
                            models=None, limit_to_registered_models=None,
                            result_class=None, stats=None, **extra_kwargs):
        """Translate Haystack search options into a dict of Solr parameters.

        The returned dict is meant to be passed as keyword arguments to
        ``self.conn.search``. Entries in ``extra_kwargs`` are applied last and
        override anything computed here. The ``narrow_queries`` set passed in
        by the caller is not modified.
        """
        kwargs = {'fl': '* score'}

        if fields:
            if isinstance(fields, (list, set)):
                fields = " ".join(fields)

            kwargs['fl'] = fields

        if sort_by is not None:
            if sort_by in ['distance asc', 'distance desc'] and distance_point:
                # Do the geo-enabled sort.
                lng, lat = distance_point['point'].get_coords()
                kwargs['sfield'] = distance_point['field']
                kwargs['pt'] = '%s,%s' % (lat, lng)

                if sort_by == 'distance asc':
                    kwargs['sort'] = 'geodist() asc'
                else:
                    kwargs['sort'] = 'geodist() desc'
            else:
                if sort_by.startswith('distance '):
                    warnings.warn("In order to sort by distance, you must call the '.distance(...)' method.")

                # Regular sorting.
                kwargs['sort'] = sort_by

        if start_offset is not None:
            kwargs['start'] = start_offset

        if end_offset is not None:
            kwargs['rows'] = end_offset - start_offset

        if highlight:
            # `highlight` can either be True or a dictionary containing custom parameters
            # which will be passed to the backend and may override our default settings:
            kwargs['hl'] = 'true'
            kwargs['hl.fragsize'] = '200'

            if isinstance(highlight, dict):
                # autoprefix highlighter options with 'hl.', all of them start with it anyway
                # this makes option dicts shorter: {'maxAnalyzedChars': 42}
                # and lets some of options be used as keyword arguments: `.highlight(preserveMulti=False)`
                kwargs.update({
                    key if key.startswith("hl.") else ('hl.' + key): value
                    for key, value in highlight.items()
                })

        if self.include_spelling is True:
            kwargs['spellcheck'] = 'true'
            kwargs['spellcheck.collate'] = 'true'
            kwargs['spellcheck.count'] = 1

            if spelling_query:
                kwargs['spellcheck.q'] = spelling_query

        if facets is not None:
            kwargs['facet'] = 'on'
            kwargs['facet.field'] = list(facets.keys())

            for facet_field, options in facets.items():
                for key, value in options.items():
                    kwargs['f.%s.facet.%s' % (facet_field, key)] = self.conn._from_python(value)

        if date_facets is not None:
            kwargs['facet'] = 'on'
            kwargs['facet.date'] = list(date_facets.keys())
            kwargs['facet.date.other'] = 'none'

            for key, value in date_facets.items():
                kwargs["f.%s.facet.date.start" % key] = self.conn._from_python(value.get('start_date'))
                kwargs["f.%s.facet.date.end" % key] = self.conn._from_python(value.get('end_date'))
                gap_by_string = value.get('gap_by').upper()
                gap_string = "%d%s" % (value.get('gap_amount'), gap_by_string)

                # Pluralise the unit for any amount other than one
                # (e.g. "2MONTHS").
                if value.get('gap_amount') != 1:
                    gap_string += "S"

                kwargs["f.%s.facet.date.gap" % key] = '+%s/%s' % (gap_string, gap_by_string)

        if query_facets is not None:
            kwargs['facet'] = 'on'
            kwargs['facet.query'] = ["%s:%s" % (field, value) for field, value in query_facets]

        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

        if models and len(models):
            model_choices = sorted(get_model_ct(model) for model in models)
        elif limit_to_registered_models:
            # Using narrow queries, limit the results to only models handled
            # with the current routers.
            model_choices = self.build_models_list()
        else:
            model_choices = []

        if len(model_choices) > 0:
            # Work on a copy so the set owned by the caller is left untouched.
            narrow_queries = set(narrow_queries) if narrow_queries is not None else set()
            narrow_queries.add('%s:(%s)' % (DJANGO_CT, ' OR '.join(model_choices)))

        if narrow_queries is not None:
            kwargs['fq'] = list(narrow_queries)

        if stats:
            kwargs['stats'] = "true"
            # Send every requested stats field and every facet for it;
            # assigning one scalar per loop pass would keep only the last.
            kwargs['stats.field'] = list(stats.keys())

            for stats_field, stats_facets in stats.items():
                stats_facets = list(stats_facets)

                if stats_facets:
                    kwargs['f.%s.stats.facet' % stats_field] = stats_facets

        if within is not None:
            from haystack.utils.geo import generate_bounding_box

            kwargs.setdefault('fq', [])
            ((min_lat, min_lng), (max_lat, max_lng)) = generate_bounding_box(within['point_1'], within['point_2'])
            # Bounding boxes are min, min TO max, max. Solr's wiki was *NOT*
            # very clear on this.
            bbox = '%s:[%s,%s TO %s,%s]' % (within['field'], min_lat, min_lng, max_lat, max_lng)
            kwargs['fq'].append(bbox)

        if dwithin is not None:
            kwargs.setdefault('fq', [])
            lng, lat = dwithin['point'].get_coords()
            geofilt = '{!geofilt pt=%s,%s sfield=%s d=%s}' % (lat, lng, dwithin['field'], dwithin['distance'].km)
            kwargs['fq'].append(geofilt)

        # Check to see if the backend should try to include distances
        # (Solr 4.X+) in the results.
        if self.distance_available and distance_point:
            # In early testing, you can't just hand Solr 4.X a proper bounding box
            # & request distances. To enable native distance would take calculating
            # a center point & a radius off the user-provided box, which kinda
            # sucks. We'll avoid it for now, since Solr 4.x's release will be some
            # time yet.
            # kwargs['fl'] += ' _dist_:geodist()'
            pass

        if extra_kwargs:
            kwargs.update(extra_kwargs)

        return kwargs

    def more_like_this(self, model_instance, additional_query_string=None,
                       start_offset=0, end_offset=None, models=None,
                       limit_to_registered_models=None, result_class=None, **kwargs):
        """Return processed "More Like This" results for ``model_instance``.

        The similarity field is the content field of the instance's index.
        Results are narrowed to ``models`` (or the registered models) and,
        when given, to ``additional_query_string``.
        """
        from haystack import connections

        # Deferred models will have a different class ("RealClass_Deferred_fieldname")
        # which won't be in our registry:
        model_klass = model_instance._meta.concrete_model

        index = connections[self.connection_alias].get_unified_index().get_index(model_klass)
        field_name = index.get_content_field()
        params = {
            'fl': '*,score',
        }

        if start_offset is not None:
            params['start'] = start_offset

        # NOTE(review): unlike build_search_kwargs, ``rows`` is set to
        # ``end_offset`` rather than ``end_offset - start_offset`` - confirm
        # this is intended.
        if end_offset is not None:
            params['rows'] = end_offset

        narrow_queries = set()

        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

        if models and len(models):
            model_choices = sorted(get_model_ct(model) for model in models)
        elif limit_to_registered_models:
            # Using narrow queries, limit the results to only models handled
            # with the current routers.
            model_choices = self.build_models_list()
        else:
            model_choices = []

        if len(model_choices) > 0:
            narrow_queries.add('%s:(%s)' % (DJANGO_CT, ' OR '.join(model_choices)))

        if additional_query_string:
            narrow_queries.add(additional_query_string)

        if narrow_queries:
            params['fq'] = list(narrow_queries)

        query = "%s:%s" % (ID, get_identifier(model_instance))

        try:
            raw_results = self.conn.more_like_this(query, field_name, **params)
        except (IOError, SolrError) as e:
            if not self.silently_fail:
                raise

            self.log.error("Failed to fetch More Like This from Solr for document '%s': %s", query, e, exc_info=True)
            raw_results = EmptyResults()

        return self._process_results(raw_results, result_class=result_class)

    def _process_results(self, raw_results, highlight=False, result_class=None, distance_point=None):
        """Convert a raw pysolr result object into Haystack's result dict.

        Returns a dict with ``results`` (``result_class`` instances),
        ``hits``, ``stats``, ``facets`` and ``spelling_suggestion``. Documents
        whose model is not indexed are dropped and subtracted from ``hits``.
        """
        from haystack import connections
        results = []
        hits = raw_results.hits
        facets = {}
        stats = {}
        spelling_suggestion = None

        if result_class is None:
            result_class = SearchResult

        if hasattr(raw_results, 'stats'):
            stats = raw_results.stats.get('stats_fields', {})

        if hasattr(raw_results, 'facets'):
            facets = {
                'fields': raw_results.facets.get('facet_fields', {}),
                'dates': raw_results.facets.get('facet_dates', {}),
                'queries': raw_results.facets.get('facet_queries', {}),
            }

            field_facets = facets['fields']

            for facet_field in field_facets:
                # Convert to a two-tuple, as Solr's json format returns a list of
                # pairs.
                flat_counts = field_facets[facet_field]
                field_facets[facet_field] = list(zip(flat_counts[::2], flat_counts[1::2]))

        if self.include_spelling and hasattr(raw_results, 'spellcheck'):
            # Solr 5+ changed the JSON response format so the suggestions will be key-value mapped rather
            # than simply paired elements in a list, which is a nice improvement but incompatible with
            # Solr 4: https://issues.apache.org/jira/browse/SOLR-3029
            if len(raw_results.spellcheck.get('collations', [])):
                spelling_suggestion = raw_results.spellcheck['collations'][-1]
            elif len(raw_results.spellcheck.get('suggestions', [])):
                spelling_suggestion = raw_results.spellcheck['suggestions'][-1]

            if isinstance(spelling_suggestion, dict):
                # Solr 5+ JSON response format
                spelling_suggestion = spelling_suggestion.get('suggestion', [None])[-1]

                if isinstance(spelling_suggestion, dict):
                    # Solr setting: spellcheck.extendedResults = true
                    spelling_suggestion = spelling_suggestion['word']

            assert spelling_suggestion is None or isinstance(spelling_suggestion, six.string_types)

        unified_index = connections[self.connection_alias].get_unified_index()
        indexed_models = unified_index.get_indexed_models()

        for raw_result in raw_results.docs:
            app_label, model_name = raw_result[DJANGO_CT].split('.')
            additional_fields = {}
            model = haystack_get_model(app_label, model_name)

            if model and model in indexed_models:
                index = unified_index.get_index(model)
                index_field_map = index.field_map

                for key, value in raw_result.items():
                    string_key = str(key)
                    # re-map key if alternate name used
                    if string_key in index_field_map:
                        string_key = index_field_map[string_key]

                    if string_key in index.fields and hasattr(index.fields[string_key], 'convert'):
                        additional_fields[string_key] = index.fields[string_key].convert(value)
                    else:
                        additional_fields[string_key] = self.conn._to_python(value)

                # These are handed to ``result_class`` positionally below.
                del additional_fields[DJANGO_CT]
                del additional_fields[DJANGO_ID]
                del additional_fields['score']

                if raw_result[ID] in getattr(raw_results, 'highlighting', {}):
                    additional_fields['highlighted'] = raw_results.highlighting[raw_result[ID]]

                if distance_point:
                    additional_fields['_point_of_origin'] = distance_point

                    if raw_result.get('__dist__'):
                        from haystack.utils.geo import Distance
                        additional_fields['_distance'] = Distance(km=float(raw_result['__dist__']))
                    else:
                        additional_fields['_distance'] = None

                result = result_class(app_label, model_name, raw_result[DJANGO_ID], raw_result['score'],
                                      **additional_fields)
                results.append(result)
            else:
                hits -= 1

        return {
            'results': results,
            'hits': hits,
            'stats': stats,
            'facets': facets,
            'spelling_suggestion': spelling_suggestion,
        }

    def build_schema(self, fields):
        """Describe ``fields`` as Solr schema field definitions.

        Returns ``(content_field_name, schema_fields)`` where
        ``schema_fields`` is a list of dicts with the keys ``field_name``,
        ``type``, ``indexed``, ``stored`` and ``multi_valued``.
        """
        # Haystack field type -> Solr field type; anything else stays 'text_en'.
        solr_types = {
            'date': 'date',
            'datetime': 'date',
            'integer': 'long',
            'float': 'float',
            'boolean': 'boolean',
            'ngram': 'ngram',
            'edge_ngram': 'edge_ngram',
            'location': 'location',
        }
        content_field_name = ''
        schema_fields = []

        for field_name, field_class in fields.items():
            field_data = {
                'field_name': field_class.index_fieldname,
                'type': 'text_en',
                'indexed': 'true',
                'stored': 'true',
                'multi_valued': 'false',
            }

            if field_class.document is True:
                content_field_name = field_class.index_fieldname

            # DRL_FIXME: Perhaps move to something where, if none of these
            #            checks succeed, call a custom method on the form that
            #            returns, per-backend, the right type of storage?
            field_data['type'] = solr_types.get(field_class.field_type, field_data['type'])

            if field_class.is_multivalued:
                field_data['multi_valued'] = 'true'

            if field_class.stored is False:
                field_data['stored'] = 'false'

            # Do this last to override `text` fields.
            if field_class.indexed is False:
                field_data['indexed'] = 'false'

                # If it's text and not being indexed, we probably don't want
                # to do the normal lowercase/tokenize/stemming/etc. dance.
                if field_data['type'] == 'text_en':
                    field_data['type'] = 'string'

            # If it's a ``FacetField``, make sure we don't postprocess it.
            if hasattr(field_class, 'facet_for'):
                # If it's text, it ought to be a string.
                if field_data['type'] == 'text_en':
                    field_data['type'] = 'string'

            schema_fields.append(field_data)

        return (content_field_name, schema_fields)

    def extract_file_contents(self, file_obj):
        """Extract text and metadata from a structured file (PDF, MS Word, etc.)

        Uses the Solr ExtractingRequestHandler, which is based on Apache Tika.
        See the Solr wiki for details:

            http://wiki.apache.org/solr/ExtractingRequestHandler

        Due to the way the ExtractingRequestHandler is implemented it completely
        replaces the normal Haystack indexing process with several unfortunate
        restrictions: only one file per request, the extracted data is added to
        the index with no ability to modify it, etc. To simplify the process and
        allow for more advanced use we'll run using the extract-only mode to
        return the extracted data without adding it to the index so we can then
        use it within Haystack's normal templating process.

        Returns None if metadata cannot be extracted; otherwise returns a
        dictionary containing at least two keys:

            :contents:
                        Extracted full-text content, if applicable
            :metadata:
                        key:value pairs of text strings
        """

        try:
            return self.conn.extract(file_obj)
        except Exception as e:
            # Deliberately broad: any extraction failure is logged and
            # reported to the caller as ``None``.
            self.log.warning(u"Unable to extract file contents: %s", e,
                             exc_info=True, extra={"data": {"file": file_obj}})
            return None
class SolrSearchBackend(BaseSearchBackend):
    """Haystack search backend that talks to Solr through a pysolr ``Solr`` connection.

    The connection is created from the ``URL`` connection option (required);
    any ``KWARGS`` option is forwarded to the ``Solr`` constructor and
    ``COLLATE_SPELLING`` (default ``True``) sets the default for
    ``spellcheck.collate``.
    """

    # Word reserved by Solr for special use.
    RESERVED_WORDS = ("AND", "NOT", "OR", "TO")

    # Characters reserved by Solr for special use.
    # The '\\' must come first, so as not to overwrite the other slash replacements.
    RESERVED_CHARACTERS = (
        "\\",
        "+",
        "-",
        "&&",
        "||",
        "!",
        "(",
        ")",
        "{",
        "}",
        "[",
        "]",
        "^",
        '"',
        "~",
        "*",
        "?",
        ":",
        "/",
    )

    def __init__(self, connection_alias, **connection_options):
        """Create the Solr connection for ``connection_alias``.

        Raises ``ImproperlyConfigured`` when ``connection_options`` has no
        ``URL`` entry.
        """
        super(SolrSearchBackend, self).__init__(connection_alias, **connection_options)

        if "URL" not in connection_options:
            raise ImproperlyConfigured(
                "You must specify a 'URL' in your settings for connection '%s'."
                % connection_alias)

        self.collate = connection_options.get("COLLATE_SPELLING", True)
        self.conn = Solr(connection_options["URL"],
                         timeout=self.timeout,
                         **connection_options.get("KWARGS", {}))
        self.log = logging.getLogger("haystack")

    def update(self, index, iterable, commit=True):
        """Prepare every object in ``iterable`` with ``index`` and add the documents to Solr.

        Objects raising ``SkipDocument`` are skipped. ``UnicodeDecodeError``
        and Solr/IO failures are re-raised unless ``self.silently_fail`` is
        set, in which case they are logged instead.
        """
        docs = []

        for obj in iterable:
            try:
                docs.append(index.full_prepare(obj))
            except SkipDocument:
                self.log.debug("Indexing for object `%s` skipped", obj)
            except UnicodeDecodeError:
                if not self.silently_fail:
                    raise

                # We'll log the object identifier but won't include the actual object
                # to avoid the possibility of that generating encoding errors while
                # processing the log message:
                self.log.error(
                    "UnicodeDecodeError while preparing object for update",
                    exc_info=True,
                    extra={
                        "data": {
                            "index": index,
                            "object": get_identifier(obj)
                        }
                    },
                )

        if docs:
            try:
                self.conn.add(docs, commit=commit, boost=index.get_field_weights())
            except (IOError, SolrError) as e:
                if not self.silently_fail:
                    raise

                self.log.error("Failed to add documents to Solr: %s", e, exc_info=True)

    def remove(self, obj_or_string, commit=True):
        """Delete the document identified by ``obj_or_string`` from Solr.

        Solr/IO failures are re-raised unless ``self.silently_fail`` is set.
        """
        solr_id = get_identifier(obj_or_string)

        try:
            self.conn.delete(commit=commit, id=solr_id)
        except (IOError, SolrError) as e:
            if not self.silently_fail:
                raise

            self.log.error(
                "Failed to remove document '%s' from Solr: %s",
                solr_id,
                e,
                exc_info=True,
            )

    def clear(self, models=None, commit=True):
        """Delete all documents, or only those belonging to ``models``.

        ``models`` must be ``None`` (clear everything) or a list/tuple of
        model classes. When ``commit`` is true an optimize is issued after
        the delete.
        """
        if models is not None:
            assert isinstance(models, (list, tuple))

        try:
            if models is None:
                # *:* matches all docs in Solr
                self.conn.delete(q="*:*", commit=commit)
            else:
                models_to_delete = [
                    "%s:%s" % (DJANGO_CT, get_model_ct(model)) for model in models
                ]
                self.conn.delete(q=" OR ".join(models_to_delete), commit=commit)

            if commit:
                # Run an optimize post-clear. http://wiki.apache.org/solr/FAQ#head-9aafb5d8dff5308e8ea4fcf4b71f19f029c4bb99
                self.conn.optimize()
        except (IOError, SolrError) as e:
            if not self.silently_fail:
                raise

            if models is not None:
                self.log.error(
                    "Failed to clear Solr index of models '%s': %s",
                    ",".join(models_to_delete),
                    e,
                    exc_info=True,
                )
            else:
                self.log.error("Failed to clear Solr index: %s", e, exc_info=True)

    @log_query
    def search(self, query_string, **kwargs):
        """Run ``query_string`` against Solr and return the processed result dict.

        An empty ``query_string`` short-circuits to an empty result without
        contacting Solr. On Solr/IO failure the error is re-raised unless
        ``self.silently_fail`` is set, in which case empty results are used.
        """
        if len(query_string) == 0:
            return {"results": [], "hits": 0}

        search_kwargs = self.build_search_kwargs(query_string, **kwargs)

        try:
            raw_results = self.conn.search(query_string, **search_kwargs)
        except (IOError, SolrError) as e:
            if not self.silently_fail:
                raise

            self.log.error("Failed to query Solr using '%s': %s", query_string, e, exc_info=True)
            raw_results = EmptyResults()

        return self._process_results(
            raw_results,
            highlight=kwargs.get("highlight"),
            result_class=kwargs.get("result_class", SearchResult),
            distance_point=kwargs.get("distance_point"),
        )

    def build_search_kwargs(self,
                            query_string,
                            sort_by=None,
                            start_offset=0,
                            end_offset=None,
                            fields="",
                            highlight=False,
                            facets=None,
                            date_facets=None,
                            query_facets=None,
                            narrow_queries=None,
                            spelling_query=None,
                            within=None,
                            dwithin=None,
                            distance_point=None,
                            models=None,
                            limit_to_registered_models=None,
                            result_class=None,
                            stats=None,
                            collate=None,
                            **extra_kwargs):
        """Translate Haystack search options into the keyword arguments sent to Solr.

        ``query_string`` and ``result_class`` are accepted for signature
        compatibility but are not read here. Any ``extra_kwargs`` are merged
        last and therefore override the generated parameters.

        Returns the dict of Solr request parameters.
        """
        index = haystack.connections[self.connection_alias].get_unified_index()

        kwargs = {"fl": "* score", "df": index.document_field}

        if fields:
            if isinstance(fields, (list, set)):
                fields = " ".join(fields)

            kwargs["fl"] = fields

        if sort_by is not None:
            if sort_by in ["distance asc", "distance desc"] and distance_point:
                # Do the geo-enabled sort.
                lng, lat = distance_point["point"].coords
                kwargs["sfield"] = distance_point["field"]
                kwargs["pt"] = "%s,%s" % (lat, lng)

                if sort_by == "distance asc":
                    kwargs["sort"] = "geodist() asc"
                else:
                    kwargs["sort"] = "geodist() desc"
            else:
                if sort_by.startswith("distance "):
                    warnings.warn(
                        "In order to sort by distance, you must call the '.distance(...)' method."
                    )

                # Regular sorting.
                kwargs["sort"] = sort_by

        if start_offset is not None:
            kwargs["start"] = start_offset

        if end_offset is not None:
            # ``start_offset`` may legitimately be None (see the check above);
            # treat that as an offset of zero instead of raising TypeError.
            kwargs["rows"] = end_offset - (start_offset or 0)

        if highlight:
            # `highlight` can either be True or a dictionary containing custom parameters
            # which will be passed to the backend and may override our default settings:
            kwargs["hl"] = "true"
            kwargs["hl.fragsize"] = "200"

            if isinstance(highlight, dict):
                # autoprefix highlighter options with 'hl.', all of them start with it anyway
                # this makes option dicts shorter: {'maxAnalyzedChars': 42}
                # and lets some of options be used as keyword arguments: `.highlight(preserveMulti=False)`
                kwargs.update({
                    key if key.startswith("hl.") else ("hl." + key): value
                    for key, value in highlight.items()
                })

        if collate is None:
            collate = self.collate

        if self.include_spelling is True:
            kwargs["spellcheck"] = "true"
            kwargs["spellcheck.collate"] = str(collate).lower()
            kwargs["spellcheck.count"] = 1

            if spelling_query:
                kwargs["spellcheck.q"] = spelling_query

        if facets is not None:
            kwargs["facet"] = "on"
            kwargs["facet.field"] = facets.keys()

            for facet_field, options in facets.items():
                for key, value in options.items():
                    kwargs["f.%s.facet.%s" % (facet_field, key)] = self.conn._from_python(value)

        if date_facets is not None:
            kwargs["facet"] = "on"
            kwargs["facet.date"] = date_facets.keys()
            kwargs["facet.date.other"] = "none"

            for key, value in date_facets.items():
                kwargs["f.%s.facet.date.start" % key] = self.conn._from_python(
                    value.get("start_date"))
                kwargs["f.%s.facet.date.end" % key] = self.conn._from_python(
                    value.get("end_date"))
                gap_by_string = value.get("gap_by").upper()
                gap_string = "%d%s" % (value.get("gap_amount"), gap_by_string)

                if value.get("gap_amount") != 1:
                    gap_string += "S"

                kwargs["f.%s.facet.date.gap" % key] = "+%s/%s" % (
                    gap_string,
                    gap_by_string,
                )

        if query_facets is not None:
            kwargs["facet"] = "on"
            kwargs["facet.query"] = [
                "%s:%s" % (field, value) for field, value in query_facets
            ]

        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(
                settings, "HAYSTACK_LIMIT_TO_REGISTERED_MODELS", True)

        if models:
            model_choices = sorted(get_model_ct(model) for model in models)
        elif limit_to_registered_models:
            # Using narrow queries, limit the results to only models handled
            # with the current routers.
            model_choices = self.build_models_list()
        else:
            model_choices = []

        if model_choices:
            if narrow_queries is None:
                narrow_queries = set()

            narrow_queries.add("%s:(%s)" % (DJANGO_CT, " OR ".join(model_choices)))

        if narrow_queries is not None:
            kwargs["fq"] = list(narrow_queries)

        if stats:
            kwargs["stats"] = "true"
            # Send every requested stats field; assigning inside the loop
            # would keep only the last one.
            kwargs["stats.field"] = list(stats)

            for stats_field, stats_facets in stats.items():
                facet_names = []
                for facet in stats_facets:
                    # An entry may itself be a list of facet names.
                    if isinstance(facet, (list, tuple)):
                        facet_names.extend(facet)
                    else:
                        facet_names.append(facet)

                if facet_names:
                    kwargs["f.%s.stats.facet" % stats_field] = facet_names

        if within is not None:
            from haystack.utils.geo import generate_bounding_box

            kwargs.setdefault("fq", [])
            ((min_lat, min_lng), (max_lat, max_lng)) = generate_bounding_box(
                within["point_1"], within["point_2"])
            # Bounding boxes are min, min TO max, max. Solr's wiki was *NOT*
            # very clear on this.
            bbox = "%s:[%s,%s TO %s,%s]" % (
                within["field"],
                min_lat,
                min_lng,
                max_lat,
                max_lng,
            )
            kwargs["fq"].append(bbox)

        if dwithin is not None:
            kwargs.setdefault("fq", [])
            lng, lat = dwithin["point"].coords
            geofilt = "{!geofilt pt=%s,%s sfield=%s d=%s}" % (
                lat,
                lng,
                dwithin["field"],
                dwithin["distance"].km,
            )
            kwargs["fq"].append(geofilt)

        # Check to see if the backend should try to include distances
        # (Solr 4.X+) in the results.
        if self.distance_available and distance_point:
            # In early testing, you can't just hand Solr 4.X a proper bounding box
            # & request distances. To enable native distance would take calculating
            # a center point & a radius off the user-provided box, which kinda
            # sucks. We'll avoid it for now, since Solr 4.x's release will be some
            # time yet.
            # kwargs['fl'] += ' _dist_:geodist()'
            pass

        if extra_kwargs:
            kwargs.update(extra_kwargs)

        return kwargs

    def more_like_this(self,
                       model_instance,
                       additional_query_string=None,
                       start_offset=0,
                       end_offset=None,
                       models=None,
                       limit_to_registered_models=None,
                       result_class=None,
                       **kwargs):
        """Ask Solr for documents similar to ``model_instance``.

        ``end_offset`` is passed to Solr unchanged as ``rows``. Extra
        ``kwargs`` are accepted but not used. Solr/IO failures are re-raised
        unless ``self.silently_fail`` is set, in which case empty results are
        processed instead.
        """
        from haystack import connections

        # Deferred models will have a different class ("RealClass_Deferred_fieldname")
        # which won't be in our registry:
        model_klass = model_instance._meta.concrete_model

        index = connections[self.connection_alias].get_unified_index().get_index(model_klass)
        field_name = index.get_content_field()
        params = {"fl": "*,score"}

        if start_offset is not None:
            params["start"] = start_offset

        if end_offset is not None:
            params["rows"] = end_offset

        narrow_queries = set()

        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(
                settings, "HAYSTACK_LIMIT_TO_REGISTERED_MODELS", True)

        if models:
            model_choices = sorted(get_model_ct(model) for model in models)
        elif limit_to_registered_models:
            # Using narrow queries, limit the results to only models handled
            # with the current routers.
            model_choices = self.build_models_list()
        else:
            model_choices = []

        if model_choices:
            narrow_queries.add("%s:(%s)" % (DJANGO_CT, " OR ".join(model_choices)))

        if additional_query_string:
            narrow_queries.add(additional_query_string)

        if narrow_queries:
            params["fq"] = list(narrow_queries)

        query = "%s:%s" % (ID, get_identifier(model_instance))

        try:
            raw_results = self.conn.more_like_this(query, field_name, **params)
        except (IOError, SolrError) as e:
            if not self.silently_fail:
                raise

            self.log.error(
                "Failed to fetch More Like This from Solr for document '%s': %s",
                query,
                e,
                exc_info=True,
            )
            raw_results = EmptyResults()

        return self._process_results(raw_results, result_class=result_class)

    def _process_results(self,
                         raw_results,
                         highlight=False,
                         result_class=None,
                         distance_point=None):
        """Convert a raw Solr response into Haystack's result dictionary.

        ``highlight`` is accepted but not read here. Documents whose model
        cannot be resolved, or is not among the indexed models, are dropped
        and subtracted from the hit count.

        Returns a dict with the keys ``results``, ``hits``, ``stats``,
        ``facets``, ``spelling_suggestion`` and ``spelling_suggestions``.
        """
        from haystack import connections

        results = []
        hits = raw_results.hits
        facets = {}
        stats = {}
        spelling_suggestion = spelling_suggestions = None

        if result_class is None:
            result_class = SearchResult

        if hasattr(raw_results, "stats"):
            stats = raw_results.stats.get("stats_fields", {})

        if hasattr(raw_results, "facets"):
            facets = {
                "fields": raw_results.facets.get("facet_fields", {}),
                "dates": raw_results.facets.get("facet_dates", {}),
                "queries": raw_results.facets.get("facet_queries", {}),
            }

            for key in ["fields"]:
                for facet_field in facets[key]:
                    # Convert to a two-tuple, as Solr's json format returns a list of
                    # pairs.
                    flat_counts = facets[key][facet_field]
                    facets[key][facet_field] = list(
                        zip(flat_counts[::2], flat_counts[1::2]))

        if self.include_spelling and hasattr(raw_results, "spellcheck"):
            try:
                spelling_suggestions = self.extract_spelling_suggestions(raw_results)
            except Exception as exc:
                self.log.error(
                    "Error extracting spelling suggestions: %s",
                    exc,
                    exc_info=True,
                    extra={"data": {
                        "spellcheck": raw_results.spellcheck
                    }},
                )

                if not self.silently_fail:
                    raise

                spelling_suggestions = None

            if spelling_suggestions:
                # Maintain compatibility with older versions of Haystack which returned a single suggestion:
                spelling_suggestion = spelling_suggestions[-1]
                assert isinstance(spelling_suggestion, str)
            else:
                spelling_suggestion = None

        unified_index = connections[self.connection_alias].get_unified_index()
        indexed_models = unified_index.get_indexed_models()

        for raw_result in raw_results.docs:
            # The content-type value is read as the first element of a sequence.
            app_label, model_name = raw_result[DJANGO_CT][0].split(".")
            additional_fields = {}
            model = haystack_get_model(app_label, model_name)

            if model and model in indexed_models:
                index = unified_index.get_index(model)
                index_field_map = index.field_map

                for key, value in raw_result.items():
                    string_key = str(key)
                    # re-map key if alternate name used
                    if string_key in index_field_map:
                        string_key = index_field_map[string_key]

                    if string_key in index.fields and hasattr(
                            index.fields[string_key], "convert"):
                        additional_fields[string_key] = index.fields[
                            string_key].convert(value[0])
                    else:
                        additional_fields[string_key] = self.conn._to_python(value)

                # These are passed to the result class positionally below.
                del additional_fields[DJANGO_CT]
                del additional_fields[DJANGO_ID]
                del additional_fields["score"]

                if raw_result[ID] in getattr(raw_results, "highlighting", {}):
                    additional_fields["highlighted"] = raw_results.highlighting[
                        raw_result[ID]]

                if distance_point:
                    additional_fields["_point_of_origin"] = distance_point

                    if raw_result.get("__dist__"):
                        from django.contrib.gis.measure import Distance

                        additional_fields["_distance"] = Distance(
                            km=float(raw_result["__dist__"]))
                    else:
                        additional_fields["_distance"] = None

                result = result_class(app_label, model_name,
                                      raw_result[DJANGO_ID][0],
                                      raw_result["score"], **additional_fields)
                results.append(result)
            else:
                hits -= 1

        return {
            "results": results,
            "hits": hits,
            "stats": stats,
            "facets": facets,
            "spelling_suggestion": spelling_suggestion,
            "spelling_suggestions": spelling_suggestions,
        }

    def extract_spelling_suggestions(self, raw_results):
        """Return the list of spelling suggestions found in ``raw_results.spellcheck``.

        Collations are preferred; individual word suggestions are used only
        when no collations are present. Returns an empty list when neither
        is available.
        """
        # There are many different formats for Legacy, 6.4, and 6.5 e.g.
        # https://issues.apache.org/jira/browse/SOLR-3029 and depending on the
        # version and configuration the response format may be a dict of dicts,
        # a list of dicts, or a list of strings.

        collations = raw_results.spellcheck.get("collations", None)
        suggestions = raw_results.spellcheck.get("suggestions", None)

        # We'll collect multiple suggestions here. For backwards
        # compatibility with older versions of Haystack we'll still return
        # only a single suggestion but in the future we can expose all of
        # them.

        spelling_suggestions = []

        if collations:
            if isinstance(collations, dict):
                # Solr 6.5
                collation_values = collations["collation"]
                if isinstance(collation_values, str):
                    collation_values = [collation_values]
                elif isinstance(collation_values, dict):
                    # spellcheck.collateExtendedResults changes the format to a dictionary:
                    collation_values = [collation_values["collationQuery"]]
            elif len(collations) > 1 and isinstance(collations[1], dict):
                # Solr 6.4
                collation_values = collations
            else:
                # Older versions of Solr (also covers a one-element list,
                # which has no second item to inspect).
                collation_values = collations[-1:]

            for i in collation_values:
                # Depending on the options the values are either simple strings or dictionaries:
                spelling_suggestions.append(
                    i["collationQuery"] if isinstance(i, dict) else i)
        elif suggestions:
            if isinstance(suggestions, dict):
                for i in suggestions.values():
                    for j in i["suggestion"]:
                        if isinstance(j, dict):
                            spelling_suggestions.append(j["word"])
                        else:
                            spelling_suggestions.append(j)
            elif (len(suggestions) > 1 and isinstance(suggestions[0], str)
                  and isinstance(suggestions[1], dict)):
                # Solr 6.4 uses a list of paired (word, dictionary) pairs:
                for suggestion in suggestions:
                    if isinstance(suggestion, dict):
                        for i in suggestion["suggestion"]:
                            if isinstance(i, dict):
                                spelling_suggestions.append(i["word"])
                            else:
                                spelling_suggestions.append(i)
            else:
                # Legacy Solr
                spelling_suggestions.append(suggestions[-1])

        return spelling_suggestions

    def build_schema(self, fields):
        """Describe ``fields`` as Solr schema entries.

        Returns a ``(content_field_name, schema_fields)`` tuple, where
        ``content_field_name`` is the index name of the field flagged as the
        document field (``""`` if none) and ``schema_fields`` is a list of
        dicts with the keys ``field_name``, ``type``, ``indexed``, ``stored``
        and ``multi_valued``.
        """
        content_field_name = ""
        schema_fields = []

        for field_class in fields.values():
            field_data = {
                "field_name": field_class.index_fieldname,
                "type": "text_en",
                "indexed": "true",
                "stored": "true",
                "multi_valued": "false",
            }

            if field_class.document is True:
                content_field_name = field_class.index_fieldname

            # DRL_FIXME: Perhaps move to something where, if none of these
            #            checks succeed, call a custom method on the form that
            #            returns, per-backend, the right type of storage?
            if field_class.field_type in ["date", "datetime"]:
                field_data["type"] = "date"
            elif field_class.field_type == "integer":
                field_data["type"] = "long"
            elif field_class.field_type == "float":
                field_data["type"] = "float"
            elif field_class.field_type == "boolean":
                field_data["type"] = "boolean"
            elif field_class.field_type == "ngram":
                field_data["type"] = "ngram"
            elif field_class.field_type == "edge_ngram":
                field_data["type"] = "edge_ngram"
            elif field_class.field_type == "location":
                field_data["type"] = "location"

            if field_class.is_multivalued:
                field_data["multi_valued"] = "true"

            if field_class.stored is False:
                field_data["stored"] = "false"

            # Do this last to override `text` fields.
            if field_class.indexed is False:
                field_data["indexed"] = "false"

                # If it's text and not being indexed, we probably don't want
                # to do the normal lowercase/tokenize/stemming/etc. dance.
                if field_data["type"] == "text_en":
                    field_data["type"] = "string"

            # If it's a ``FacetField``, make sure we don't postprocess it.
            if hasattr(field_class, "facet_for"):
                # If it's text, it ought to be a string.
                if field_data["type"] == "text_en":
                    field_data["type"] = "string"

            schema_fields.append(field_data)

        return (content_field_name, schema_fields)

    def extract_file_contents(self, file_obj, **kwargs):
        """Extract text and metadata from a structured file (PDF, MS Word, etc.)

        Uses the Solr ExtractingRequestHandler, which is based on Apache Tika.
        See the Solr wiki for details:

            http://wiki.apache.org/solr/ExtractingRequestHandler

        Due to the way the ExtractingRequestHandler is implemented it completely
        replaces the normal Haystack indexing process with several unfortunate
        restrictions: only one file per request, the extracted data is added to
        the index with no ability to modify it, etc. To simplify the process and
        allow for more advanced use we'll run using the extract-only mode to
        return the extracted data without adding it to the index so we can then
        use it within Haystack's normal templating process.

        Extra keyword arguments are forwarded to the connection's ``extract``
        call.

        Returns None if metadata cannot be extracted; otherwise returns a
        dictionary containing at least two keys:

            :contents:
                        Extracted full-text content, if applicable
            :metadata:
                        key:value pairs of text strings
        """
        try:
            return self.conn.extract(file_obj, **kwargs)
        except Exception as e:
            # Deliberately best-effort: any failure is logged and reported
            # to the caller as ``None``.
            self.log.warning(
                "Unable to extract file contents: %s",
                e,
                exc_info=True,
                extra={"data": {
                    "file": file_obj
                }},
            )
            return None
class DocManager():
    """The DocManager class creates a connection to the backend engine and
    adds/removes documents, and in the case of rollback, searches for them.

    The reason for storing id/doc pairs as opposed to doc's is so that multiple
    updates to the same doc reflect the most up to date version as opposed to
    multiple, slightly different versions of a doc.
    """

    def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
                 unique_key='_id', **kwargs):
        """Establish a connection to Solr at `url` and load the schema fields.

        `auto_commit_interval` is multiplied by 1000 before being stored
        (None is kept as None). Extra keyword arguments are accepted but
        not used.
        """
        self.solr = Solr(url)
        self.unique_key = unique_key
        # pysolr does things in milliseconds
        if auto_commit_interval is not None:
            self.auto_commit_interval = auto_commit_interval * 1000
        else:
            self.auto_commit_interval = None
        self.field_list = []
        self._build_fields()

    def _parse_fields(self, result, field_name):
        """Return the names listed under result['schema'][field_name].

        Returns an empty list when either level is missing.
        """
        # Mapping keys are already unique, so no de-duplication is needed.
        return list(result.get('schema', {}).get(field_name, {}).keys())

    def _build_fields(self):
        """Load the declared and dynamic field names from Solr.

        Populates `self.field_list` and `self._dynamic_field_regexes`.
        """
        declared_fields = self.solr._send_request('get', ADMIN_URL)
        result = decoder.decode(declared_fields)
        self.field_list = self._parse_fields(result, 'fields')

        # Build regular expressions to match dynamic fields.
        # dynamic field names may have exactly one wildcard, either at
        # the beginning or the end of the name
        self._dynamic_field_regexes = []
        for wc_pattern in self._parse_fields(result, 'dynamicFields'):
            # The literal part is escaped so that characters such as "."
            # in a field name are not interpreted as regex syntax.
            if wc_pattern.startswith("*"):
                self._dynamic_field_regexes.append(
                    re.compile(r"\w*%s\Z" % re.escape(wc_pattern[1:])))
            elif wc_pattern.endswith("*"):
                self._dynamic_field_regexes.append(
                    re.compile(r"\A%s\w*" % re.escape(wc_pattern[:-1])))

    @staticmethod
    def _flatten_doc(doc):
        """Return a copy of `doc` with no sub-documents or lists.

        Every value is keyed by the dot-separated path of keys (and list
        indexes) leading to it. An example:
          {"a": 2,
           "b": {
             "c": {
               "d": 5
             }
           },
           "e": [6, 7, 8]
          }

          becomes:
          {"a": 2, "b.c.d": 5, "e.0": 6, "e.1": 7, "e.2": 8}
        """
        def flattened_kernel(doc, path):
            for k, v in doc.items():
                path.append(k)
                if isinstance(v, dict):
                    for inner_k, inner_v in flattened_kernel(v, path):
                        yield inner_k, inner_v
                elif isinstance(v, list):
                    for li, lv in enumerate(v):
                        path.append(str(li))
                        if isinstance(lv, dict):
                            for dk, dv in flattened_kernel(lv, path):
                                yield dk, dv
                        else:
                            yield ".".join(path), lv
                        path.pop()
                else:
                    yield ".".join(path), v
                path.pop()
        return dict(flattened_kernel(doc, []))

    def _clean_doc(self, doc, flatten=False):
        """Reformats the given document before insertion into Solr.

        This method reformats the document in the following ways:
          - copies doc["_id"] into the configured unique key field (this
            modifies the `doc` passed in)
          - when `flatten` is true, flattens the document so that there are
            no sub-documents (see `_flatten_doc`); by default the document
            is NOT flattened
          - when schema fields or dynamic field patterns were retrieved,
            removes fields that match neither

        Returns the document to send to Solr.
        """
        # Translate the _id field to whatever unique key we're using
        doc[self.unique_key] = doc["_id"]
        flat_doc = self._flatten_doc(doc) if flatten else doc

        # Only include fields that are explicitly provided in the
        # schema or match one of the dynamic field patterns, if
        # we were able to retrieve the schema
        if self.field_list or self._dynamic_field_regexes:
            def include_field(field):
                return field in self.field_list or any(
                    regex.match(field)
                    for regex in self._dynamic_field_regexes)
            return dict(
                (k, v) for k, v in flat_doc.items() if include_field(k))
        return flat_doc

    def stop(self):
        """ Stops the instance
        """
        pass

    def upsert(self, doc):
        """Update or insert a document into Solr

        This method should call whatever add/insert/update method exists for
        the backend engine and add the document in there. The input will
        always be one mongo document, represented as a Python dictionary.

        Raises errors.OperationFailed when Solr rejects the document.
        """
        try:
            if self.auto_commit_interval is not None:
                self.solr.add([self._clean_doc(doc)],
                              commit=(self.auto_commit_interval == 0),
                              commitWithin=str(self.auto_commit_interval))
            else:
                self.solr.add([self._clean_doc(doc)], commit=False)
        except SolrError:
            raise errors.OperationFailed(
                "Could not insert %r into Solr" % bsjson.dumps(doc))

    def bulk_upsert(self, docs):
        """Update or insert multiple documents into Solr

        docs may be any iterable

        Raises errors.OperationFailed when Solr rejects the batch.
        """
        try:
            # Cleaned lazily, as the documents are consumed.
            cleaned = (self._clean_doc(d) for d in docs)
            if self.auto_commit_interval is not None:
                self.solr.add(cleaned,
                              commit=(self.auto_commit_interval == 0),
                              commitWithin=str(self.auto_commit_interval))
            else:
                self.solr.add(cleaned, commit=False)
        except SolrError:
            raise errors.OperationFailed(
                "Could not bulk-insert documents into Solr")

    def remove(self, doc):
        """Removes documents from Solr

        The input is a python dictionary that represents a mongo document.
        The document is looked up by its `self.unique_key` entry.
        """
        self.solr.delete(id=str(doc[self.unique_key]),
                         commit=(self.auto_commit_interval == 0))

    def _remove(self):
        """Removes everything
        """
        self.solr.delete(q='*:*', commit=(self.auto_commit_interval == 0))

    def search(self, start_ts, end_ts):
        """Called to query Solr for documents in a time range.
        """
        query = '_ts: [%s TO %s]' % (start_ts, end_ts)
        return self.solr.search(query, rows=100000000)

    def _search(self, query):
        """For test purposes only. Performs search on Solr with given query
            Does not have to be implemented.
        """
        return self.solr.search(query, rows=200)

    def commit(self):
        """This function is used to force a commit.
        """
        retry_until_ok(self.solr.commit)

    def get_last_doc(self):
        """Returns the last document stored in the Solr engine.

        Returns None when the search raises ValueError or finds nothing.
        """
        # search everything, sort by descending timestamp, return 1 row
        try:
            result = self.solr.search('*:*', sort='_ts desc', rows=1)
        except ValueError:
            return None

        if len(result) == 0:
            return None

        return result.docs[0]
class SolrSearchBackend(BaseSearchBackend): # Word reserved by Solr for special use. RESERVED_WORDS = ( 'AND', 'NOT', 'OR', 'TO', ) # Characters reserved by Solr for special use. # The '\\' must come first, so as not to overwrite the other slash replacements. RESERVED_CHARACTERS = ( '\\', '+', '-', '&&', '||', '!', '(', ')', '{', '}', '[', ']', '^', '"', '~', '*', '?', ':', '/', ) def __init__(self, connection_alias, **connection_options): super(SolrSearchBackend, self).__init__(connection_alias, **connection_options) if not 'URL' in connection_options: raise ImproperlyConfigured("You must specify a 'URL' in your settings for connection '%s'." % connection_alias) self.conn = Solr(connection_options['URL'], timeout=self.timeout, **connection_options.get('KWARGS', {})) self.log = logging.getLogger('haystack') def get_schema_admin(self): ''' SolrSchemaAdmin singleton ''' if not hasattr(self, '_schema_admin'): self._schema_admin = SolrSchemaAdmin(self.conn.url, self.conn.session) return self._schema_admin def update(self, index, iterable, commit=True): docs = [] for obj in iterable: try: docs.append(index.full_prepare(obj)) except SkipDocument: self.log.debug(u"Indexing for object `%s` skipped", obj) except UnicodeDecodeError: if not self.silently_fail: raise # We'll log the object identifier but won't include the actual object # to avoid the possibility of that generating encoding errors while # processing the log message: self.log.error(u"UnicodeDecodeError while preparing object for update", exc_info=True, extra={ "data": { "index": index, "object": get_identifier(obj) } }) if len(docs) > 0: try: self.conn.add(docs, commit=commit, boost=index.get_field_weights()) except (IOError, SolrError) as e: if not self.silently_fail: raise self.log.error("Failed to add documents to Solr: %s", e) def remove(self, obj_or_string, commit=True): solr_id = get_identifier(obj_or_string) try: kwargs = { 'commit': commit, 'id': solr_id } self.conn.delete(**kwargs) except (IOError, 
SolrError) as e: if not self.silently_fail: raise self.log.error("Failed to remove document '%s' from Solr: %s", solr_id, e) def clear(self, models=[], commit=True): try: if not models: # *:* matches all docs in Solr self.conn.delete(q='*:*', commit=commit) else: models_to_delete = [] for model in models: models_to_delete.append("%s:%s" % (DJANGO_CT, get_model_ct(model))) self.conn.delete(q=" OR ".join(models_to_delete), commit=commit) if commit: # Run an optimize post-clear. http://wiki.apache.org/solr/FAQ#head-9aafb5d8dff5308e8ea4fcf4b71f19f029c4bb99 self.conn.optimize() except (IOError, SolrError) as e: if not self.silently_fail: raise if len(models): self.log.error("Failed to clear Solr index of models '%s': %s", ','.join(models_to_delete), e) else: self.log.error("Failed to clear Solr index: %s", e) @log_query def search(self, query_string, **kwargs): if len(query_string) == 0: return { 'results': [], 'hits': 0, } search_kwargs = self.build_search_kwargs(query_string, **kwargs) try: raw_results = self.conn.search(query_string, **search_kwargs) except (IOError, SolrError) as e: if not self.silently_fail: raise self.log.error("Failed to query Solr using '%s': %s", query_string, e) raw_results = EmptyResults() return self._process_results(raw_results, highlight=kwargs.get('highlight'), result_class=kwargs.get('result_class', SearchResult), distance_point=kwargs.get('distance_point')) def build_search_kwargs(self, query_string, sort_by=None, start_offset=0, end_offset=None, fields='', highlight=False, facets=None, date_facets=None, query_facets=None, narrow_queries=None, spelling_query=None, within=None, dwithin=None, distance_point=None, models=None, limit_to_registered_models=None, result_class=None, stats=None): kwargs = {'fl': '* score'} if fields: if isinstance(fields, (list, set)): fields = " ".join(fields) kwargs['fl'] = fields if sort_by is not None: if sort_by in ['distance asc', 'distance desc'] and distance_point: # Do the geo-enabled sort. 
lng, lat = distance_point['point'].get_coords() kwargs['sfield'] = distance_point['field'] kwargs['pt'] = '%s,%s' % (lat, lng) if sort_by == 'distance asc': kwargs['sort'] = 'geodist() asc' else: kwargs['sort'] = 'geodist() desc' else: if sort_by.startswith('distance '): warnings.warn("In order to sort by distance, you must call the '.distance(...)' method.") # Regular sorting. kwargs['sort'] = sort_by if start_offset is not None: kwargs['start'] = start_offset if end_offset is not None: kwargs['rows'] = end_offset - start_offset if highlight is True: kwargs['hl'] = 'true' kwargs['hl.fragsize'] = '200' if self.include_spelling is True: kwargs['spellcheck'] = 'true' kwargs['spellcheck.collate'] = 'true' kwargs['spellcheck.count'] = 1 if spelling_query: kwargs['spellcheck.q'] = spelling_query if facets is not None: kwargs['facet'] = 'on' kwargs['facet.field'] = facets.keys() for facet_field, options in facets.items(): for key, value in options.items(): kwargs['f.%s.facet.%s' % (facet_field, key)] = self.conn._from_python(value) if date_facets is not None: kwargs['facet'] = 'on' kwargs['facet.date'] = date_facets.keys() kwargs['facet.date.other'] = 'none' for key, value in date_facets.items(): kwargs["f.%s.facet.date.start" % key] = self.conn._from_python(value.get('start_date')) kwargs["f.%s.facet.date.end" % key] = self.conn._from_python(value.get('end_date')) gap_by_string = value.get('gap_by').upper() gap_string = "%d%s" % (value.get('gap_amount'), gap_by_string) if value.get('gap_amount') != 1: gap_string += "S" kwargs["f.%s.facet.date.gap" % key] = '+%s/%s' % (gap_string, gap_by_string) if query_facets is not None: kwargs['facet'] = 'on' kwargs['facet.query'] = ["%s:%s" % (field, value) for field, value in query_facets] if limit_to_registered_models is None: limit_to_registered_models = getattr(settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True) if models and len(models): model_choices = sorted(get_model_ct(model) for model in models) elif 
limit_to_registered_models: # Using narrow queries, limit the results to only models handled # with the current routers. model_choices = self.build_models_list() else: model_choices = [] if len(model_choices) > 0: if narrow_queries is None: narrow_queries = set() narrow_queries.add('%s:(%s)' % (DJANGO_CT, ' OR '.join(model_choices))) if narrow_queries is not None: kwargs['fq'] = list(narrow_queries) if stats: kwargs['stats'] = "true" for k in stats.keys(): kwargs['stats.field'] = k for facet in stats[k]: kwargs['f.%s.stats.facet' % k] = facet if within is not None: from haystack.utils.geo import generate_bounding_box kwargs.setdefault('fq', []) ((min_lat, min_lng), (max_lat, max_lng)) = generate_bounding_box(within['point_1'], within['point_2']) # Bounding boxes are min, min TO max, max. Solr's wiki was *NOT* # very clear on this. bbox = '%s:[%s,%s TO %s,%s]' % (within['field'], min_lat, min_lng, max_lat, max_lng) kwargs['fq'].append(bbox) if dwithin is not None: kwargs.setdefault('fq', []) lng, lat = dwithin['point'].get_coords() geofilt = '{!geofilt pt=%s,%s sfield=%s d=%s}' % (lat, lng, dwithin['field'], dwithin['distance'].km) kwargs['fq'].append(geofilt) # Check to see if the backend should try to include distances # (Solr 4.X+) in the results. if self.distance_available and distance_point: # In early testing, you can't just hand Solr 4.X a proper bounding box # & request distances. To enable native distance would take calculating # a center point & a radius off the user-provided box, which kinda # sucks. We'll avoid it for now, since Solr 4.x's release will be some # time yet. 
# kwargs['fl'] += ' _dist_:geodist()' pass return kwargs def more_like_this(self, model_instance, additional_query_string=None, start_offset=0, end_offset=None, models=None, limit_to_registered_models=None, result_class=None, **kwargs): from haystack import connections # Deferred models will have a different class ("RealClass_Deferred_fieldname") # which won't be in our registry: model_klass = model_instance._meta.concrete_model index = connections[self.connection_alias].get_unified_index().get_index(model_klass) field_name = index.get_content_field() params = { 'fl': '*,score', } if start_offset is not None: params['start'] = start_offset if end_offset is not None: params['rows'] = end_offset narrow_queries = set() if limit_to_registered_models is None: limit_to_registered_models = getattr(settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True) if models and len(models): model_choices = sorted(get_model_ct(model) for model in models) elif limit_to_registered_models: # Using narrow queries, limit the results to only models handled # with the current routers. 
model_choices = self.build_models_list() else: model_choices = [] if len(model_choices) > 0: if narrow_queries is None: narrow_queries = set() narrow_queries.add('%s:(%s)' % (DJANGO_CT, ' OR '.join(model_choices))) if additional_query_string: narrow_queries.add(additional_query_string) if narrow_queries: params['fq'] = list(narrow_queries) query = "%s:%s" % (ID, get_identifier(model_instance)) try: raw_results = self.conn.more_like_this(query, field_name, **params) except (IOError, SolrError) as e: if not self.silently_fail: raise self.log.error("Failed to fetch More Like This from Solr for document '%s': %s", query, e) raw_results = EmptyResults() return self._process_results(raw_results, result_class=result_class) def _process_results(self, raw_results, highlight=False, result_class=None, distance_point=None): from haystack import connections results = [] hits = raw_results.hits facets = {} stats = {} spelling_suggestion = None if result_class is None: result_class = SearchResult if hasattr(raw_results,'stats'): stats = raw_results.stats.get('stats_fields',{}) if hasattr(raw_results, 'facets'): facets = { 'fields': raw_results.facets.get('facet_fields', {}), 'dates': raw_results.facets.get('facet_dates', {}), 'queries': raw_results.facets.get('facet_queries', {}), } for key in ['fields']: for facet_field in facets[key]: # Convert to a two-tuple, as Solr's json format returns a list of # pairs. facets[key][facet_field] = list(zip(facets[key][facet_field][::2], facets[key][facet_field][1::2])) if self.include_spelling is True: if hasattr(raw_results, 'spellcheck'): if len(raw_results.spellcheck.get('suggestions', [])): # For some reason, it's an array of pairs. Pull off the # collated result from the end. 
spelling_suggestion = raw_results.spellcheck.get('suggestions')[-1] unified_index = connections[self.connection_alias].get_unified_index() indexed_models = unified_index.get_indexed_models() for raw_result in raw_results.docs: app_label, model_name = raw_result[DJANGO_CT].split('.') additional_fields = {} model = haystack_get_model(app_label, model_name) if model and model in indexed_models: index = unified_index.get_index(model) index_field_map = index.field_map for key, value in raw_result.items(): string_key = str(key) # re-map key if alternate name used if string_key in index_field_map: string_key = index_field_map[key] if string_key in index.fields and hasattr(index.fields[string_key], 'convert'): additional_fields[string_key] = index.fields[string_key].convert(value) else: additional_fields[string_key] = self.conn._to_python(value) del(additional_fields[DJANGO_CT]) del(additional_fields[DJANGO_ID]) del(additional_fields['score']) if raw_result[ID] in getattr(raw_results, 'highlighting', {}): additional_fields['highlighted'] = raw_results.highlighting[raw_result[ID]] if distance_point: additional_fields['_point_of_origin'] = distance_point if raw_result.get('__dist__'): from haystack.utils.geo import Distance additional_fields['_distance'] = Distance(km=float(raw_result['__dist__'])) else: additional_fields['_distance'] = None result = result_class(app_label, model_name, raw_result[DJANGO_ID], raw_result['score'], **additional_fields) results.append(result) else: hits -= 1 return { 'results': results, 'hits': hits, 'stats': stats, 'facets': facets, 'spelling_suggestion': spelling_suggestion, } def build_schema(self, fields): content_field_name = '' schema_fields = [] for field_name, field_class in fields.items(): field_data = { 'name': field_class.index_fieldname, 'type': 'text_en', 'indexed': 'true', 'stored': 'true', 'multiValued': 'false', } if field_class.document is True: content_field_name = field_class.index_fieldname # DRL_FIXME: Perhaps move to 
something where, if none of these # checks succeed, call a custom method on the form that # returns, per-backend, the right type of storage? if field_class.field_type in ['date', 'datetime']: field_data['type'] = 'date' elif field_class.field_type == 'integer': field_data['type'] = 'long' elif field_class.field_type == 'float': field_data['type'] = 'float' elif field_class.field_type == 'boolean': field_data['type'] = 'boolean' elif field_class.field_type == 'ngram': field_data['type'] = 'ngram' elif field_class.field_type == 'edge_ngram': field_data['type'] = 'edge_ngram' elif field_class.field_type == 'location': field_data['type'] = 'location' if field_class.is_multivalued: field_data['multiValued'] = 'true' if field_class.stored is False: field_data['stored'] = 'false' # Do this last to override `text` fields. if field_class.indexed is False: field_data['indexed'] = 'false' # If it's text and not being indexed, we probably don't want # to do the normal lowercase/tokenize/stemming/etc. dance. if field_data['type'] == 'text_en': field_data['type'] = 'string' # If it's a ``FacetField``, make sure we don't postprocess it. if hasattr(field_class, 'facet_for'): # If it's text, it ought to be a string. if field_data['type'] == 'text_en': field_data['type'] = 'string' schema_fields.append(field_data) return (content_field_name, schema_fields) def extract_file_contents(self, file_obj): """Extract text and metadata from a structured file (PDF, MS Word, etc.) Uses the Solr ExtractingRequestHandler, which is based on Apache Tika. See the Solr wiki for details: http://wiki.apache.org/solr/ExtractingRequestHandler Due to the way the ExtractingRequestHandler is implemented it completely replaces the normal Haystack indexing process with several unfortunate restrictions: only one file per request, the extracted data is added to the index with no ability to modify it, etc. 
To simplify the process and allow for more advanced use we'll run using the extract-only mode to return the extracted data without adding it to the index so we can then use it within Haystack's normal templating process. Returns None if metadata cannot be extracted; otherwise returns a dictionary containing at least two keys: :contents: Extracted full-text content, if applicable :metadata: key:value pairs of text strings """ try: return self.conn.extract(file_obj) except Exception as e: self.log.warning(u"Unable to extract file contents: %s", e, exc_info=True, extra={"data": {"file": file_obj}}) return None
class DocManager(DocManagerBase):
    """The DocManager class creates a connection to the backend engine and
    adds/removes documents, and in the case of rollback, searches for them.

    The reason for storing id/doc pairs as opposed to doc's is so that multiple
    updates to the same doc reflect the most up to date version as opposed to
    multiple, slightly different versions of a doc.
    """

    def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
                 unique_key='_id', chunk_size=DEFAULT_MAX_BULK, **kwargs):
        """Verify Solr URL and establish a connection.

        :param url: Solr URL handed to ``Solr``.
        :param auto_commit_interval: commit interval, multiplied by 1000
            before being stored; ``None`` disables commit-related arguments,
            ``0`` commits on every write.
        :param unique_key: name of the Solr field that stores the document's
            ``_id``.
        :param chunk_size: maximum number of documents per request in
            ``bulk_upsert``; values <= 0 send everything in one request.
        :param kwargs: accepted and ignored.
        """
        self.solr = Solr(url)
        self.unique_key = unique_key
        # pysolr does things in milliseconds
        if auto_commit_interval is not None:
            self.auto_commit_interval = auto_commit_interval * 1000
        else:
            self.auto_commit_interval = None
        self.chunk_size = chunk_size
        self.field_list = []
        self._build_fields()
        self._formatter = DocumentFlattener()

    def _parse_fields(self, result, field_name):
        """Return the names listed under ``result['schema'][field_name]``.

        Yields an empty list when the schema (or that section of it) is
        missing from ``result``.
        """
        # Mapping keys are already unique, so no explicit de-duplication is
        # needed.
        return list(result.get('schema', {}).get(field_name, {}))

    @wrap_exceptions
    def _build_fields(self):
        """Load the declared and dynamic field names from Solr.

        Populates ``self.field_list`` with the declared field names and
        ``self._dynamic_field_regexes`` with one compiled pattern per
        dynamic field.
        """
        declared_fields = self.solr._send_request('get', ADMIN_URL)
        result = decoder.decode(declared_fields)
        self.field_list = self._parse_fields(result, 'fields')

        # Build regular expressions to match dynamic fields.
        # dynamic field names may have exactly one wildcard, either at
        # the beginning or the end of the name. Everything except the
        # wildcard is literal text, so it is escaped before being embedded
        # in the pattern.
        self._dynamic_field_regexes = []
        for wc_pattern in self._parse_fields(result, 'dynamicFields'):
            if wc_pattern.startswith("*"):
                self._dynamic_field_regexes.append(
                    re.compile(r".*%s\Z" % re.escape(wc_pattern[1:])))
            elif wc_pattern.endswith("*"):
                self._dynamic_field_regexes.append(
                    re.compile(r"\A%s.*" % re.escape(wc_pattern[:-1])))

    def _clean_doc(self, doc):
        """Reformats the given document before insertion into Solr.

        This method reformats the document in the following ways:
          - removes extraneous fields that aren't defined in schema.xml
          - unwinds arrays in order to find and later flatten sub-documents
          - flattens the document so that there are no sub-documents, and every
            value is associated with its dot-separated path of keys

        An example:
          {"a": 2,
           "b": {
             "c": {
               "d": 5
             }
           },
           "e": [6, 7, 8]
          }

        becomes:
          {"a": 2, "b.c.d": 5, "e.0": 6, "e.1": 7, "e.2": 8}

        Note that ``doc`` itself is modified: its ``_id`` entry (if any) is
        moved to ``self.unique_key`` and converted to ``str``.
        """

        # Translate the _id field to whatever unique key we're using.
        # _id may not exist in the doc, if we retrieved it from Solr
        # as part of update.
        if '_id' in doc:
            doc[self.unique_key] = str(doc.pop("_id"))

        # SOLR cannot index fields within sub-documents, so flatten documents
        # with the dot-separated path to each value as the respective key
        flat_doc = self._formatter.format_document(doc)

        # Only include fields that are explicitly provided in the
        # schema or match one of the dynamic field patterns, if
        # we were able to retrieve the schema
        if not self.field_list and not self._dynamic_field_regexes:
            return flat_doc

        def include_field(field):
            return field in self.field_list or any(
                regex.match(field) for regex in self._dynamic_field_regexes)

        return dict((k, v) for k, v in flat_doc.items() if include_field(k))

    def stop(self):
        """ Stops the instance
        """
        pass

    @staticmethod
    def _pop_path(doc, path):
        """Remove ``path`` and every dotted descendant of it from flat ``doc``."""
        prefix = path + '.'
        # Collect first: the dict must not change size while iterating it.
        doomed = [key for key in doc if key == path or key.startswith(prefix)]
        for key in doomed:
            doc.pop(key)

    def apply_update(self, doc, update_spec):
        """Override DocManagerBase.apply_update to have flat documents.

        :param doc: the flat document as currently stored; modified in place
            for ``$set``/``$unset`` updates.
        :param update_spec: either a replacement document or a mapping with
            ``$set`` and/or ``$unset`` entries.
        :returns: ``update_spec`` (with ``_ts``, ``ns`` and ``_id`` copied
            over from ``doc``) for a whole-document replacement, otherwise
            the modified ``doc``.
        """
        # Replace a whole document
        if '$set' not in update_spec and '$unset' not in update_spec:
            # update spec contains the new document
            update_spec['_ts'] = doc['_ts']
            update_spec['ns'] = doc['ns']
            update_spec['_id'] = doc['_id']
            return update_spec

        for to_set in update_spec.get("$set", []):
            value = update_spec['$set'][to_set]
            # Find dotted-path to the value, remove that key from doc, then
            # put value at key:
            self._pop_path(doc, to_set)
            doc[to_set] = value

        for to_unset in update_spec.get("$unset", []):
            # MongoDB < 2.5.2 reports $unset for fields that don't exist within
            # the document being updated.
            self._pop_path(doc, to_unset)

        return doc

    @wrap_exceptions
    def update(self, doc, update_spec):
        """Apply updates given in update_spec to the document whose id
        matches that of doc.

        :returns: the updated document, or ``None`` when no document with a
            matching id could be found in Solr.
        """
        # Commit outstanding changes so that the document to be updated is the
        # same version to which the changes apply.
        self.commit()
        query = "%s:%s" % (self.unique_key, str(doc['_id']))
        results = self.solr.search(query)
        if not len(results):
            # Document may not be retrievable yet
            self.commit()
            results = self.solr.search(query)

        # Stays None when the search comes back empty, instead of leaving the
        # name unbound for the final return.
        updated = None
        # Results is an iterable containing only 1 result
        for stored_doc in results:
            updated = self.apply_update(stored_doc, update_spec)
            # A _version_ of 0 will always apply the update
            updated['_version_'] = 0
            self.upsert(updated)
        return updated

    def _add_kwargs(self):
        """Return the commit-related keyword arguments for ``self.solr.add``."""
        if self.auto_commit_interval is None:
            return {"commit": False}
        return {
            "commit": (self.auto_commit_interval == 0),
            "commitWithin": str(self.auto_commit_interval)
        }

    @wrap_exceptions
    def upsert(self, doc):
        """Update or insert a document into Solr

        This method should call whatever add/insert/update method exists for
        the backend engine and add the document in there. The input will
        always be one mongo document, represented as a Python dictionary.
        """
        self.solr.add([self._clean_doc(doc)], **self._add_kwargs())

    @wrap_exceptions
    def bulk_upsert(self, docs):
        """Update or insert multiple documents into Solr

        docs may be any iterable. When ``self.chunk_size`` is positive the
        documents are sent in batches of at most that many; otherwise they
        are all passed to a single ``add`` call.
        """
        from itertools import islice

        add_kwargs = self._add_kwargs()
        cleaned = (self._clean_doc(d) for d in docs)

        if self.chunk_size > 0:
            # islice simply stops when ``cleaned`` is exhausted. Calling
            # next() inside a generator expression instead would surface as
            # RuntimeError on Python 3.7+ (PEP 479).
            batch = list(islice(cleaned, self.chunk_size))
            while batch:
                self.solr.add(batch, **add_kwargs)
                batch = list(islice(cleaned, self.chunk_size))
        else:
            self.solr.add(cleaned, **add_kwargs)

    @wrap_exceptions
    def remove(self, doc):
        """Removes documents from Solr

        The input is a python dictionary that represents a mongo document.
        """
        self.solr.delete(id=str(doc["_id"]),
                         commit=(self.auto_commit_interval == 0))

    @wrap_exceptions
    def _remove(self):
        """Removes everything
        """
        self.solr.delete(q='*:*', commit=(self.auto_commit_interval == 0))

    @wrap_exceptions
    def _stream_search(self, query):
        """Helper method for iterating over Solr search results.

        Yields each matching document with its unique key moved back to
        ``_id``.
        """
        for doc in self.solr.search(query, rows=100000000):
            if self.unique_key != "_id":
                doc["_id"] = doc.pop(self.unique_key)
            yield doc

    @wrap_exceptions
    def search(self, start_ts, end_ts):
        """Called to query Solr for documents in a time range."""
        query = '_ts: [%s TO %s]' % (start_ts, end_ts)
        return self._stream_search(query)

    @wrap_exceptions
    def _search(self, query):
        """For test purposes only. Performs search on Solr with given query
            Does not have to be implemented.
        """
        return self._stream_search(query)

    def commit(self):
        """This function is used to force a commit.
        """
        retry_until_ok(self.solr.commit)

    @wrap_exceptions
    def get_last_doc(self):
        """Returns the last document stored in the Solr engine.

        Returns ``None`` when the search raises ``ValueError`` or yields no
        documents.
        """
        # search everything, sort by descending timestamp, return 1 row
        try:
            result = self.solr.search('*:*', sort='_ts desc', rows=1)
        except ValueError:
            return None

        for r in result:
            r['_id'] = r.pop(self.unique_key)
            return r
        return None