def oralookup(pid=None, uuid=None,
              fields_to_return="f_name, f_subject, f_keyphrase, faculty, f_institution, thesis_type, content_type, collection"):
    s = SolrConnection("ora.ouls.ox.ac.uk:8080")
    results = {}
    query = ""
    if pid:
        # Escape the colons in the id so Solr's query parser doesn't
        # treat them as field separators
        pid = r"\:".join(pid.split(":"))
        query = "id:%s" % pid
    elif uuid:
        query = r"id:uuid\:%s" % uuid
    else:
        return results

    # Run the actual query (up to 3 tries, with failover)
    r = None
    tries = 0
    while tries != 3:
        try:
            r = s.search(q=query, wt="json", fl=fields_to_return)
            logger.debug("Solr response: %s" % r)
            tries = 3
        except BadStatusLine:
            sleep(0.5)
            tries += 1

    try:
        results = simplejson.loads(r)
        assert results["response"]["numFound"] == 1
        return results["response"]["docs"][0]
    except ValueError:
        logger.warning("Couldn't parse JSON response from Solr endpoint: %s" % r)
        return {}
    except AssertionError:
        logger.warning("Couldn't assert that only a single result was fetched: %s" % results)
        return {}
def titlelookup(pid=None, uuid=None):
    s = SolrConnection("ora.ouls.ox.ac.uk:8080")
    results = {}
    query = ""
    if pid:
        pid = r"\:".join(pid.split(":"))
        query = "id:%s" % pid
    elif uuid:
        query = r"id:uuid\:%s" % uuid
    else:
        return results

    # Run the actual query (up to 3 tries, with failover)
    r = None
    tries = 0
    while tries != 3:
        try:
            r = s.search(q=query, wt="json", fl="title")
            logger.debug("Solr response: %s" % r)
            tries = 3
        except BadStatusLine:
            sleep(0.5)
            tries += 1

    try:
        results = simplejson.loads(r)
        assert results['response']['numFound'] == 1
        doc = results['response']['docs'][0]
        return doc['title']
    except ValueError:
        logger.warning("Couldn't parse JSON response from Solr endpoint: %s" % r)
        return {}
    except AssertionError:
        logger.warning("Couldn't assert that only a single result was fetched: %s" % results)
        return {}
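A minimal usage sketch for the two ORA lookup helpers above; the identifier is made up and only illustrates the uuid-style ids the functions expect:

# Hypothetical identifiers, for illustration only
record = oralookup(uuid="428b9b33-0000-4000-8000-000000000000")
title = titlelookup(pid="uuid:428b9b33-0000-4000-8000-000000000000")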
def solr_delete(self):
    """Remove this document from the Solr index."""
    solr_conn = SolrConnection(settings.SOLR_URL, persistent=False)
    solr_conn.delete_query('id:%s' % self.id)
    solr_conn.commit()
def word_matches_for_page(page_id, words):
    """
    Gets a list of pre-analyzed words for a list of words on a
    particular page. So if you pass in 'manufacturer' you can get back
    a list like ['Manufacturer', 'manufacturers', 'MANUFACTURER'] etc.
    """
    solr = SolrConnection(settings.SOLR)

    # Make sure page_id is of type str, else the following string
    # operation may result in a UnicodeDecodeError. For example, see
    # ticket #493
    if not isinstance(page_id, str):
        page_id = str(page_id)

    ocr_list = ["ocr"]
    ocr_list.extend(["ocr_%s" % l for l in settings.SOLR_LANGUAGES])
    ocrs = " OR ".join([query_join(words, o) for o in ocr_list])
    q = "id:%s AND (%s)" % (page_id, ocrs)
    params = {
        "hl.snippets": 100,
        "hl.requireFieldMatch": "true",
        "hl.maxAnalyzedChars": "102400",
    }
    response = solr.query(q, fields=["id"], highlight=ocr_list, **params)

    if page_id not in response.highlighting:
        return []

    words = set()
    for ocr in ocr_list:
        if ocr in response.highlighting[page_id]:
            for context in response.highlighting[page_id][ocr]:
                words.update(find_words(context))
    return list(words)
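word_matches_for_page (and similar_pages further down) rely on a query_join helper that is not included in this collection. A minimal sketch of what it plausibly does, assuming it only needs to OR a list of terms into a single fielded Solr clause; the real helper may also quote phrases and escape special characters:

def query_join(values, field):
    # e.g. query_join(["dog", "cats"], "ocr") -> 'ocr:("dog" OR "cats")'
    return '%s:(%s)' % (field, ' OR '.join('"%s"' % v for v in values))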
def main():
    opts, args = parse_opts()
    logging.basicConfig(
        filename=opts.log_file,
        level=logging.DEBUG,
        format='[%(asctime)s|%(levelname)s|%(name)s|%(threadName)s|%(message)s]')
    solr = SolrConnection(opts.solr_uri)
    protocol = LineProtocol()
    for request in protocol.input():
        try:
            query = build_query(request)
            if query is None:
                protocol.output(query_failed(), True)
                continue
            log.debug("Running query: " + str(query))
            results = solr.search(**query)
            if results is None:
                protocol.output({'code': 400})
                continue
            resp = json.loads(results)
            ret = {'code': 200, 'json': resp['response']}
            protocol.output(ret, True)
        except Exception:
            log.exception("Uncaught exception")
    return 0
def solr_index(self):
    """Write this document out to the Solr index."""
    solr_conn = SolrConnection(settings.SOLR_URL, persistent=False)
    solr_conn.add(**self.solr_doc)
    solr_conn.commit()
def _get_count(self):
    "Returns the total number of objects, across all pages."
    if self._count is None:
        solr = SolrConnection(settings.SOLR)  # TODO: maybe keep connection around?
        solr_response = solr.query(self._q, fields=['id'])
        self._count = int(solr_response.results.numFound)
    return self._count
def index_evidence(evidence):
    evidence_medicine_list = []
    evidence_medicine = MedicineEvidenceSummary.objects.filter(evidence=evidence.id)
    for evimed in evidence_medicine:
        if evimed.medicine.name not in evidence_medicine_list:
            evidence_medicine_list.append(evimed.medicine.name)

    # Try to create a connection to a Solr server and send the evidence
    try:
        solr = SolrConnection(settings.SOLR_URL)
        solr.add(
            id="evidence-%s-%s" % (evidence.language, evidence.id),
            type="evidence",
            title=evidence.title,
            description=evidence.description,
            context=evidence.context,
            question=evidence.question,
            link=evidence.link,
            file=evidence.file,
            language=evidence.language,
            evidence_medicine=evidence_medicine_list,
        )
        solr.commit()
    except Exception:
        return False
    return True
def oralookup(pid=None, uuid=None,
              fields_to_return="f_name, f_subject, f_keyphrase, faculty, f_institution, thesis_type, content_type, collection",
              endpoint="http://ora.ouls.ox.ac.uk:8080/solr/select"):
    s = SolrConnection(endpoint)
    results = {}
    query = ""
    if pid:
        pid = r"\:".join(pid.split(":"))
        query = "id:%s" % pid
    elif uuid:
        query = r"id:uuid\:%s" % uuid
    else:
        return results

    # Run the actual query (up to 3 tries, with failover)
    r = None
    tries = 0
    while tries != 3:
        try:
            r = s.query(q=query, fields=fields_to_return)
            logger.debug("Solr response: %s" % r.header)
            tries = 3
        except BadStatusLine:
            sleep(0.5)
            tries += 1

    try:
        assert len(r.results) == 1
        return r.results[0]
    except AssertionError:
        logger.warning("Couldn't assert that only a single result was fetched: %s" % results)
        return {}
def index_title(title, solr=None):
    if solr is None:
        solr = SolrConnection(settings.SOLR)
    _log.info("indexing title: lccn=%s" % title.lccn)
    try:
        solr.add(**title.solr_doc)
    except Exception as e:
        _log.exception(e)
def _solr_connection():
    from solr import SolrConnection
    url = config['proactive_disclosure.solr_url']
    user = config.get('proactive_disclosure.solr_user')
    password = config.get('proactive_disclosure.solr_password')
    if user is not None and password is not None:
        return SolrConnection(url, http_user=user, http_pass=password)
    return SolrConnection(url)
def make_connection():
    from solr import SolrConnection
    if solr_user is not None and solr_password is not None:
        return SolrConnection(solr_url, http_user=solr_user, http_pass=solr_password)
    return SolrConnection(solr_url)
def _solr_connection():
    from solr import SolrConnection
    url = config['ati_summaries.solr_url']
    user = config.get('ati_summaries.solr_user')
    password = config.get('ati_summaries.solr_password')
    if user is not None and password is not None:
        return SolrConnection(url, http_user=user, http_pass=password)
    return SolrConnection(url)
def index_title(title, solr=None):
    if solr is None:
        solr = SolrConnection(settings.SOLR)
    LOGGER.info("indexing title: lccn=%s", title.lccn)
    try:
        solr.add(**title.solr_doc)
    except Exception as e:
        LOGGER.exception(e)
def handle(self, *args, **options):
    self.stdout.write("Optimizing Solr index %s" % settings.SOLR)
    solr = SolrConnection(settings.SOLR)
    start_time = default_timer()
    solr.optimize()
    elapsed = default_timer() - start_time
    self.stdout.write("Solr took %0.3f seconds to optimize %s" % (elapsed, settings.SOLR))
def __init__(self, process_ocr=True, process_coordinates=True):
    """Create a BatchLoader.

    The process_ocr parameter is used (mainly in testing) when we don't
    want to spend time actually extracting OCR text and indexing.
    """
    self.PROCESS_OCR = process_ocr
    self.solr = SolrConnection(settings.SOLR)
    self.PROCESS_COORDINATES = process_coordinates
def get_page_text(page):
    no_text = ["Text not available"]
    solr = SolrConnection(settings.SOLR)
    query = 'id:"%s"' % page.url
    solr_results = solr.query(query)
    results_attribute = getattr(solr_results, 'results', None)
    if isinstance(results_attribute, list) and len(results_attribute) > 0:
        return results_attribute[0].get('ocr', no_text)
    return no_text
def index_title(title, solr=None):
    if solr is None:
        solr = SolrConnection(settings.SOLR)
    LOGGER.debug("indexing title: lccn=%s", title.lccn)
    try:
        solr.add(**title.solr_doc)
    except Exception:
        LOGGER.exception("Unable to index title %s", title)
def make_connection():
    from solr import SolrConnection
    solr_url, solr_user, solr_password = SolrSettings.get()
    assert solr_url is not None
    if solr_user is not None and solr_password is not None:
        return SolrConnection(solr_url, http_user=solr_user, http_pass=solr_password)
    return SolrConnection(solr_url)
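A sketch of a typical call site for these connection factories; the query string and field list are illustrative, not taken from any of the snippets here:

conn = make_connection()
response = conn.query("type:page", fields=["id"], rows=10)
for doc in response.results:
    print(doc["id"])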
def similar_pages(page):
    solr = SolrConnection(settings.SOLR)
    d = page.issue.date_issued
    date = d.strftime("%Y%m%d")  # zero-padded, e.g. 19120305
    cities = [p.city for p in page.issue.title.places.all()]
    query = '+type:page AND date:%s AND %s AND NOT(lccn:%s)' % (
        date, query_join(cities, 'city'), page.issue.title.lccn)
    response = solr.query(query, rows=25)
    return [utils.get_page(**urlresolvers.resolve(r['id']).kwargs)
            for r in response.results]
class Command(BaseCommand):
    user_option = optparse.make_option(
        '--user', action='store', dest='user',
        help='name of user whose entries to purge')
    option_list = BaseCommand.option_list + (user_option,)
    help = "index all or user-specific entries in solr"
    args = 'an optional username'

    def handle(self, *args, **options):
        self.solr = SolrConnection(SOLR_URL)
        self.cursor = connection.cursor()
        if options['user']:
            print("indexing user")
            self.index_entries(user=options['user'])
        else:
            print('indexing everything')
            self.index_entries()
        print('committing')
        self.solr.commit()
        print('optimizing')
        self.solr.optimize()

    def index_entries(self, user=''):
        counter = 0
        entries = m.Entry.objects.all()
        if user:
            entries = entries.filter(user__username=user)
        docs = []
        print('entry count:', entries.count())
        SLICE_SIZE = MAX_DOCS_PER_ADD * COMMIT_FREQUENCY
        slices = [x for x in range(entries.count()) if x % SLICE_SIZE == 0]
        for s in slices:
            print('indexing %s to %s...' % (s, s + SLICE_SIZE))
            entry_slice = entries[s:s + SLICE_SIZE]
            for entry in entry_slice:
                counter += 1
                docs.append(entry.solr_doc)
                if len(docs) == MAX_DOCS_PER_ADD:
                    try:
                        self.solr.add_many(docs)
                    except Exception:
                        print('BAD RECORD:', [d['id'] for d in docs])
                    docs = []
                    reset_queries()
                if counter % (COMMIT_FREQUENCY * MAX_DOCS_PER_ADD) == 0:
                    print('committing at count:', counter)
                    self.solr.commit()
        # Don't miss the leftovers
        self.solr.add_many(docs)
def delete_title(title, solr=None):
    if not solr:
        solr = SolrConnection(settings.SOLR)
    if isinstance(title, models.Title):
        title_id = title.url
    else:
        title_id = title
    q = "+type:title +id:%s" % title_id
    solr.delete_query(q)
    LOGGER.info("deleted title %s from the index", title)
def _refresh(field=None, data=None, path=None, isCron=None):
    from solr import SolrConnection
    from ID3 import *

    s = SolrConnection(SOLR_URL)
    if path and path != '*':
        # called by user
        pathsArr = path.split(',')
    else:
        # called from cron
        pathsArr = folderpaths
    matches = []
    # Handles modify and add; deletion is handled in search, when a file
    # is in Solr but no longer on the path
    for path in pathsArr:
        for root, dirnames, filenames in os.walk(path):
            for extension in ['txt', 'log', 'py', 'pl', 'sql', 'mp3']:
                for filename in fnmatch.filter(filenames, '*.' + extension):
                    fullName = os.path.join(root, filename)
                    if os.path.getsize(fullName) > 8800000:
                        continue
                    # On cron runs, only reindex files modified in the last 24h
                    if not isCron or (time.time() - os.path.getmtime(fullName) < 24 * 60 * 60):
                        try:
                            if filename.endswith(('.txt', '.log', '.py', '.pl', '.sql')):
                                data = open(fullName, 'r').read()
                                data = filterTxt(data)
                            else:
                                # mp3: index the ID3 title/artist/album tags
                                data = ''
                                audiofile = ID3(fullName)
                                audiofilekeys = audiofile.keys()
                                if 'TITLE' in audiofilekeys:
                                    data = audiofile['TITLE'] + " "
                                if 'ARTIST' in audiofilekeys:
                                    data += audiofile['ARTIST'] + " "
                                if 'ALBUM' in audiofilekeys:
                                    data += audiofile['ALBUM'] + " "
                                data = data.strip()
                            fullName = filterTxt(fullName)
                            filename = filterTxt(filename)
                            s.add(id=fullName, name=filename, txt=data)
                            s.commit()
                        except Exception:
                            pass
    gc.collect()
def handle(self, **options):
    solr = SolrConnection(SOLR_URL)
    if options['user']:
        solr.delete_query('user:%s' % options['user'])
    else:
        solr.delete_query('id:[* TO *]')
    solr.commit()
def handle(self, **options):
    solr = SolrConnection(settings.SOLR)
    if options['batch']:
        solr.delete_query('batch: %s' % options['batch'])
    else:
        solr.delete_query('id:[* TO *]')
    solr.commit()
def __init__(self, channel_id):
    self.channel_id = channel_id

    # Build up a Solr query
    filters = []
    filters.append('type:request')
    filters.append('channel_id:%s' % channel_id)

    # Make the request to Solr
    solr = SolrConnection(settings.SOLR_URL)
    response = solr.select(
        q=' AND '.join(filters),
        rows=10,
        fields='datetime, id',
        sort='datetime',
        sort_order='asc')
    self.requests = response.results
def execute_solr_query(query, fields, sort, sort_order, rows, start):
    # The default arg_separator (underscore) won't work if the fields to
    # facet on themselves contain underscores
    solr = SolrConnection(settings.SOLR)  # TODO: maybe keep connection around?
    # NOTE: the fields argument is currently ignored in favour of this fixed list
    solr_response = solr.query(
        query,
        fields=['lccn', 'title', 'edition', 'place_of_publication',
                'start_year', 'end_year', 'language'],
        rows=rows,
        sort=sort,
        sort_order=sort_order,
        start=start)
    return solr_response
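An illustrative call to execute_solr_query, assuming a schema with the title fields the hard-coded list above implies:

# Fetch the first 25 title records, oldest first. The fields argument
# is currently ignored by execute_solr_query (see the note above).
response = execute_solr_query(query="type:title", fields=None,
                              sort="start_year", sort_order="asc",
                              rows=25, start=0)
for doc in response.results:
    print(doc["lccn"], doc["title"])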
def index_missing_pages():
    """Index all pages in the database that are missing from Solr."""
    solr = SolrConnection(settings.SOLR)
    count = 0
    pages = models.Page.objects.filter(indexed=False).all()
    number_of_pages = len(pages)
    for page in pages:
        LOGGER.info("[%s of %s] indexing page: %s", count, number_of_pages, page.url)
        solr.add(**page.solr_doc)
        count += 1
        page.indexed = True
        page.save()
    solr.commit()
def page(self, number):
    """
    Override the page method in Paginator since Solr has already
    paginated stuff for us.
    """
    number = self.validate_number(number)

    # figure out the solr query and execute it
    solr = SolrConnection(settings.SOLR)  # TODO: maybe keep connection around?
    start = self.per_page * (number - 1)
    params = {
        "hl.snippets": 100,  # TODO: make this unlimited
        "hl.requireFieldMatch": 'true',  # limits highlighting slop
        "hl.maxAnalyzedChars": '102400',  # increased from default 51200
    }
    sort_field, sort_order = _get_sort(self.query.get('sort'), in_pages=True)
    solr_response = solr.query(
        self._q,
        fields=['id', 'title', 'date', 'sequence', 'edition_label', 'section_label'],
        highlight=self._ocr_list,
        rows=self.per_page,
        sort=sort_field,
        sort_order=sort_order,
        start=start,
        **params)

    pages = []
    for result in solr_response.results:
        page = models.Page.lookup(result['id'])
        if not page:
            continue
        words = set()
        coords = solr_response.highlighting[result['id']]
        for ocr in self._ocr_list:
            for s in coords.get(ocr) or []:
                words.update(find_words(s))
        page.words = sorted(words, key=lambda v: v.lower())
        page.highlight_url = self.highlight_url(page.url, page.words,
                                                number, len(pages))
        pages.append(page)
    return Page(pages, number, self)
def finished(self, **kwargs):
    source_id = kwargs['source_id']
    track_id = kwargs['track_id']

    # Build up a Solr query
    filters = []
    filters.append('type:request')
    filters.append('channel_id:%s' % self.channel_id)
    filters.append('request_source_id:%s' % source_id)
    filters.append('request_track_id:%s' % track_id)

    # Make the request to Solr
    solr = SolrConnection(settings.SOLR_URL)
    solr.delete_query(' AND '.join(filters))
    solr.commit()
def index_titles(since=None):
    """Index all the titles and holdings that are modeled in the database.

    If you pass in a datetime object as the since parameter, only title
    records that have been created since that time will be indexed.
    """
    cursor = connection.cursor()
    solr = SolrConnection(settings.SOLR)
    if since:
        cursor.execute("SELECT lccn FROM core_title WHERE created >= '%s'" % since)
    else:
        solr.delete_query('type:title')
        cursor.execute("SELECT lccn FROM core_title")
    count = 0
    while True:
        row = cursor.fetchone()
        if row is None:
            break
        title = models.Title.objects.get(lccn=row[0])
        index_title(title, solr)
        count += 1
        if count % 100 == 0:
            LOGGER.info("indexed %s titles", count)
            reset_queries()
            solr.commit()
    solr.commit()
def build_index(**kwargs):
    """
    Gets product/sku information from the cps DB and indexes it in Solr.
    The existing Solr index is wiped before indexing. Revisit if this
    strategy does not work.
    """
    # index status log message granularity
    log_index_status_chunks = 25000

    solr = SolrConnection(settings.SOLR)
    clear_index(solr=solr)
    count = 0
    fieldnames = ("name", "id", "description", "long_description", "age",
                  "gender", "brand", "str_brand", "merchant", "str_merchant",
                  "category", "str_category", "price", "sale_price",
                  "buy_url", "image")
    start = datetime.now()
    log.info("Reading product info from the database.....")
    products = db.get_cps_data()
    log.info("Building SOLR index.....")
    for product in products:
        try:
            product_record = dict(zip(fieldnames, product))
            solr.add(**product_record)
            count += 1
        except Exception as e:
            log.exception(e)
            continue
        if count % log_index_status_chunks == 0:
            log.info("Indexed %d products in %s" % (count, datetime.now() - start))
def __init__(self, query):
    self.query = query.copy()

    # figure out the solr query
    q = title_search(self.query)

    try:
        page = int(self.query.get('page'))
    except (TypeError, ValueError):
        page = 1

    try:
        rows = int(self.query.get('rows'))
    except (TypeError, ValueError):
        rows = 50

    start = rows * (page - 1)

    # determine sort order
    sort_field, sort_order = _get_sort(self.query.get('sort'))

    # execute query
    solr = SolrConnection(settings.SOLR)  # TODO: maybe keep connection around?
    solr_response = solr.query(
        q,
        fields=['lccn', 'title', 'edition', 'place_of_publication',
                'start_year', 'end_year', 'language'],
        rows=rows,
        sort=sort_field,
        sort_order=sort_order,
        start=start)

    # convert the solr documents to Title models
    # could use solr doc instead of going to db, if performance requires it
    lccns = [d['lccn'] for d in solr_response.results]
    results = []
    for lccn in lccns:
        try:
            title = models.Title.objects.get(lccn=lccn)
            results.append(title)
        except models.Title.DoesNotExist:
            pass  # TODO: log exception
def handle(self, batch_name=None, *args, **options):
    if len(args) != 0:
        raise CommandError('Usage is purge_batch %s' % self.args)

    loader = BatchLoader()
    try:
        log.info("purging batch '%s'", batch_name)
        loader.purge_batch(batch_name)
        if options['optimize']:
            log.info("optimizing solr")
            solr = SolrConnection(settings.SOLR)
            solr.optimize()
            log.info("optimizing MySQL OCR table")
            cursor = connection.cursor()
            cursor.execute("OPTIMIZE TABLE core_ocr")
            log.info("finished optimizing")
    except BatchLoaderException as e:
        log.exception(e)
        raise CommandError("unable to purge batch. check the purge_batch log for clues")
def handle(self, batch_location=None, *args, **options):
    if len(args) != 0:
        raise CommandError('Usage is purge_batch %s' % self.args)

    loader = BatchLoader()
    try:
        log.info("purging batch %s", batch_location)
        loader.purge_batch(batch_location)
        if options['optimize']:
            log.info("optimizing solr")
            solr = SolrConnection(settings.SOLR)
            solr.optimize()
            log.info("optimizing MySQL OCR table")
            cursor = connection.cursor()
            cursor.execute("OPTIMIZE TABLE core_ocr")
            log.info("finished optimizing")
    except BatchLoaderException as e:
        log.exception(e)
        raise CommandError("unable to purge batch. check the purge_batch log for clues")
def __init__(self):
    # Build up a Solr query
    filters = []
    filters.append("type:channel")

    # Make the request to Solr
    solr = SolrConnection(settings.SOLR_URL)
    response = solr.select(
        q=" AND ".join(filters),
        rows=10,
        fields="datetime, channel_id",
        sort="channel_id",
        sort_order="asc")

    # Restore the persisted channels
    for doc in response.results:
        channel_id = doc["channel_id"]
        # Create the channel in the URL hierarchy
        self.__dict__[channel_id] = ChannelResource.Channel(channel_id)
def solr_connection(ini_prefix):
    """
    Set up a Solr connection.

    :param ini_prefix: prefix to use in specifying .ini file keys (e.g.,
        ati_summaries to use the config setting ati_summaries.solr_url etc.)
    :ptype ini_prefix: str

    :return: a Solr connection from the configured URL, user, and password settings
    :rtype: object
    """
    from solr import SolrConnection
    url = config.get('{0:s}.solr_url'.format(ini_prefix))
    user = config.get('{0:s}.solr_user'.format(ini_prefix))
    password = config.get('{0:s}.solr_password'.format(ini_prefix))
    if url is None:
        raise KeyError('{0:s}.solr_url'.format(ini_prefix))
    if user is not None and password is not None:
        return SolrConnection(url, http_user=user, http_pass=password)
    return SolrConnection(url)
def get_connection():
    """Returns the global Solr connection, or creates one, as required."""
    global _solr
    if _solr:
        return _solr
    _solr = SolrConnection(url, http_user=http_user, http_pass=http_pass)
    return _solr
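A sketch of how the cached connection above might be used, assuming url, http_user, and http_pass are module-level settings loaded at start-up; the document id and field are illustrative:

solr = get_connection()
solr.add(id="doc-1", title="An example document")  # hypothetical fields
solr.commit()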
class MLGBsolr:
    def __init__(self):
        self.s_result = ()
        self.conn = None
        self.connstatus = False
        self.req = None

    def solrconn(self, core='books'):
        try:
            solr_base_and_core = "%s/%s" % (solr_base, core)
            self.conn = SolrConnection(
                host=solr_host, solrBase=solr_base_and_core,
                username=solr_uname, password=solr_pswd)
            self.connstatus = True
            return True
        except Exception:
            # solr connection error
            return False

    def solrquery(self, para, core='books'):
        if self.solrconn(core):
            try:
                self.search = self.conn.search(para)
                self.connstatus = True
                return True
            except Exception:
                # solr query failed
                return False

    def solrresults(self, para, Facet=False, core='books'):
        if self.solrquery(para, core):
            rsp = simplejson.loads(self.search)
            s_numFound = rsp['response'].get('numFound', None)
            s_docs = rsp['response'].get('docs', None)
            s_params = rsp['responseHeader'].get('params', None)
            s_rows = s_params.get('rows', None)
            s_start = s_params.get('start', None)
            s_q = s_params.get('q', None)
            if Facet:
                s_facet_fields = rsp['facet_counts'].get('facet_fields', None)
            self.s_result = {
                'numFound': s_numFound,
                'search_params': s_params,
                'query': s_q,
                'start': s_start,
                'rows': s_rows,
                'docs': s_docs,
            }
            if Facet:
                self.s_result['facet'] = s_facet_fields

    def __unicode__(self):
        return self.s_result
def index(self):
    # Build up a Solr query
    filters = []
    filters.append('type:request')
    filters.append('channel_id:%s' % self.channel_id)

    # Make the request to Solr, faceting on the requested track ids
    solr = SolrConnection(settings.SOLR_URL)
    response = solr.select(
        q=' AND '.join(filters),
        rows=0,
        fields='',
        facet='true',
        facet_field='request_track_id',
        facet_mincount=1,
        facet_limit=10,
        facet_sort='count')

    results = []
    for track_id in response.facet_counts['facet_fields']['request_track_id'].keys():
        source_id = 'grooveshark'

        # Build up a Solr query for the track itself
        filters = []
        filters.append('type:track')
        filters.append('request_source_id:%s' % source_id)
        filters.append('request_track_id:%s' % track_id)

        # Make the request to Solr
        track_response = solr.select(
            q=' AND '.join(filters),
            fields='track_artist, track_album, track_title')
        if len(track_response.results) == 1:
            results.append({
                'track_artist': track_response.results[0]['track_artist'],
                'track_album': track_response.results[0]['track_album'],
                'track_title': track_response.results[0]['track_title'],
                'request_source_id': source_id,
                'request_track_id': track_id,
                'votes': response.facet_counts['facet_fields']['request_track_id'][track_id],
            })

    results = sorted(results, key=lambda result: -result['votes'])

    cherrypy.response.headers['Content-Type'] = 'application/json; charset=utf-8'
    return json.dumps({'playlist': results}, ensure_ascii=False, indent=4).encode('utf-8')
def index_titles(since=None):
    """Index all the titles and holdings that are modeled in the database.

    If you pass in a datetime object as the since parameter, only title
    records that have been created since that time will be indexed.
    """
    solr = SolrConnection(settings.SOLR)
    titles = models.Title.objects.all()
    if since:
        titles = titles.filter(created__gte=since)
    titles = titles.prefetch_related(
        "languages", "alt_titles", "subjects", "notes", "places",
        "urls", "essays", "country", "holdings")

    count = 0
    for chunk in sliced(titles, 500):
        docs = []
        for title in chunk:
            try:
                docs.append(title.solr_doc)
            except Exception:
                LOGGER.exception("Unable to index title %s", title)
        solr.add_many(docs)
        reset_queries()
        solr.commit()
        count += len(chunk)
        LOGGER.info("indexed %d titles", count)

    # Remove titles from the index that no longer exist in the database
    lccns = set(models.Title.objects.values_list("lccn", flat=True))
    for result in solr.query("+type:title", fields=["id", "lccn"]):
        stale_id = result["id"]
        lccn = result["lccn"]
        if lccn not in lccns:
            LOGGER.warning("Removing stale title %s from the search index", stale_id)
            delete_title(stale_id, solr=solr)
    solr.commit()
def test_index_pages(self):
    solr = SolrConnection(settings.SOLR)
    solr.delete_query('type:page')
    solr.commit()
    self.assertEqual(si.page_count(), 0)
    si.index_pages()
    self.assertEqual(si.page_count(), 2)