def solr_delete(self):
    """Remove this object's document from the solr index."""
    conn = SolrConnection(settings.SOLR_URL, persistent=False)
    conn.delete_query('id:%s' % self.id)
    conn.commit()
def handle(self, **options):
    """Purge a single user's entries from solr, or the whole index."""
    conn = SolrConnection(SOLR_URL)
    user = options['user']
    if user:
        conn.delete_query('user:%s' % user)
    else:
        # no user given: wipe every document
        conn.delete_query('id:[* TO *]')
    conn.commit()
def handle(self, **options):
    """Delete one batch's documents from solr, or wipe the whole index."""
    conn = SolrConnection(settings.SOLR)
    batch = options['batch']
    if batch:
        conn.delete_query('batch: %s' % batch)
    else:
        # no batch given: remove every document
        conn.delete_query('id:[* TO *]')
    conn.commit()
def handle(self, **options):
    """Purge solr: one batch's docs when --batch is given, otherwise everything."""
    solr = SolrConnection(settings.SOLR)
    query = 'batch: %s' % options['batch'] if options['batch'] else 'id:[* TO *]'
    solr.delete_query(query)
    solr.commit()
def test_index_pages(self):
    """After purging, the page index is empty; indexing adds the 2 fixture pages."""
    conn = SolrConnection(settings.SOLR)
    conn.delete_query('type:page')
    conn.commit()
    self.assertEqual(si.page_count(), 0)
    si.index_pages()
    self.assertEqual(si.page_count(), 2)
def handle(self, **options):
    """Remove a user's documents (or all documents) from the solr index."""
    conn = SolrConnection(SOLR_URL)
    target = options["user"]
    query = "user:%s" % target if target else "id:[* TO *]"
    conn.delete_query(query)
    conn.commit()
def solr_reindex(self):
    """Reindex all of this user's entries.

    Used when switching to/from "private" status: the user's existing
    documents are deleted from solr and then re-added in batches.
    Batch-add failures are logged instead of being silently swallowed.
    """
    import logging
    log = logging.getLogger(__name__)
    solr_conn = SolrConnection(settings.SOLR_URL)
    # Start by deleting 'em all
    solr_conn.delete_query('user:%s' % self.user.id)
    entries = Entry.objects.filter(user=self.user)
    # Batch size for add_many calls (arbitrary constant).
    SLICE_SIZE = 50
    docs = []
    for start in range(0, entries.count(), SLICE_SIZE):
        for entry in entries[start:start + SLICE_SIZE]:
            docs.append(entry.solr_doc)
            if len(docs) == SLICE_SIZE:
                try:
                    solr_conn.add_many(docs)
                except Exception:
                    # was a bare except/pass; at least record what failed
                    log.exception("solr add_many failed during reindex for user %s",
                                  self.user.id)
                docs = []
    # Don't miss the leftovers (skip the call entirely when there are none)
    if docs:
        solr_conn.add_many(docs)
    solr_conn.commit()
    solr_conn.optimize()
def solr_index(self):
    """Push this object's solr document to the index."""
    conn = SolrConnection(settings.SOLR_URL, persistent=False)
    conn.add(**self.solr_doc)
    conn.commit()
def index_titles(since=None):
    """index all the titles and holdings that are modeled in the database

    if you pass in a datetime object as the since parameter only title
    records that have been created since that time will be indexed.
    """
    cursor = connection.cursor()
    solr = SolrConnection(settings.SOLR)
    if since:
        # parameterized query: let the DB driver quote the datetime value
        # instead of interpolating it into the SQL string
        cursor.execute("SELECT lccn FROM core_title WHERE created >= %s", [since])
    else:
        solr.delete_query('type:title')
        cursor.execute("SELECT lccn FROM core_title")
    count = 0
    while True:
        row = cursor.fetchone()
        if row is None:
            break
        title = models.Title.objects.get(lccn=row[0])
        index_title(title, solr)
        count += 1
        if count % 100 == 0:
            _log.info("indexed %s titles" % count)
            reset_queries()
    # single commit (the original committed twice in a row)
    solr.commit()
def index_titles(since=None):
    """index all the titles and holdings that are modeled in the database

    if you pass in a datetime object as the since parameter only title
    records that have been created since that time will be indexed.
    """
    cursor = connection.cursor()
    solr = SolrConnection(settings.SOLR)
    if since:
        # parameterized query: let the DB driver quote the datetime value
        # instead of interpolating it into the SQL string
        cursor.execute("SELECT lccn FROM core_title WHERE created >= %s", [since])
    else:
        solr.delete_query('type:title')
        cursor.execute("SELECT lccn FROM core_title")
    count = 0
    while True:
        row = cursor.fetchone()
        if row is None:
            break
        title = models.Title.objects.get(lccn=row[0])
        index_title(title, solr)
        count += 1
        if count % 100 == 0:
            LOGGER.info("indexed %s titles", count)
            reset_queries()
    # single commit (the original committed twice in a row)
    solr.commit()
class Command(BaseCommand):
    # Django management command: (re)index blog entries in solr, either for a
    # single user (--user) or for everyone.
    user_option = optparse.make_option(
        '--user',
        action='store',
        dest='user',
        help='name of user whose entries to purge')
    option_list = BaseCommand.option_list + (user_option, )
    help = "index all or user-specific entries in solr"
    args = 'an optional username'

    def handle(self, *args, **options):
        # Shared solr connection and DB cursor used by index_entries().
        self.solr = SolrConnection(SOLR_URL)
        self.cursor = connection.cursor()
        if options['user']:
            print "indexing user"
            self.index_entries(user=options['user'])
        else:
            print 'indexing everything'
            self.index_entries()
        # Final commit + optimize after all adds have been sent.
        print 'committing'
        self.solr.commit()
        print 'optimizing'
        self.solr.optimize()

    def index_entries(self, user=''):
        # Index entries in slices to bound memory use, committing every
        # COMMIT_FREQUENCY batches of MAX_DOCS_PER_ADD documents.
        counter = 0
        entries = m.Entry.objects.all()
        if user:
            entries = entries.filter(user__username=user)
        docs = []
        print 'entry count:', entries.count()
        SLICE_SIZE = MAX_DOCS_PER_ADD * COMMIT_FREQUENCY
        # Slice start offsets: 0, SLICE_SIZE, 2*SLICE_SIZE, ...
        slices = [x for x in range(entries.count()) \
                  if x % SLICE_SIZE == 0]
        for s in slices:
            print 'indexing %s to %s...' % (s, s + SLICE_SIZE)
            entry_slice = entries[s:s + SLICE_SIZE]
            for entry in entry_slice:
                counter += 1
                docs.append(entry.solr_doc)
                if len(docs) == MAX_DOCS_PER_ADD:
                    try:
                        self.solr.add_many(docs)
                    except:
                        # NOTE(review): bare except hides the real error;
                        # only the ids of the failing batch are reported.
                        print 'BAD RECORD:', [d['id'] for d in docs]
                    del (docs)
                    docs = []
                    reset_queries()
                    if counter % (COMMIT_FREQUENCY * MAX_DOCS_PER_ADD) == 0:
                        print 'committing at count:', counter
                        self.solr.commit()
        # Don't miss the leftovers
        self.solr.add_many(docs)
class Command(BaseCommand):
    # Management command that (re)indexes entries in solr; --user restricts
    # the run to a single user's entries.
    user_option = optparse.make_option('--user',
        action='store',
        dest='user',
        help='name of user whose entries to purge')
    option_list = BaseCommand.option_list + (user_option,)
    help = "index all or user-specific entries in solr"
    args = 'an optional username'

    def handle(self, *args, **options):
        # Shared solr connection and DB cursor used by index_entries().
        self.solr = SolrConnection(SOLR_URL)
        self.cursor = connection.cursor()
        if options['user']:
            print "indexing user"
            self.index_entries(user=options['user'])
        else:
            print 'indexing everything'
            self.index_entries()
        # Final commit + optimize after all adds have been sent.
        print 'committing'
        self.solr.commit()
        print 'optimizing'
        self.solr.optimize()

    def index_entries(self, user=''):
        # Index entries in slices to bound memory use, committing every
        # COMMIT_FREQUENCY batches of MAX_DOCS_PER_ADD documents.
        counter = 0
        entries = m.Entry.objects.all()
        if user:
            entries = entries.filter(user__username=user)
        docs = []
        print 'entry count:', entries.count()
        SLICE_SIZE = MAX_DOCS_PER_ADD * COMMIT_FREQUENCY
        # Slice start offsets: 0, SLICE_SIZE, 2*SLICE_SIZE, ...
        slices = [x for x in range(entries.count()) \
                  if x % SLICE_SIZE == 0]
        for s in slices:
            print 'indexing %s to %s...' % (s, s+SLICE_SIZE)
            entry_slice = entries[s:s+SLICE_SIZE]
            for entry in entry_slice:
                counter += 1
                docs.append(entry.solr_doc)
                if len(docs) == MAX_DOCS_PER_ADD:
                    try:
                        self.solr.add_many(docs)
                    except:
                        # NOTE(review): bare except hides the real error;
                        # only the ids of the failing batch are reported.
                        print 'BAD RECORD:', [d['id'] for d in docs]
                    del(docs)
                    docs = []
                    reset_queries()
                    if counter % (COMMIT_FREQUENCY * MAX_DOCS_PER_ADD) == 0:
                        print 'committing at count:', counter
                        self.solr.commit()
        # Don't miss the leftovers
        self.solr.add_many(docs)
def _refresh(field=None, data=None, path = None, isCron = None):
    # Walk the configured folders and (re)index matching files into solr.
    # When called from cron (isCron set), only files modified in the last
    # 24 hours are re-indexed.
    from solr import SolrConnection
    from ID3 import *
    s = SolrConnection(SOLR_URL)
    if path and path != '*': #called by user
        pathsArr = path.split(',')
    else: #called from cron
        pathsArr = folderpaths
    # NOTE(review): 'matches' is never used after this point.
    matches = []
    #handles modify, add
    #deletion will be handled in search when file in solr but not in path
    # NOTE(review): this bare time.time() call has no effect.
    time.time()
    for path in pathsArr:
        for root, dirnames, filenames in os.walk(path):
            for extension in ['txt', 'log', 'py', 'pl', 'sql', 'mp3']:
                for filename in fnmatch.filter(filenames, '*.' + extension):
                    fullName = os.path.join(root, filename)
                    # skip very large files (~8.8 MB cap)
                    if os.path.getsize(fullName) > 8800000:
                        continue
                    #print fullName
                    if not isCron or (time.time() - os.path.getmtime(fullName) < 24*60*60):
                        try:
                            #data = open(fullName, 'r').read().decode('raw_unicode_escape').replace('\n',' ').replace('\t',' ')
                            if filename.endswith(('.txt', '.log', '.py', '.pl', '.sql')):
                                # plain-text files: index the file contents
                                data = open(fullName, 'r').read()
                                data = filterTxt(data)
                            else:
                                # mp3: index whatever ID3 tags are present
                                audiofile = ID3(fullName)
                                audiofilekeys = audiofile.keys()
                                if 'TITLE' in audiofilekeys:
                                    data = audiofile['TITLE'] + " "
                                if 'ARTIST' in audiofilekeys:
                                    data += audiofile['ARTIST'] + " "
                                if 'ALBUM' in audiofilekeys:
                                    data += audiofile['ALBUM'] + " "
                            if not data:
                                data = ''
                            data = data.strip()
                            fullName = filterTxt(fullName)
                            filename = filterTxt(filename)
                            s.add(id = fullName, name = filename, txt = data)
                            # NOTE(review): committing per file is expensive;
                            # a single commit after the walk would suffice.
                            s.commit()
                        except:
                            # NOTE(review): bare except swallows all errors
                            # (debug prints left commented out below).
                            pass
                            #print data
                            #print traceback.format_exc()
                            #print fullName
                            #sys.exit()
    gc.collect()
def search(self, **kwargs):
    """Search tinysong for tracks matching ``q``, register the hits in solr,
    and return the matches as a JSON byte string.
    """
    query = kwargs['q']
    # NOTE(review): hard-coded third-party API key; should live in settings.
    api_key = "aac5b38a36513510000ef3286494fc6d"
    url = urllib2.urlopen("http://tinysong.com/s/%s/?format=json&key=%s" % (urllib2.quote(query), api_key))
    response = json.loads(url.read())
    # TODO: Remove redundancy between results and tracks?
    results = []
    tracks = []
    for song in response:
        source_id = 'grooveshark'
        # client-facing result entry
        result = {
            'artist': song['ArtistName'],
            'album': song['AlbumName'],
            'title': song['SongName'],
            'sources': [
                {
                    'sourceid': source_id,
                    'trackid': '%s' % song['SongID']
                }
            ]
        }
        results.append(result)
        # solr document for the same song
        track = {
            'id': 'track_%s_%s' % (source_id, song['SongID']),
            'type': 'track',
            'track_title': song['SongName'],
            'track_artist': song['ArtistName'],
            'track_album': song['AlbumName'],
            'request_source_id': source_id,
            'request_track_id': song['SongID'],
        }
        tracks.append(track)
    # Register the songs in the search engine
    solr = SolrConnection(settings.SOLR_URL)
    solr.add_many(tracks)
    solr.commit()
    solr.close()
    cherrypy.response.headers['Content-Type'] = 'application/json; charset=utf-8'
    return json.dumps(results, ensure_ascii=False, indent=4).encode('utf-8')
def index_missing_pages():
    """
    index all pages that are missing from solr in the database
    """
    solr = SolrConnection(settings.SOLR)
    count = 0
    pages = models.Page.objects.filter(indexed=False).all()
    number_of_pages = len(pages)
    for page in pages:
        # increment before logging so progress reads 1..N, not 0..N-1
        count += 1
        LOGGER.info("[%s of %s] indexing page: %s", count, number_of_pages, page.url)
        solr.add(**page.solr_doc)
        page.indexed = True
        page.save()
    solr.commit()
def finished(self, **kwargs):
    """Remove the finished request for (source_id, track_id) from this channel."""
    query_parts = [
        'type:request',
        'channel_id:%s' % self.channel_id,
        'request_source_id:%s' % kwargs['source_id'],
        'request_track_id:%s' % kwargs['track_id'],
    ]
    # Make the request to Solr
    conn = SolrConnection(settings.SOLR_URL)
    conn.delete_query(' AND '.join(query_parts))
    conn.commit()
def index_evidence(evidence):
    """Index an evidence record (plus its related medicine names) in solr.

    Returns True on success, False if anything goes wrong while talking
    to the solr server.
    """
    evidence_medicine_list = []
    for evimed in MedicineEvidenceSummary.objects.filter(evidence=evidence.id):
        # de-duplicate medicine names while preserving order
        if evimed.medicine.name not in evidence_medicine_list:
            evidence_medicine_list.append(evimed.medicine.name)
    # try to create a connection to a solr server and send the evidence doc
    try:
        solr = SolrConnection(settings.SOLR_URL)
        solr.add(
            id="evidence-%s-%s" % (evidence.language, evidence.id),
            type="evidence",
            title=evidence.title,
            description=evidence.description,
            context=evidence.context,
            question=evidence.question,
            link=evidence.link,
            file=evidence.file,
            language=evidence.language,
            evidence_medicine=evidence_medicine_list,
        )
        solr.commit()
    except Exception:
        # best-effort: report failure to the caller rather than raising
        return False
    return True
def index_missing_pages():
    """
    index all pages that are missing from solr in the database
    """
    conn = SolrConnection(settings.SOLR)
    unindexed = models.Page.objects.filter(indexed=False).all()
    total = len(unindexed)
    done = 0
    for page in unindexed:
        LOGGER.info("[%s of %s] indexing page: %s", done, total, page.url)
        conn.add(**page.solr_doc)
        done += 1
        page.indexed = True
        page.save()
    conn.commit()
def create(self, **kwargs):
    """Register a new channel in the search engine and the URL hierarchy."""
    # Collect the channel details
    name = kwargs["name"]
    pos = kwargs["pos"]
    # Create the channel document in the search engine
    channel_doc = {
        "id": "channel_%s" % (name,),
        "type": "channel",
        "channel_id": name,
        "channel_location": pos,
    }
    conn = SolrConnection(settings.SOLR_URL)
    conn.add_many([channel_doc])
    conn.commit()
    conn.close()
    # Create the channel in the URL hierarchy
    self.__dict__[name] = ChannelResource.Channel(name)
def index_titles(since=None):
    """index all the titles and holdings that are modeled in the database

    if you pass in a datetime object as the since parameter only title
    records that have been created since that time will be indexed.
    """
    solr = SolrConnection(settings.SOLR)
    titles = models.Title.objects.all()
    if since:
        titles = titles.filter(created__gte=since)
    titles = titles.prefetch_related(
        "languages", "alt_titles", "subjects", "notes",
        "places", "urls", "essays", "country", "holdings")
    indexed = 0
    # index in chunks of 500 to keep memory bounded
    for chunk in sliced(titles, 500):
        chunk_docs = []
        for title in chunk:
            try:
                chunk_docs.append(title.solr_doc)
            except Exception:
                LOGGER.exception("Unable to index title %s", title)
        solr.add_many(chunk_docs)
        reset_queries()
        solr.commit()
        indexed += len(chunk)
        LOGGER.info("indexed %d titles", indexed)
    # purge index entries whose lccn no longer exists in the database
    known_lccns = set(models.Title.objects.values_list("lccn", flat=True))
    for result in solr.query("+type:title", fields=["id", "lccn"]):
        if result["lccn"] not in known_lccns:
            LOGGER.warning("Removing stale title %s from the search index", result["id"])
            delete_title(result["id"], solr=solr)
    solr.commit()
def index_pages():
    """index all the pages that are modeled in the database
    """
    solr = SolrConnection(settings.SOLR)
    solr.delete_query('type:page')
    cursor = connection.cursor()
    cursor.execute("SELECT id FROM core_page")
    count = 0
    # iterate rows via the fetchone sentinel form until exhausted
    for row in iter(cursor.fetchone, None):
        page = models.Page.objects.get(id=row[0])
        LOGGER.info("[%s] indexing page: %s", count, page.url)
        solr.add(**page.solr_doc)
        count += 1
        if count % 100 == 0:
            reset_queries()
    solr.commit()
def index_pages():
    """index all the pages that are modeled in the database
    """
    solr = SolrConnection(settings.SOLR)
    solr.delete_query('type:page')
    cursor = connection.cursor()
    cursor.execute("SELECT id FROM core_page")
    indexed = 0
    row = cursor.fetchone()
    while row is not None:
        page = models.Page.objects.get(id=row[0])
        LOGGER.info("[%s] indexing page: %s", indexed, page.url)
        solr.add(**page.solr_doc)
        indexed += 1
        # drop Django's accumulated query log every 100 pages
        if indexed % 100 == 0:
            reset_queries()
        row = cursor.fetchone()
    solr.commit()
def setUpClass(cls):
    """Load the MEI fixture into Solr once for this test class."""
    # First, add a folio to Solr so that the image_uri can be retrieved during the MEI conversion
    # Using curl here because it turned out to be easier than solrconn.add and gives better error messages
    os.system("curl {0}/update/?commit=true -H 'Content-Type: text/xml' -d '<add><doc>\
        <field name=\"id\">testid</field>\
        <field name=\"type\">cantusdata_folio</field>\
        <field name=\"manuscript_id\">{1}</field>\
        <field name=\"number\">{2}</field>\
        <field name=\"image_uri\">{3}</field>\
        </doc></add>'".format(settings.SOLR_SERVER, MEI_FIXTURE_ID, MEI_FIXTURE_FOLIO, MEI_FIXTURE_URI))
    docs = list(MEIConverter.process_file(MEI_FIXTURE, MEI_FIXTURE_SIGLUM, MEI_FIXTURE_ID))
    # Sanity check: the fixture must not already be present in the index
    solrconn = SolrConnection(settings.SOLR_SERVER)
    prequery = solrconn.query('type:cantusdata_music_notation AND manuscript:' + MEI_FIXTURE_SIGLUM)
    assert prequery.numFound == 0, 'MEI was already in the database when loading the test fixture'
    solrconn.add_many(docs)
    solrconn.commit()
def index_pages():
    """index all the pages that are modeled in the database
    """
    _log = logging.getLogger(__name__)
    solr = SolrConnection(settings.SOLR)
    cursor = connection.cursor()
    cursor.execute(
        "SELECT id FROM core_page WHERE ocr_filename IS NOT NULL AND ocr_filename <> ''"
    )
    count = 0
    while True:
        row = cursor.fetchone()
        # identity comparison (PEP 8) instead of `row == None`
        if row is None:
            break
        page = models.Page.objects.get(id=row[0])
        # lazy %-args: message is only built when INFO is enabled
        _log.info("[%s] indexing page: %s", count, page.url)
        solr.add(**page.solr_doc)
        count += 1
        if count % 100 == 0:
            reset_queries()
    solr.commit()
def _send_update(self, *args, **kwargs):
    """Send an update request to Solr. Solr commits are made only on deletion.

    Takes a single argument: the AMQP message that was received.
    """
    try:
        log.info("Processing update request")
        msg = args[0]
        updates = json.loads(msg.body)
        solr = SolrConnection(self.solr_uri)
        if updates["type"] == "updated":
            # build an <add><doc>...</doc></add> payload from the update data
            add = ET.Element("add")
            for update in updates["data"]:
                doc = ET.SubElement(add, "doc")
                for fields in update:
                    # There should only be one pair
                    # FIXME: move to a dictionary structure
                    for k, v in fields.items():
                        SolrUpdater.xml_field(doc, solr.escapeKey(k), solr.escapeVal(v))
            log.debug("Sending update to Solr: " + ET.tostring(add))
            solr.doUpdateXML(ET.tostring(add))
        elif updates["type"] == "deleted":
            # NOTE(review): `id` shadows the builtin of the same name
            for id in updates["data"]:
                log.debug("Deleting document with id '%s'" % id)
                solr.delete(id)
            solr.commit()
        elif updates["type"] == "deleted_db":
            # an entire database was removed; purge all its documents
            db_name = updates["data"]
            log.info("Deleting indexes for database '%s'" % db_name)
            solr.deleteByQuery("_db:%s" % db_name)
            solr.commit()
        else:
            log.warning("Unrecognized update type: '%s'" % updates["type"])
    except Exception:
        # top-level consumer boundary: log and keep the consumer alive
        log.exception("Unexpected exception")
def tearDownClass(cls):
    """Remove the music-notation and folio fixture documents from Solr."""
    conn = SolrConnection(settings.SOLR_SERVER)
    notation_query = 'type:cantusdata_music_notation AND manuscript:' + MEI_FIXTURE_SIGLUM
    folio_query = 'type:cantusdata_folio AND manuscript_id:{0}'.format(MEI_FIXTURE_ID)
    conn.delete_query(notation_query)
    conn.delete_query(folio_query)
    conn.commit()
# Compose document data to store in Solr. documents = [] for path, fname in txts: log.msg(fname, "->", path) url = site + path with codecs.open(fname, 'rb', encoding) as fp: title, content = parse_document(fp) doc = { 'title': title, 'content': content, #'last_modified': datetime.fromtimestamp(os.path.getmtime(fname)), 'last_modified': datetime.now().replace(tzinfo=utc), 'site': site, 'url': url, 'id': hashlib.sha1(url).hexdigest() } documents.append(doc) u = options['username'] p = options['password'] if u and p: s = SolrConnection(server, http_user=u, http_pass=p) else: s = SolrConnection(server) s.add_many(documents) s.commit() if __name__ == '__main__': main()
class BatchLoader(object):
    """This class allows you to load a batch into the database. A loader
    object serves as a context for a particular batch loading job.
    """

    def __init__(self, process_ocr=True, process_coordinates=True):
        """Create a BatchLoader.

        The process_ocr parameter is used (mainly in testing) when we don't
        want to spend time actually extracting ocr text and indexing.
        """
        self.PROCESS_OCR = process_ocr
        self.solr = SolrConnection(settings.SOLR)
        self.PROCESS_COORDINATES = process_coordinates

    def _find_batch_file(self, batch):
        """
        TODO: Who can we toss the requirement at to make this available in
        a canonical location?
        """
        # look for batch_1.xml, BATCH_1.xml, etc
        for alias in [
                "batch_1.xml",
                "BATCH_1.xml",
                "batchfile_1.xml",
                "batch_2.xml",
                "BATCH_2.xml",
                "batch.xml",
        ]:
            # TODO: might we want 'batch.xml' first? Leaving last for now to
            # minimize impact.
            url = urlparse.urljoin(batch.storage_url, alias)
            try:
                urllib2.urlopen(url)
                validated_batch_file = alias
                break
            except (urllib2.HTTPError, urllib2.URLError):
                continue
        else:
            # for/else: no alias responded — the batch is unusable
            raise BatchLoaderException(
                "could not find batch_1.xml (or any of its aliases) in '%s' -- has the batch been validated?" % batch.path)
        return validated_batch_file

    def _sanity_check_batch(self, batch):
        # if not os.path.exists(batch.path):
        #     raise BatchLoaderException("batch does not exist at %s" % batch.path)
        # b = urllib2.urlopen(batch.url)
        batch.validated_batch_file = self._find_batch_file(batch)

    def load_batch(self, batch_path, strict=True):
        """Load a batch, and return a Batch instance for the batch that
        was loaded.

        loader.load_batch('/path/to/batch_curiv_ahwahnee_ver01')
        """
        LOGGER.info("loading batch at %s", batch_path)
        dirname, batch_name = os.path.split(batch_path.rstrip("/"))
        if dirname:
            # local filesystem path: expose it under BATCH_STORAGE via symlink
            batch_source = None
            link_name = os.path.join(settings.BATCH_STORAGE, batch_name)
            if batch_path != link_name and not os.path.islink(link_name):
                LOGGER.info("creating symlink %s -> %s", batch_path, link_name)
                os.symlink(batch_path, link_name)
        else:
            # bare batch name: treat BATCH_STORAGE as the (URL) source
            batch_source = urlparse.urljoin(settings.BATCH_STORAGE, batch_name)
            if not batch_source.endswith("/"):
                batch_source += "/"
        batch_name = _normalize_batch_name(batch_name)
        if not strict:
            # non-strict mode: silently return an already-loaded batch
            try:
                batch = Batch.objects.get(name=batch_name)
                LOGGER.info("Batch already loaded: %s", batch_name)
                return batch
            except Batch.DoesNotExist as e:
                pass
        LOGGER.info("loading batch: %s", batch_name)
        event = LoadBatchEvent(batch_name=batch_name, message="starting load")
        event.save()
        batch = None
        try:
            # build a Batch object for the batch location
            batch = self._get_batch(batch_name, batch_source, create=True)
            self._sanity_check_batch(batch)
            # stash it away for processing later on
            self.current_batch = batch
            # parse the batch.xml and load up each issue mets file
            doc = etree.parse(batch.validated_batch_url)
            for e in doc.xpath("ndnp:reel", namespaces=ns):
                reel_number = e.attrib["reelNumber"].strip()
                try:
                    reel = models.Reel.objects.get(number=reel_number,
                                                   batch=batch)
                except models.Reel.DoesNotExist as e:
                    reel = models.Reel(number=reel_number, batch=batch)
                    reel.save()
            for e in doc.xpath("ndnp:issue", namespaces=ns):
                mets_url = urlparse.urljoin(batch.storage_url, e.text)
                try:
                    issue, pages = self._load_issue(mets_url)
                except ValueError as e:
                    # skip unloadable issues but keep going with the batch
                    LOGGER.exception("Unable to load issue from %s", mets_url)
                    continue
                # commit new changes to the solr index, if we are indexing
                if self.PROCESS_OCR:
                    LOGGER.info("Adding pages to solr index from issue %s",
                                issue.title)
                    for page in pages:
                        LOGGER.debug("indexing ocr for: %s", page.url)
                        self.solr.add(**page.solr_doc)
                        page.indexed = True
                        page.save()
            if self.PROCESS_OCR:
                LOGGER.info("Committing solr index")
                self.solr.commit()
            batch.save()
            msg = "processed %s pages" % batch.page_count
            LOGGER.info(msg)
            event = LoadBatchEvent(batch_name=batch_name, message=msg)
            event.save()
        except Exception as e:
            msg = "unable to load batch: %s" % e
            LOGGER.exception(msg)
            event = LoadBatchEvent(batch_name=batch_name, message=msg)
            event.save()
            # roll back whatever partial state the failed load created
            try:
                self.purge_batch(batch_name)
            except Exception:
                LOGGER.exception(
                    "Unable to purge batch %s after loading failed", batch_name)
            raise BatchLoaderException(msg)
        if settings.IS_PRODUCTION:
            batch.released = datetime.now()
            batch.save()
        cache.delete("newspaper_info")
        return batch

    def _get_batch(self, batch_name, batch_source=None, create=False):
        # fetch-or-create helper used by load and purge paths
        if create:
            batch = self._create_batch(batch_name, batch_source)
        else:
            batch = Batch.objects.get(name=batch_name)
        return batch

    def _create_batch(self, batch_name, batch_source):
        if Batch.objects.filter(name=batch_name).count() != 0:
            raise BatchLoaderException("batch %s already loaded" % batch_name)
        batch = Batch()
        batch.name = batch_name
        batch.source = batch_source
        try:
            # batch names look like [prefix_]orgcode_name_version
            parts = batch_name.split("_", 3)
            if len(parts) == 4:
                parts = parts[1:]
            awardee_org_code, name_part, version = parts
            batch.awardee = Awardee.objects.get(org_code=awardee_org_code)
        except Awardee.DoesNotExist:
            msg = "no awardee for org code: %s" % awardee_org_code
            LOGGER.error(msg)
            raise BatchLoaderException(msg)
        batch.save()
        return batch

    def _load_issue(self, mets_file):
        LOGGER.debug("parsing issue mets file: %s", mets_file)
        doc = etree.parse(mets_file)
        # get the mods for the issue
        div = doc.xpath('.//mets:div[@TYPE="np:issue"]', namespaces=ns)[0]
        dmdid = div.attrib["DMDID"]
        mods = dmd_mods(doc, dmdid)
        # set up a new Issue
        issue = Issue()
        issue.volume = mods.xpath(
            'string(.//mods:detail[@type="volume"]/mods:number[1])',
            namespaces=ns).strip()
        issue.number = mods.xpath(
            'string(.//mods:detail[@type="issue"]/mods:number[1])',
            namespaces=ns).strip()
        issue.edition = int(
            mods.xpath(
                'string(.//mods:detail[@type="edition"]/mods:number[1])',
                namespaces=ns))
        issue.edition_label = mods.xpath(
            'string(.//mods:detail[@type="edition"]/mods:caption[1])',
            namespaces=ns).strip()
        # parse issue date
        date_issued = mods.xpath("string(.//mods:dateIssued)", namespaces=ns)
        issue.date_issued = datetime.strptime(date_issued, "%Y-%m-%d")
        # attach the Issue to the appropriate Title
        lccn = mods.xpath('string(.//mods:identifier[@type="lccn"])',
                          namespaces=ns).strip()
        try:
            title = Title.objects.get(lccn=lccn)
        # NOTE(review): this should almost certainly be Title.DoesNotExist —
        # `title` is unbound here, so a miss raises NameError instead.
        except title.DoesNotExist:
            url = "https://chroniclingamerica.loc.gov/lccn/%s/marc.xml" % lccn
            LOGGER.info("attempting to load MARC record from %s", url)
            management.call_command("load_titles", url)
            title = Title.objects.get(lccn=lccn)
        issue.title = title
        issue.batch = self.current_batch
        issue.save()
        LOGGER.debug("saved issue: %s", issue.url)
        notes = []
        for mods_note in mods.xpath(".//mods:note", namespaces=ns):
            type = mods_note.xpath("string(./@type)")
            label = mods_note.xpath("string(./@displayLabel)")
            text = mods_note.xpath("string(.)")
            note = models.IssueNote(type=type, label=label, text=text)
            notes.append(note)
        issue.notes = notes
        issue.save()
        # attach pages: lots of logging because it's expensive
        pages = []
        for page_div in div.xpath('.//mets:div[@TYPE="np:page"]',
                                  namespaces=ns):
            try:
                pages.append(self._load_page(doc, page_div, issue))
            except BatchLoaderException:
                LOGGER.exception(
                    "Failed to load page. doc: %s, page div: %s, issue: %s",
                    doc, page_div, issue)
        return issue, pages

    def _load_page(self, doc, div, issue):
        dmdid = div.attrib["DMDID"]
        mods = dmd_mods(doc, dmdid)
        page = Page()
        seq_string = mods.xpath("string(.//mods:extent/mods:start)",
                                namespaces=ns)
        try:
            page.sequence = int(seq_string)
        except ValueError:
            raise BatchLoaderException(
                "could not determine sequence number for page from '%s'" %
                seq_string)
        page.number = mods.xpath('string(.//mods:detail[@type="page number"])',
                                 namespaces=ns).strip()
        reel_number = mods.xpath(
            'string(.//mods:identifier[@type="reel number"])',
            namespaces=ns).strip()
        try:
            reel = models.Reel.objects.get(number=reel_number,
                                           batch=self.current_batch)
            page.reel = reel
        except models.Reel.DoesNotExist:
            # reel wasn't declared in batch.xml; create an implicit one
            if reel_number:
                reel = models.Reel(number=reel_number,
                                   batch=self.current_batch,
                                   implicit=True)
                reel.save()
                page.reel = reel
            else:
                LOGGER.warning("unable to find reel number in page metadata")
        LOGGER.info("Assigned page sequence: %s", page.sequence)
        _section_dmdid = div.xpath(
            'string(ancestor::mets:div[@TYPE="np:section"]/@DMDID)',
            namespaces=ns)
        if _section_dmdid:
            section_mods = dmd_mods(doc, _section_dmdid)
            section_label = section_mods.xpath(
                'string(.//mods:detail[@type="section label"]/mods:number[1])',
                namespaces=ns).strip()
            if section_label:
                page.section_label = section_label
        page.issue = issue
        LOGGER.info("Saving page. issue date: %s, page sequence: %s",
                    issue.date_issued, page.sequence)
        # TODO - consider the possibility of executing the file name
        # assignments (below) before this page.save().
        page.save()
        notes = []
        for mods_note in mods.xpath(".//mods:note", namespaces=ns):
            type = mods_note.xpath("string(./@type)")
            label = mods_note.xpath("string(./@displayLabel)")
            text = mods_note.xpath("string(.)").strip()
            note = models.PageNote(type=type, label=label, text=text)
            notes.append(note)
        page.notes = notes
        # there's a level indirection between the METS structmap and the
        # details about specific files in this package ...
        # so we have to first get the FILEID from the issue div in the
        # structmap and then use it to look up the file details in the
        # larger document.
        for fptr in div.xpath("./mets:fptr", namespaces=ns):
            file_id = fptr.attrib["FILEID"]
            file_el = doc.xpath('.//mets:file[@ID="%s"]' % file_id,
                                namespaces=ns)[0]
            file_type = file_el.attrib["USE"]
            # get the filename relative to the storage location
            file_name = file_el.xpath("string(./mets:FLocat/@xlink:href)",
                                      namespaces=ns)
            file_name = urlparse.urljoin(doc.docinfo.URL, file_name)
            file_name = self.storage_relative_path(file_name)
            if file_type == "master":
                page.tiff_filename = file_name
            elif file_type == "service":
                page.jp2_filename = file_name
                try:
                    # extract image dimensions from technical metadata for jp2
                    for admid in file_el.attrib["ADMID"].split(" "):
                        length, width = get_dimensions(doc, admid)
                        if length and width:
                            page.jp2_width = width
                            page.jp2_length = length
                            break
                except KeyError:
                    LOGGER.info(
                        "Could not determine dimensions of jp2 for issue: %s page: %s... trying harder...",
                        page.issue,
                        page,
                    )
                if not page.jp2_width:
                    raise BatchLoaderException(
                        "No jp2 width for issue: %s page: %s" %
                        (page.issue, page))
                if not page.jp2_length:
                    raise BatchLoaderException(
                        "No jp2 length for issue: %s page: %s" %
                        (page.issue, page))
            elif file_type == "derivative":
                page.pdf_filename = file_name
            elif file_type == "ocr":
                page.ocr_filename = file_name
        if page.ocr_filename:
            # don't incurr overhead of extracting ocr text, word coordinates
            # and indexing unless the batch loader has been set up to do it
            if self.PROCESS_OCR:
                page = self.process_ocr(page)
        else:
            LOGGER.info("No ocr filename for issue: %s page: %s",
                        page.issue, page)
        LOGGER.debug("saving page: %s", page.url)
        page.save()
        return page

    def process_ocr(self, page):
        LOGGER.debug("extracting ocr text and word coords for %s", page.url)
        url = urlparse.urljoin(self.current_batch.storage_url,
                               page.ocr_filename)
        lang_text, coords = ocr_extractor(url)
        if self.PROCESS_COORDINATES:
            self._process_coordinates(page, coords)
        ocr = OCR()
        ocr.page = page
        ocr.save()
        lang_text_solr = {}
        for lang, text in lang_text.iteritems():
            try:
                language = models.Language.objects.get(
                    Q(code=lang) | Q(lingvoj__iendswith=lang))
            except models.Language.DoesNotExist:
                LOGGER.warning(
                    "Language %s does not exist in the database. Defaulting to English.",
                    lang)
                # default to english as per requirement
                language = models.Language.objects.get(code="eng")
            ocr.language_texts.create(language=language)
            lang_text_solr[language.code] = text
        page.ocr = ocr
        page.lang_text = lang_text_solr
        page.save()
        return page

    def _process_coordinates(self, page, coords):
        LOGGER.debug("writing out word coords for %s", page.url)
        # We'll use a temporary file in case the coordinates dir is configured
        # to a network filesystem which has poor update performance
        # characteristics
        # NOTE(review): mkstemp's `text` parameter expects a bool; passing
        # "w" is merely truthy.
        fd, path = tempfile.mkstemp(text="w",
                                    suffix=".coordinates",
                                    dir=settings.TEMP_STORAGE)
        f = open(path, "w")
        f.write(gzip_compress(json.dumps(coords)))
        f.close()
        os.close(fd)
        final_path = models.coordinates_path(page._url_parts())
        try:
            shutil.move(path, final_path)
        except Exception:
            # single retry after a pause, for slow/flaky network filesystems
            LOGGER.warning(
                'Could not move coordinates to "%s". Waiting 5 seconds before trying again…',
                final_path)
            time.sleep(5)
            shutil.move(path, final_path)

    def process_coordinates(self, batch_path):
        LOGGER.info("process word coordinates for batch at %s", batch_path)
        dirname, batch_name = os.path.split(batch_path.rstrip("/"))
        if dirname:
            batch_source = None
        else:
            batch_source = urlparse.urljoin(settings.BATCH_STORAGE,
                                            batch_name)
            if not batch_source.endswith("/"):
                batch_source += "/"
        batch_name = _normalize_batch_name(batch_name)
        try:
            batch = self._get_batch(batch_name, batch_source, create=False)
            self.current_batch = batch
            for issue in batch.issues.all():
                for page in issue.pages.all():
                    if not page.ocr_filename:
                        LOGGER.warning(
                            "Batch [%s] page [%s] has no OCR; skipping coordinates processing",
                            batch_name,
                            page,
                        )
                    else:
                        url = urlparse.urljoin(self.current_batch.storage_url,
                                               page.ocr_filename)
                        LOGGER.debug("Extracting OCR from url %s", url)
                        lang_text, coords = ocr_extractor(url)
                        self._process_coordinates(page, coords)
        except Exception as e:
            msg = "unable to process coordinates for batch: %s" % e
            LOGGER.exception(msg)
            raise BatchLoaderException(msg)

    def storage_relative_path(self, path):
        """returns a relative path for a given file path within a batch, so
        that storage can be re-homed without having to rewrite paths in the db
        """
        rel_path = path.replace(self.current_batch.storage_url, "")
        return rel_path

    @transaction.atomic
    def purge_batch(self, batch_name):
        batch_name = _normalize_batch_name(batch_name)
        try:
            batch = self._get_batch(batch_name)
        except Batch.DoesNotExist:
            LOGGER.info("Batch %s does not exist", batch_name)
            return
        event = LoadBatchEvent(batch_name=batch_name, message="starting purge")
        event.save()
        try:
            self._purge_batch(batch)
            event = LoadBatchEvent(batch_name=batch_name, message="purged")
            event.save()
            # clean up symlinks if exists
            link_name = os.path.join(settings.BATCH_STORAGE, batch_name)
            if os.path.islink(link_name):
                LOGGER.info("Removing symlink %s", link_name)
                os.remove(link_name)
        except Exception as e:
            msg = "purge failed: %s" % e
            LOGGER.exception(msg)
            event = LoadBatchEvent(batch_name=batch_name, message=msg)
            event.save()
            raise BatchLoaderException(msg)

    def _purge_batch(self, batch):
        batch_name = batch.name
        # just delete batch causes memory to bloat out
        # so we do it piece-meal
        for issue in batch.issues.prefetch_related("pages__issue",
                                                   "pages__issue__title"):
            for page in issue.pages.all():
                page.delete()
                # remove coordinates
                if os.path.exists(models.coordinates_path(page._url_parts())):
                    os.remove(models.coordinates_path(page._url_parts()))
            issue.delete()
        batch.delete()
        if self.PROCESS_OCR:
            self.solr.delete_query('batch:"%s"' % batch_name)
            self.solr.commit()
"uuid": [u"78755d851f9a453b84a51b1c00c68553"], "depositor": "zool0982" # 'identifier': ['fri_day1'], # 'aggregatedResource': ['http://datafinder-d2v.bodleian.ox.ac.uk/DataFinder/datasets/fri_day1/df_manifest.rdf'], # 'mediator': ['admin'], # 'text': ['', 'http://vocab.ox.ac.uk/projectfunding#', '', 'seeking_approval', '', '', 'yes', '', ''], # 'depositor': ['zool0982'], # 'embargoedUntilDate': ['2083-06-21T14:08:45Z'], # 'alternative': ['fri_day1'], # 'subject': [''], # 'rights': ['http://ora.ouls.ox.ac.uk/objects/uuid%3A1d00eebb-8fed-46ad-8e38-45dbdb4b224c'], # 'publisher': ['Bodleian Libraries, University of Oxford'], # 'license': ['CC0 1.0 Universal (CC0 1.0). See http://creativecommons.org/publicdomain/zero/1.0/legalcode'], # 'language': [''], # 'title': ['fri_day1'], # 'embargoStatus': ['True'], # 'description': [''], # 'format': [''], # 'modified': ['2013-06-21 14:08:45.525602'], # 'currentVersion': ['2'], # 'created': ['2013-06-21 14:08:45.253033'], # 'issued': [''], # 'type': ['', 'http://vocab.ox.ac.uk/dataset/schema#DataSet'] } # solr_doc = {'identifier': ['fri_day1'], 'aggregatedResource': ['http://datafinder-d2v.bodleian.ox.ac.uk/DataFinder/datasets/fri_day1/df_manifest.rdf'], 'mediator': ['admin'], 'text': ['', '', '', 'http://vocab.ox.ac.uk/projectfunding#', '', 'yes', '', 'seeking_approval', ''], 'depositor': ['zool0982'], 'embargoedUntilDate': ['2083-06-21T14:08:45Z'], 'alternative': ['fri_day1'], 'subject': [''], 'rights': ['http://ora.ouls.ox.ac.uk/objects/uuid%3A1d00eebb-8fed-46ad-8e38-45dbdb4b224c'], 'publisher': ['', 'Bodleian Libraries, University of Oxford'], 'license': ['CC0 1.0 Universal (CC0 1.0). 
See http://creativecommons.org/publicdomain/zero/1.0/legalcode'], 'uuid': [u'4fb84512bfaf4927945ea3c241bf21c0'], 'language': [''], 'title': ['fri_day1'], 'embargoStatus': ['True'], 'description': [''], 'format': [''], 'modified': ['2013-06-21 14:08:45.525602'], 'id': ['fri_day1'], 'currentVersion': ['2'], 'created': ['2013-06-21 14:08:45.253033'], 'issued': [''], 'silo': ['DataFinder'], 'type': ['http://vocab.ox.ac.uk/dataset/schema#DataSet', '']} # solr_doc = {'identifier': ['mond_ay2'], 'aggregatedResource': ['http://datafinder-d2v.bodleian.ox.ac.uk/DataFinder/datasets/mond_ay2/df_manifest.rdf'], 'mediator': ['admin'], 'text': ['', 'http://vocab.ox.ac.uk/projectfunding#', '', 'seeking_approval', '', 'yes', '', '', ''], 'depositor': 'zool0982', 'alternative': ['mond_ay2'], 'embargoedUntilDate': ['2083-06-24T03:41:53Z'], 'subject': [''], 'rights': ['http://ora.ouls.ox.ac.uk/objects/uuid%3A1d00eebb-8fed-46ad-8e38-45dbdb4b224c'], 'publisher': ['Bodleian Libraries, University of Oxford'], 'license': ['CC0 1.0 Universal (CC0 1.0). See http://creativecommons.org/publicdomain/zero/1.0/legalcode'], 'uuid': [u'78755d851f9a453b84a51b1c00c68553'], 'language': [''], 'title': ['mond_ay2'], 'embargoStatus': ['True'], 'description': ['mond_ay2'], 'format': [''], 'modified': ['2013-06-24 03:41:53.988847'], 'id': ['mond_ay2'], 'currentVersion': ['2'], 'created': ['2013-06-24 03:41:53.618090'], 'issued': [''], 'silo': ['DataFinder'], 'type': ['', 'http://vocab.ox.ac.uk/dataset/schema#DataSet']} # print repr(solr_doc) solr.add(_commit=True, **solr_doc) solr.commit()
def commit():
    """Open a connection to the configured Solr server and commit pending changes."""
    connection = SolrConnection(settings.SOLR)
    connection.commit()
def handle(self, **options):
    """Management-command entry point: commit any pending Solr index changes."""
    connection = SolrConnection(settings.SOLR)
    connection.commit()
def commit():
    """Commit outstanding changes on the configured Solr index."""
    conn = SolrConnection(settings.SOLR)
    conn.commit()
class BatchLoader(object):
    """This class allows you to load a batch into the database. A loader
    object serves as a context for a particular batch loading job.
    """

    def __init__(self, process_ocr=True, process_coordinates=True):
        """Create a BatchLoader.

        The process_ocr parameter is used (mainly in testing) when we don't
        want to spend time actually extracting ocr text and indexing.
        """
        self.PROCESS_OCR = process_ocr
        if self.PROCESS_OCR:
            self.solr = SolrConnection(settings.SOLR)
        self.PROCESS_COORDINATES = process_coordinates

    def _find_batch_file(self, batch):
        """Return the name of the batch file alias that exists at the batch's
        storage URL.

        TODO: Who can we toss the requirement at to make this available in a
        canonical location?
        """
        # look for batch_1.xml, BATCH_1.xml, etc
        for alias in ["batch_1.xml", "BATCH_1.xml", "batchfile_1.xml",
                      "batch_2.xml", "BATCH_2.xml", "batch.xml"]:
            # TODO: might we want 'batch.xml' first? Leaving last for now to
            # minimize impact.
            url = urllib.parse.urljoin(batch.storage_url, alias)
            try:
                u = urllib.request.urlopen(url)
                # FIX: close the response; we only probe for existence and the
                # original left the connection open (resource leak).
                u.close()
                validated_batch_file = alias
                break
            except urllib.error.HTTPError as e:
                continue
            except urllib.error.URLError as e:
                continue
        else:
            # for-else: no alias was reachable
            raise BatchLoaderException(
                "could not find batch_1.xml (or any of its aliases) in '%s' -- has the batch been validated?" % batch.path)
        return validated_batch_file

    def _sanity_check_batch(self, batch):
        """Locate and record the validated batch file for this batch."""
        #if not os.path.exists(batch.path):
        #    raise BatchLoaderException("batch does not exist at %s" % batch.path)
        #b = urllib2.urlopen(batch.url)
        batch.validated_batch_file = self._find_batch_file(batch)

    def load_batch(self, batch_path):
        """Load a batch, and return a Batch instance for the batch
        that was loaded.

          loader.load_batch('/path/to/batch_curiv_ahwahnee_ver01')
        """
        self.pages_processed = 0

        # Trailing slash breaks comparison to link_name below, so strip off
        batch_path = batch_path.rstrip("/")
        _logger.info("loading batch at %s", batch_path)
        dirname, batch_name = os.path.split(batch_path)
        if dirname:
            batch_source = None
            link_name = os.path.join(settings.BATCH_STORAGE, batch_name)
            # Create symlink if paths don't match, symlink not already there,
            # and batch_path wasn't input with a BATCH_STORAGE symlink path
            if (batch_path != link_name
                    and not os.path.islink(link_name)
                    and not (os.path.islink(settings.BATCH_STORAGE)
                             and batch_path.startswith(
                                 os.path.realpath(settings.BATCH_STORAGE)))):
                _logger.info("creating symlink %s -> %s", batch_path, link_name)
                os.symlink(batch_path, link_name)
        else:
            batch_source = urllib.parse.urljoin(settings.BATCH_STORAGE,
                                                batch_name)
            if not batch_source.endswith("/"):
                batch_source += "/"
        batch_name = _normalize_batch_name(batch_name)
        try:
            # short-circuit if the batch is already in the database
            batch = Batch.objects.get(name=batch_name)
            _logger.info("Batch already loaded: %s" % batch_name)
            return batch
        except Batch.DoesNotExist as e:
            pass

        _logger.info("loading batch: %s" % batch_name)
        t0 = time()
        times = []

        event = LoadBatchEvent(batch_name=batch_name, message="starting load")
        event.save()

        batch = None
        try:
            # build a Batch object for the batch location
            batch = self._get_batch(batch_name, batch_source, create=True)
            self._sanity_check_batch(batch)

            # stash it away for processing later on
            self.current_batch = batch

            # parse the batch.xml and load up each issue mets file
            doc = etree.parse(batch.validated_batch_url)

            for e in doc.xpath('ndnp:reel', namespaces=ns):
                reel_number = e.attrib['reelNumber'].strip()
                try:
                    reel = models.Reel.objects.get(number=reel_number,
                                                   batch=batch)
                except models.Reel.DoesNotExist as e:
                    reel = models.Reel(number=reel_number, batch=batch)
                    reel.save()

            for e in doc.xpath('ndnp:issue', namespaces=ns):
                mets_url = urllib.parse.urljoin(batch.storage_url, e.text)
                try:
                    issue = self._load_issue(mets_url)
                except ValueError as e:
                    _logger.exception(e)
                    continue
                # keep Django's debug query log from growing unboundedly
                reset_queries()
                times.append((time() - t0, self.pages_processed))

            # commit new changes to the solr index, if we are indexing
            if self.PROCESS_OCR:
                self.solr.commit()

            batch.save()
            msg = "processed %s pages" % batch.page_count
            event = LoadBatchEvent(batch_name=batch_name, message=msg)
            _logger.info(msg)
            event.save()
        except Exception as e:
            msg = "unable to load batch: %s" % e
            _logger.error(msg)
            _logger.exception(e)
            event = LoadBatchEvent(batch_name=batch_name, message=msg)
            event.save()
            try:
                # clean up the partially-loaded batch before re-raising
                self.purge_batch(batch_name)
            except Exception as pbe:
                _logger.error("purge batch failed for failed load batch: %s" % pbe)
                _logger.exception(pbe)
            raise BatchLoaderException(msg)

        # updates the min and max years of all titles
        set_fulltext_range()
        return batch

    def _get_batch(self, batch_name, batch_source=None, create=False):
        """Fetch (or, with create=True, create) the Batch model instance."""
        if create:
            batch = self._create_batch(batch_name, batch_source)
        else:
            batch = Batch.objects.get(name=batch_name)
        return batch

    def _create_batch(self, batch_name, batch_source):
        """Create a new Batch row, resolving its awardee from the batch name."""
        if Batch.objects.filter(name=batch_name).count() != 0:
            raise BatchLoaderException("batch %s already loaded" % batch_name)
        batch = Batch()
        batch.name = batch_name
        batch.source = batch_source
        try:
            # batch names look like batch_<org>_<name>_<version>
            _, org_code, name_part, version = batch_name.split("_", 3)
            awardee_org_code = org_code
            batch.awardee = Awardee.objects.get(org_code=awardee_org_code)
        except Awardee.DoesNotExist as e:
            msg = "no awardee for org code: %s" % awardee_org_code
            _logger.error(msg)
            raise BatchLoaderException(msg)
        batch.save()
        return batch

    def _load_issue(self, mets_file):
        """Parse an issue METS file, saving the Issue and its pages."""
        _logger.debug("parsing issue mets file: %s" % mets_file)
        doc = etree.parse(mets_file)

        # get the mods for the issue
        div = doc.xpath('.//mets:div[@TYPE="np:issue"]', namespaces=ns)[0]
        dmdid = div.attrib['DMDID']
        mods = dmd_mods(doc, dmdid)

        # set up a new Issue
        issue = Issue()
        issue.volume = mods.xpath(
            'string(.//mods:detail[@type="volume"]/mods:number[1])',
            namespaces=ns).strip()
        issue.number = mods.xpath(
            'string(.//mods:detail[@type="issue"]/mods:number[1])',
            namespaces=ns).strip()
        issue.edition = int(
            mods.xpath(
                'string(.//mods:detail[@type="edition"]/mods:number[1])',
                namespaces=ns))
        issue.edition_label = mods.xpath(
            'string(.//mods:detail[@type="edition"]/mods:caption[1])',
            namespaces=ns).strip()

        # parse issue date
        date_issued = mods.xpath('string(.//mods:dateIssued)', namespaces=ns)
        issue.date_issued = datetime.strptime(date_issued, '%Y-%m-%d')

        # attach the Issue to the appropriate Title
        lccn = mods.xpath('string(.//mods:identifier[@type="lccn"])',
                          namespaces=ns).strip()
        try:
            title = Title.objects.get(lccn=lccn)
        except Exception as e:
            # title not in the db yet: pull its MARC record and load it
            url = settings.MARC_RETRIEVAL_URLFORMAT % lccn
            _logger.info("attempting to load marc record from %s", url)
            management.call_command('load_titles', url)
            title = Title.objects.get(lccn=lccn)

        issue.title = title
        issue.batch = self.current_batch
        issue.save()
        _logger.debug("saved issue: %s" % issue.url)

        notes = []
        for mods_note in mods.xpath('.//mods:note', namespaces=ns):
            type = mods_note.xpath('string(./@type)')
            label = mods_note.xpath('string(./@displayLabel)')
            text = mods_note.xpath('string(.)')
            note = models.IssueNote(type=type, label=label, text=text)
            notes.append(note)
        issue.notes.set(notes, bulk=False)
        issue.save()

        # attach pages: lots of logging because it's expensive
        for page_div in div.xpath('.//mets:div[@TYPE="np:page"]',
                                  namespaces=ns):
            try:
                page = self._load_page(doc, page_div, issue)
                self.pages_processed += 1
            except BatchLoaderException as e:
                _logger.exception(e)

        return issue

    def _load_page(self, doc, div, issue):
        """Build and save a Page from its METS page div; returns the Page."""
        dmdid = div.attrib['DMDID']
        mods = dmd_mods(doc, dmdid)
        page = Page()

        seq_string = mods.xpath('string(.//mods:extent/mods:start)',
                                namespaces=ns)
        try:
            page.sequence = int(seq_string)
        except ValueError as e:
            raise BatchLoaderException(
                "could not determine sequence number for page from '%s'" % seq_string)
        page.number = mods.xpath('string(.//mods:detail[@type="page number"])',
                                 namespaces=ns).strip()

        reel_number = mods.xpath(
            'string(.//mods:identifier[@type="reel number"])',
            namespaces=ns).strip()
        try:
            reel = models.Reel.objects.get(number=reel_number,
                                           batch=self.current_batch)
            page.reel = reel
        except models.Reel.DoesNotExist as e:
            if reel_number:
                # reel not declared in batch.xml; create it implicitly
                reel = models.Reel(number=reel_number,
                                   batch=self.current_batch,
                                   implicit=True)
                reel.save()
                page.reel = reel
            else:
                _logger.warn("unable to find reel number in page metadata")

        _logger.info("Assigned page sequence: %s" % page.sequence)

        _section_dmdid = div.xpath(
            'string(ancestor::mets:div[@TYPE="np:section"]/@DMDID)',
            namespaces=ns)
        if _section_dmdid:
            section_mods = dmd_mods(doc, _section_dmdid)
            section_label = section_mods.xpath(
                'string(.//mods:detail[@type="section label"]/mods:number[1])',
                namespaces=ns).strip()
            if section_label:
                page.section_label = section_label

        page.issue = issue

        _logger.info("Saving page. issue date: %s, page sequence: %s" %
                     (issue.date_issued, page.sequence))
        # TODO - consider the possibility of executing the file name
        # assignments (below) before this page.save().
        page.save()

        notes = []
        for mods_note in mods.xpath('.//mods:note', namespaces=ns):
            type = mods_note.xpath('string(./@type)')
            label = mods_note.xpath('string(./@displayLabel)')
            text = mods_note.xpath('string(.)').strip()
            note = models.PageNote(type=type, label=label, text=text)
            notes.append(note)
        page.notes.set(notes, bulk=False)

        # there's a level indirection between the METS structmap and the
        # details about specific files in this package ...
        # so we have to first get the FILEID from the issue div in the
        # structmap and then use it to look up the file details in the
        # larger document.
        for fptr in div.xpath('./mets:fptr', namespaces=ns):
            file_id = fptr.attrib['FILEID']
            file_el = doc.xpath('.//mets:file[@ID="%s"]' % file_id,
                                namespaces=ns)[0]
            file_type = file_el.attrib['USE']

            # get the filename relative to the storage location
            file_name = file_el.xpath('string(./mets:FLocat/@xlink:href)',
                                      namespaces=ns)
            file_name = urllib.parse.urljoin(doc.docinfo.URL, file_name)
            file_name = self.storage_relative_path(file_name)

            if file_type == 'master':
                page.tiff_filename = file_name
            elif file_type == 'service':
                page.jp2_filename = file_name
                try:
                    # extract image dimensions from technical metadata for jp2
                    for admid in file_el.attrib['ADMID'].split(' '):
                        length, width = get_dimensions(doc, admid)
                        if length and width:
                            page.jp2_width = width
                            page.jp2_length = length
                            break
                except KeyError as e:
                    # no ADMID: fall back to reading the image itself
                    _logger.info(
                        "Could not determine dimensions of jp2 for issue: %s page: %s... trying harder..." %
                        (page.issue, page))
                    im = Image.open(page.jp2_abs_filename)
                    page.jp2_width, page.jp2_length = im.size
                if not page.jp2_width:
                    raise BatchLoaderException(
                        "No jp2 width for issue: %s page: %s" % (page.issue, page))
                if not page.jp2_length:
                    raise BatchLoaderException(
                        "No jp2 length for issue: %s page: %s" % (page.issue, page))
            elif file_type == 'derivative':
                page.pdf_filename = file_name
            elif file_type == 'ocr':
                page.ocr_filename = file_name

        if page.ocr_filename:
            # don't incurr overhead of extracting ocr text, word coordinates
            # and indexing unless the batch loader has been set up to do it
            if self.PROCESS_OCR:
                self.process_ocr(page)
        else:
            _logger.info("No ocr filename for issue: %s page: %s" %
                         (page.issue, page))

        _logger.debug("saving page: %s" % page.url)
        page.save()
        return page

    def process_ocr(self, page, index=True):
        """Extract OCR text/word coordinates for a page and, when index=True,
        add the page to the Solr index."""
        _logger.debug("extracting ocr text and word coords for %s" % page.url)
        url = urllib.parse.urljoin(self.current_batch.storage_url,
                                   page.ocr_filename)
        lang_text, coords = ocr_extractor(url)

        if self.PROCESS_COORDINATES:
            self._process_coordinates(page, coords)

        ocr = OCR()
        ocr.page = page
        ocr.save()
        for lang, text in lang_text.items():
            try:
                language = models.Language.objects.get(
                    Q(code=lang) | Q(lingvoj__iendswith=lang))
            except models.Language.DoesNotExist:
                # default to english as per requirement
                language = models.Language.objects.get(code='eng')
            ocr.language_texts.create(language=language, text=text)
        page.ocr = ocr
        if index:
            _logger.debug("indexing ocr for: %s" % page.url)
            self.solr.add(**page.solr_doc)
            page.indexed = True
        page.save()

    def _process_coordinates(self, page, coords):
        """Write the page's word coordinates out as gzipped JSON."""
        _logger.debug("writing out word coords for %s" % page.url)
        f = open(models.coordinates_path(page._url_parts()), "wb")
        f.write(gzip_compress(json.dumps(coords).encode('utf-8')))
        f.close()

    def process_coordinates(self, batch_path):
        """Re-extract and write word coordinates for every page of an
        already-loaded batch."""
        _logger.info("process word coordinates for batch at %s", batch_path)
        dirname, batch_name = os.path.split(batch_path.rstrip("/"))
        if dirname:
            batch_source = None
        else:
            batch_source = urllib.parse.urljoin(settings.BATCH_STORAGE,
                                                batch_name)
            if not batch_source.endswith("/"):
                batch_source += "/"
        batch_name = _normalize_batch_name(batch_name)
        try:
            batch = self._get_batch(batch_name, batch_source, create=False)
            self.current_batch = batch
            for issue in batch.issues.all():
                for page in issue.pages.all():
                    url = urllib.parse.urljoin(self.current_batch.storage_url,
                                               page.ocr_filename)
                    lang_text, coords = ocr_extractor(url)
                    self._process_coordinates(page, coords)
        except Exception as e:
            msg = "unable to process coordinates for batch: %s" % e
            _logger.error(msg)
            _logger.exception(e)
            raise BatchLoaderException(msg)

    def storage_relative_path(self, path):
        """returns a relative path for a given file path within a batch, so
        that storage can be re-homed without having to rewrite paths in the db
        """
        rel_path = path.replace(self.current_batch.storage_url, '')
        return rel_path

    def purge_batch(self, batch_name):
        """Remove a batch (db rows, coordinates, symlink, solr docs),
        recording LoadBatchEvents along the way."""
        event = LoadBatchEvent(batch_name=batch_name, message="starting purge")
        event.save()

        try:
            batch = self._get_batch(batch_name)
            self._purge_batch(batch)
            event = LoadBatchEvent(batch_name=batch_name, message="purged")
            event.save()
            # clean up symlinks if exists
            link_name = os.path.join(settings.BATCH_STORAGE, batch_name)
            if os.path.islink(link_name):
                _logger.info("Removing symlink %s", link_name)
                os.remove(link_name)
            # updates the min and max years of all titles
            set_fulltext_range()
        except Exception as e:
            msg = "purge failed: %s" % e
            _logger.error(msg)
            _logger.exception(e)
            event = LoadBatchEvent(batch_name=batch_name, message=msg)
            event.save()
            raise BatchLoaderException(msg)

    def _purge_batch(self, batch):
        batch_name = batch.name
        # just delete batch causes memory to bloat out
        # so we do it piece-meal
        for issue in batch.issues.all():
            for page in issue.pages.all():
                page.delete()
                # remove coordinates
                if os.path.exists(models.coordinates_path(page._url_parts())):
                    os.remove(models.coordinates_path(page._url_parts()))
            reset_queries()
            issue.delete()
        batch.delete()
        if self.PROCESS_OCR:
            self.solr.delete_query('batch:"%s"' % batch_name)
            self.solr.commit()
def submit(self, **kwargs):
    """Accept a JSON playback-request payload, look each track up in Solr,
    record the accepted requests in the index and the database, and return
    the accepted tracks as a JSON byte string.
    """
    # Grab the user and request details
    json_data = json.loads(kwargs['json'])
    user_id = json_data['userid']
    channel_id = self.channel_id
    requests = json_data['requests']

    # Create a local representation of the requests
    tracks = []
    # FIX: one connection for all lookups and the final add; the original
    # opened a new SolrConnection per track inside the loop (socket churn).
    solr = SolrConnection(settings.SOLR_URL)
    for request in requests:
        source_id = request['sourceid']
        track_id = request['trackid']

        # Build up a Solr query
        # NOTE(review): source_id/track_id come from client JSON and are
        # interpolated unescaped into the query -- consider escaping.
        filters = []
        filters.append('type:track')
        filters.append('request_source_id:%s' % source_id)
        filters.append('request_track_id:%s' % track_id)

        # Make the request to Solr
        response = solr.select(q = ' AND '.join(filters),
                               fields = 'track_artist, track_album, track_title')
        if len(response.results) == 1:
            track = {
                'id': 'request_%s_%s_%s' % (source_id, track_id, user_id),
                'type': 'request',
                'channel_id': channel_id,
                'track_artist': response.results[0]['track_artist'],
                'track_album': response.results[0]['track_album'],
                'track_title': response.results[0]['track_title'],
                'request_user_id': user_id,
                'request_source_id': source_id,
                'request_track_id': track_id
            }
            tracks.append(track)

    # Create the request in the search engine
    solr.add_many(tracks)
    solr.commit()
    solr.close()

    # Log the request to the database
    # FIX: close the cursor/connection; the original leaked both.
    db = psycopg2.connect(database='airjukebox')
    try:
        cr = db.cursor()
        for track in tracks:
            cr.execute('insert into tbrequests (userid, locationid, sourceid, trackid) values (%(request_user_id)s, %(channel_id)s, %(request_source_id)s, %(request_track_id)s)', track)
        db.commit()
        cr.close()
    finally:
        db.close()

    cherrypy.response.headers['Content-Type'] = 'application/json; charset=utf-8'
    return json.dumps(tracks, ensure_ascii=False, indent=4).encode('utf-8')
def uninstall(field, data, path, length, mode):
    """Wipe every document from the Solr index (the arguments are unused)."""
    from solr import SolrConnection
    connection = SolrConnection(SOLR_URL)
    connection.delete_query('id:*')
    connection.commit()
class BatchLoader(object):
    """This class allows you to load a batch into the database. A loader
    object serves as a context for a particular batch loading job.
    """

    def __init__(self, process_ocr=True, process_coordinates=True):
        """Create a BatchLoader.

        The process_ocr parameter is used (mainly in testing) when we don't
        want to spend time actually extracting ocr text and indexing.
        """
        self.PROCESS_OCR = process_ocr
        self.solr = SolrConnection(settings.SOLR)
        self.PROCESS_COORDINATES = process_coordinates

    def _find_batch_file(self, batch):
        """Return the name of the batch file alias reachable at the batch's
        storage URL.

        TODO: Who can we toss the requirement at to make this available in a
        canonical location?
        """
        # look for batch_1.xml, BATCH_1.xml, etc
        for alias in ["batch_1.xml", "BATCH_1.xml", "batchfile_1.xml",
                      "batch_2.xml", "BATCH_2.xml", "batch.xml"]:
            # TODO: might we want 'batch.xml' first? Leaving last for now to
            # minimize impact.
            url = urlparse.urljoin(batch.storage_url, alias)
            try:
                # FIX: close the response; we only probe for existence and the
                # original left the connection open (resource leak).
                urllib2.urlopen(url).close()
                validated_batch_file = alias
                break
            except (urllib2.HTTPError, urllib2.URLError):
                continue
        else:
            # for-else: no alias was reachable
            raise BatchLoaderException(
                "could not find batch_1.xml (or any of its aliases) in '%s' -- has the batch been validated?" % batch.path)
        return validated_batch_file

    def _sanity_check_batch(self, batch):
        """Locate and record the validated batch file for this batch."""
        # if not os.path.exists(batch.path):
        #    raise BatchLoaderException("batch does not exist at %s" % batch.path)
        #b = urllib2.urlopen(batch.url)
        batch.validated_batch_file = self._find_batch_file(batch)

    def load_batch(self, batch_path, strict=True):
        """Load a batch, and return a Batch instance for the batch
        that was loaded.

          loader.load_batch('/path/to/batch_curiv_ahwahnee_ver01')
        """
        LOGGER.info("loading batch at %s", batch_path)
        dirname, batch_name = os.path.split(batch_path.rstrip("/"))
        if dirname:
            batch_source = None
            link_name = os.path.join(settings.BATCH_STORAGE, batch_name)
            if batch_path != link_name and not os.path.islink(link_name):
                LOGGER.info("creating symlink %s -> %s", batch_path, link_name)
                os.symlink(batch_path, link_name)
        else:
            batch_source = urlparse.urljoin(settings.BATCH_STORAGE, batch_name)
            if not batch_source.endswith("/"):
                batch_source += "/"
        batch_name = _normalize_batch_name(batch_name)
        if not strict:
            # non-strict mode: silently return an already-loaded batch
            try:
                batch = Batch.objects.get(name=batch_name)
                LOGGER.info("Batch already loaded: %s", batch_name)
                return batch
            except Batch.DoesNotExist as e:
                pass

        LOGGER.info("loading batch: %s", batch_name)

        event = LoadBatchEvent(batch_name=batch_name, message="starting load")
        event.save()

        batch = None
        try:
            # build a Batch object for the batch location
            batch = self._get_batch(batch_name, batch_source, create=True)
            self._sanity_check_batch(batch)

            # stash it away for processing later on
            self.current_batch = batch

            # parse the batch.xml and load up each issue mets file
            doc = etree.parse(batch.validated_batch_url)

            for e in doc.xpath('ndnp:reel', namespaces=ns):
                reel_number = e.attrib['reelNumber'].strip()
                try:
                    reel = models.Reel.objects.get(number=reel_number,
                                                   batch=batch)
                except models.Reel.DoesNotExist as e:
                    reel = models.Reel(number=reel_number, batch=batch)
                    reel.save()

            for e in doc.xpath('ndnp:issue', namespaces=ns):
                mets_url = urlparse.urljoin(batch.storage_url, e.text)
                try:
                    issue, pages = self._load_issue(mets_url)
                except ValueError as e:
                    LOGGER.exception(e)
                    continue

                # commit new changes to the solr index, if we are indexing
                if self.PROCESS_OCR:
                    LOGGER.info("Adding pages to solr index from issue %s",
                                issue.title)
                    for page in pages:
                        LOGGER.debug("indexing ocr for: %s", page.url)
                        self.solr.add(**page.solr_doc)
                        page.indexed = True
                        page.save()

            if self.PROCESS_OCR:
                LOGGER.info("Committing solr index")
                self.solr.commit()

            batch.save()
            msg = "processed %s pages" % batch.page_count
            LOGGER.info(msg)
            event = LoadBatchEvent(batch_name=batch_name, message=msg)
            event.save()
        except Exception as e:
            msg = "unable to load batch: %s" % e
            LOGGER.exception(msg)
            event = LoadBatchEvent(batch_name=batch_name, message=msg)
            event.save()
            try:
                # clean up the partially-loaded batch before re-raising
                self.purge_batch(batch_name)
            except Exception as pbe:
                LOGGER.error("purge batch failed for failed load batch: %s", pbe)
                LOGGER.exception(pbe)
            raise BatchLoaderException(msg)

        if settings.IS_PRODUCTION:
            batch.released = datetime.now()
            batch.save()

        return batch

    def _get_batch(self, batch_name, batch_source=None, create=False):
        """Fetch (or, with create=True, create) the Batch model instance."""
        if create:
            batch = self._create_batch(batch_name, batch_source)
        else:
            batch = Batch.objects.get(name=batch_name)
        return batch

    def _create_batch(self, batch_name, batch_source):
        """Create a new Batch row, resolving its awardee from the batch name."""
        if Batch.objects.filter(name=batch_name).count() != 0:
            raise BatchLoaderException("batch %s already loaded" % batch_name)
        batch = Batch()
        batch.name = batch_name
        batch.source = batch_source
        try:
            parts = batch_name.split("_", 3)
            # FIX: compare with ==, not "is"; identity comparison on ints only
            # works by accident of CPython's small-integer cache.
            if len(parts) == 4:
                parts = parts[1:]
            awardee_org_code, name_part, version = parts
            batch.awardee = Awardee.objects.get(org_code=awardee_org_code)
        except Awardee.DoesNotExist:
            msg = "no awardee for org code: %s" % awardee_org_code
            LOGGER.error(msg)
            raise BatchLoaderException(msg)
        batch.save()
        return batch

    def _load_issue(self, mets_file):
        """Parse an issue METS file; returns (issue, pages)."""
        LOGGER.debug("parsing issue mets file: %s", mets_file)
        doc = etree.parse(mets_file)

        # get the mods for the issue
        div = doc.xpath('.//mets:div[@TYPE="np:issue"]', namespaces=ns)[0]
        dmdid = div.attrib['DMDID']
        mods = dmd_mods(doc, dmdid)

        # set up a new Issue
        issue = Issue()
        issue.volume = mods.xpath(
            'string(.//mods:detail[@type="volume"]/mods:number[1])',
            namespaces=ns).strip()
        issue.number = mods.xpath(
            'string(.//mods:detail[@type="issue"]/mods:number[1])',
            namespaces=ns).strip()
        issue.edition = int(mods.xpath(
            'string(.//mods:detail[@type="edition"]/mods:number[1])',
            namespaces=ns))
        issue.edition_label = mods.xpath(
            'string(.//mods:detail[@type="edition"]/mods:caption[1])',
            namespaces=ns).strip()

        # parse issue date
        date_issued = mods.xpath('string(.//mods:dateIssued)', namespaces=ns)
        issue.date_issued = datetime.strptime(date_issued, '%Y-%m-%d')

        # attach the Issue to the appropriate Title
        lccn = mods.xpath('string(.//mods:identifier[@type="lccn"])',
                          namespaces=ns).strip()
        try:
            title = Title.objects.get(lccn=lccn)
        except Exception as e:
            # title not in the db yet: pull its MARC record and load it
            url = 'http://chroniclingamerica.loc.gov/lccn/%s/marc.xml' % lccn
            LOGGER.info("attempting to load marc record from %s", url)
            management.call_command('load_titles', url)
            title = Title.objects.get(lccn=lccn)

        issue.title = title
        issue.batch = self.current_batch
        issue.save()
        LOGGER.debug("saved issue: %s", issue.url)

        notes = []
        for mods_note in mods.xpath('.//mods:note', namespaces=ns):
            type = mods_note.xpath('string(./@type)')
            label = mods_note.xpath('string(./@displayLabel)')
            text = mods_note.xpath('string(.)')
            note = models.IssueNote(type=type, label=label, text=text)
            notes.append(note)
        issue.notes = notes
        issue.save()

        # attach pages: lots of logging because it's expensive
        pages = []
        for page_div in div.xpath('.//mets:div[@TYPE="np:page"]',
                                  namespaces=ns):
            try:
                pages.append(self._load_page(doc, page_div, issue))
            except BatchLoaderException as e:
                LOGGER.error("Failed to load page. doc: %s, page div: %s, issue: %s",
                             doc, page_div, issue)
                LOGGER.exception(e)

        return issue, pages

    def _load_page(self, doc, div, issue):
        """Build and save a Page from its METS page div; returns the Page."""
        dmdid = div.attrib['DMDID']
        mods = dmd_mods(doc, dmdid)
        page = Page()

        seq_string = mods.xpath(
            'string(.//mods:extent/mods:start)', namespaces=ns)
        try:
            page.sequence = int(seq_string)
        except ValueError:
            raise BatchLoaderException("could not determine sequence number for page from '%s'" % seq_string)
        page.number = mods.xpath(
            'string(.//mods:detail[@type="page number"])',
            namespaces=ns
        ).strip()

        reel_number = mods.xpath(
            'string(.//mods:identifier[@type="reel number"])',
            namespaces=ns
        ).strip()
        try:
            reel = models.Reel.objects.get(number=reel_number,
                                           batch=self.current_batch)
            page.reel = reel
        except models.Reel.DoesNotExist:
            if reel_number:
                # reel not declared in batch.xml; create it implicitly
                reel = models.Reel(number=reel_number,
                                   batch=self.current_batch,
                                   implicit=True)
                reel.save()
                page.reel = reel
            else:
                LOGGER.warn("unable to find reel number in page metadata")

        LOGGER.info("Assigned page sequence: %s", page.sequence)

        _section_dmdid = div.xpath(
            'string(ancestor::mets:div[@TYPE="np:section"]/@DMDID)',
            namespaces=ns)
        if _section_dmdid:
            section_mods = dmd_mods(doc, _section_dmdid)
            section_label = section_mods.xpath(
                'string(.//mods:detail[@type="section label"]/mods:number[1])',
                namespaces=ns).strip()
            if section_label:
                page.section_label = section_label

        page.issue = issue

        LOGGER.info("Saving page. issue date: %s, page sequence: %s",
                    issue.date_issued, page.sequence)
        # TODO - consider the possibility of executing the file name
        # assignments (below) before this page.save().
        page.save()

        notes = []
        for mods_note in mods.xpath('.//mods:note', namespaces=ns):
            type = mods_note.xpath('string(./@type)')
            label = mods_note.xpath('string(./@displayLabel)')
            text = mods_note.xpath('string(.)').strip()
            note = models.PageNote(type=type, label=label, text=text)
            notes.append(note)
        page.notes = notes

        # there's a level indirection between the METS structmap and the
        # details about specific files in this package ...
        # so we have to first get the FILEID from the issue div in the
        # structmap and then use it to look up the file details in the
        # larger document.
        for fptr in div.xpath('./mets:fptr', namespaces=ns):
            file_id = fptr.attrib['FILEID']
            file_el = doc.xpath('.//mets:file[@ID="%s"]' % file_id,
                                namespaces=ns)[0]
            file_type = file_el.attrib['USE']

            # get the filename relative to the storage location
            file_name = file_el.xpath('string(./mets:FLocat/@xlink:href)',
                                      namespaces=ns)
            file_name = urlparse.urljoin(doc.docinfo.URL, file_name)
            file_name = self.storage_relative_path(file_name)

            if file_type == 'master':
                page.tiff_filename = file_name
            elif file_type == 'service':
                page.jp2_filename = file_name
                try:
                    # extract image dimensions from technical metadata for jp2
                    for admid in file_el.attrib['ADMID'].split(' '):
                        length, width = get_dimensions(doc, admid)
                        if length and width:
                            page.jp2_width = width
                            page.jp2_length = length
                            break
                except KeyError:
                    LOGGER.info("Could not determine dimensions of jp2 for issue: %s page: %s... trying harder...",
                                page.issue, page)
                if not page.jp2_width:
                    raise BatchLoaderException("No jp2 width for issue: %s page: %s" % (page.issue, page))
                if not page.jp2_length:
                    raise BatchLoaderException("No jp2 length for issue: %s page: %s" % (page.issue, page))
            elif file_type == 'derivative':
                page.pdf_filename = file_name
            elif file_type == 'ocr':
                page.ocr_filename = file_name

        if page.ocr_filename:
            # don't incurr overhead of extracting ocr text, word coordinates
            # and indexing unless the batch loader has been set up to do it
            if self.PROCESS_OCR:
                page = self.process_ocr(page)
        else:
            LOGGER.info("No ocr filename for issue: %s page: %s",
                        page.issue, page)

        LOGGER.debug("saving page: %s", page.url)
        page.save()
        return page

    def process_ocr(self, page):
        """Extract OCR text/word coordinates for a page; stashes the per-
        language text on page.lang_text for later Solr indexing."""
        LOGGER.debug("extracting ocr text and word coords for %s", page.url)
        url = urlparse.urljoin(self.current_batch.storage_url,
                               page.ocr_filename)
        lang_text, coords = ocr_extractor(url)

        if self.PROCESS_COORDINATES:
            self._process_coordinates(page, coords)

        ocr = OCR()
        ocr.page = page
        ocr.save()
        lang_text_solr = {}
        for lang, text in lang_text.iteritems():
            try:
                language = models.Language.objects.get(
                    Q(code=lang) | Q(lingvoj__iendswith=lang))
            except models.Language.DoesNotExist:
                LOGGER.warn("Language %s does not exist in the database. Defaulting to English.",
                            lang)
                # default to english as per requirement
                language = models.Language.objects.get(code='eng')
            ocr.language_texts.create(language=language)
            lang_text_solr[language.code] = text

        page.ocr = ocr
        page.lang_text = lang_text_solr
        page.save()
        return page

    def _process_coordinates(self, page, coords):
        """Write the page's word coordinates out as gzipped JSON, staging to a
        temp file first."""
        LOGGER.debug("writing out word coords for %s", page.url)
        # get a temp file in case the coordinates dir is a NFS or S3 mount
        # which have poor multiple write performance
        # NOTE(review): mkstemp's `text` parameter expects a bool; "w" is
        # merely truthy here -- confirm intent before changing.
        fd, path = tempfile.mkstemp(text="w", suffix=".coordinates",
                                    dir=settings.TEMP_STORAGE)
        f = open(path, "w")
        f.write(gzip_compress(json.dumps(coords)))
        f.close()
        os.close(fd)
        final_path = models.coordinates_path(page._url_parts())
        try:
            shutil.move(path, final_path)
        except Exception:
            LOGGER.warn("Could not move coordinates to [%s]. Waiting 5 seconds and trying again in case of network mount",
                        final_path)
            time.sleep(5)
            shutil.move(path, final_path)

    def process_coordinates(self, batch_path):
        """Re-extract and write word coordinates for every page of an
        already-loaded batch."""
        LOGGER.info("process word coordinates for batch at %s", batch_path)
        dirname, batch_name = os.path.split(batch_path.rstrip("/"))
        if dirname:
            batch_source = None
        else:
            batch_source = urlparse.urljoin(settings.BATCH_STORAGE, batch_name)
            if not batch_source.endswith("/"):
                batch_source += "/"
        batch_name = _normalize_batch_name(batch_name)
        try:
            batch = self._get_batch(batch_name, batch_source, create=False)
            self.current_batch = batch
            for issue in batch.issues.all():
                for page in issue.pages.all():
                    if not page.ocr_filename:
                        LOGGER.warn("Batch [%s] has page [%s] that has no OCR. Skipping processing coordinates for page."
                                    % (batch_name, page))
                    else:
                        url = urlparse.urljoin(self.current_batch.storage_url,
                                               page.ocr_filename)
                        LOGGER.debug("Extracting OCR from url %s", url)
                        lang_text, coords = ocr_extractor(url)
                        self._process_coordinates(page, coords)
        except Exception as e:
            msg = "unable to process coordinates for batch: %s" % e
            LOGGER.exception(msg)
            raise BatchLoaderException(msg)

    def storage_relative_path(self, path):
        """returns a relative path for a given file path within a batch, so
        that storage can be re-homed without having to rewrite paths in the db
        """
        rel_path = path.replace(self.current_batch.storage_url, '')
        return rel_path

    @transaction.atomic
    def purge_batch(self, batch_name):
        """Remove a batch (db rows, coordinates, symlink, solr docs),
        recording LoadBatchEvents along the way."""
        event = LoadBatchEvent(batch_name=batch_name, message="starting purge")
        event.save()

        try:
            batch = self._get_batch(batch_name)
            self._purge_batch(batch)
            event = LoadBatchEvent(batch_name=batch_name, message="purged")
            event.save()
            # clean up symlinks if exists
            link_name = os.path.join(settings.BATCH_STORAGE, batch_name)
            if os.path.islink(link_name):
                LOGGER.info("Removing symlink %s", link_name)
                os.remove(link_name)
        except Exception as e:
            msg = "purge failed: %s" % e
            LOGGER.exception(msg)
            event = LoadBatchEvent(batch_name=batch_name, message=msg)
            event.save()
            raise BatchLoaderException(msg)

    def _purge_batch(self, batch):
        batch_name = batch.name
        # just delete batch causes memory to bloat out
        # so we do it piece-meal
        for issue in batch.issues.all():
            for page in issue.pages.all():
                page.delete()
                # remove coordinates
                if os.path.exists(models.coordinates_path(page._url_parts())):
                    os.remove(models.coordinates_path(page._url_parts()))
            issue.delete()
        batch.delete()
        if self.PROCESS_OCR:
            self.solr.delete_query('batch:"%s"' % batch_name)
            self.solr.commit()
def solr_index(med):
    """Add or remove a medicine from the Solr index.

    Inactive medicines are deleted from the index; active ones are
    (re)indexed with their name translations, pharmaceutical forms,
    categories, lists/countries and observations. Returns True on success,
    False when the Solr server cannot be reached.
    """
    lists = []
    countries = []
    pharma_form_list = []
    pharma_form_type_list = []
    category_list = []
    observation_list = []

    # if medicine status is not active delete from solr index
    if not med.active:
        try:
            solr = SolrConnection(settings.SOLR_URL)
            solr.delete(id=str(med.id))
            solr.commit()
        except Exception:
            # best-effort: any Solr failure is reported as False
            return False
        return True

    # index medicine on solr index
    medicine_translations = MedicineLocal.objects.filter(medicine=med.id)
    medicine_list = ['en^%s' % med.name.strip()]
    for translation in medicine_translations:
        medicine_list.append('%s^%s' % (translation.language,
                                        translation.name.strip()))
    # ex.: en^codeine|pt-br^codeína|es^codeína
    medicine_list = "|".join(medicine_list)

    # retrieve actives pharmaceutical forms of currente medicine
    pharm_forms = med.pharmaceuticalform_set.filter(active=True)
    for form in pharm_forms:
        # ex. en^Tablet|es^Tableta|pt-br^Comprimido
        pharma_form_type_translations = "|".join(
            form.pharmaceutical_form_type.get_translations()
        )
        pharma_form_type_list.append(pharma_form_type_translations)
        # ex. en^Tablet|es^Tableta|pt-br^Comprimido|comp^15 mg/ml
        pharma_form_list.append('%s|comp^%s' % (pharma_form_type_translations,
                                                form.composition))

        # create category_list (section and subsection where current
        # pharmaceutical form is used on lists)
        section_pharm_form_list = SectionPharmForm.objects.filter(
            pharmaceutical_form=form)

        for section_pharm_form in section_pharm_form_list:
            # add observations of current section_pharm_form
            if section_pharm_form.only_for_children:
                observation_list.append('only_for_children')

            if section_pharm_form.specialist_care_for_children:
                observation_list.append('specialist_care_for_children')

            if section_pharm_form.restriction_age:
                observation_list.append('restriction_age')

            if section_pharm_form.best_evidence:
                observation_list.append('best_evidence')

            if section_pharm_form.observation:
                observation_list.append('observation')

            section = Section.objects.get(pk=section_pharm_form.section.id)
            section_translations = "|".join(section.get_translations())

            # collect the section and all its ancestor categories, deduped
            section_tree = section.get_ancestors()
            if section_tree:
                for sec in section_tree:
                    category_translations = "|".join(sec.get_translations())
                    if category_translations not in category_list:
                        category_list.append(category_translations)

            if section_translations not in category_list:
                category_list.append(section_translations)

            # a 'c'-type list is a country list; anything else is a plain list
            list_associated = "|".join(section.list.get_translations())
            if section.list.type == 'c':
                if list_associated not in countries:
                    countries.append(list_associated)
            else:
                if list_associated not in lists:
                    lists.append(list_associated)

    # check if current medicine have Evidence summaries
    has_evidence = None
    evidence_total = MedicineEvidenceSummary.objects.filter(
        medicine=med.id).count()
    if evidence_total > 0:
        has_evidence = "true"

    # try to create a connection to a solr server and send medicine
    try:
        solr = SolrConnection(settings.SOLR_URL)
        solr.add(
            id=str(med.id),
            type="medicine",
            name=medicine_list,
            pharmaceutical_form=pharma_form_list,
            pharmaceutical_form_type=pharma_form_type_list,
            list=lists,
            country=countries,
            category=category_list,
            observation=observation_list,
            has_evidence=has_evidence,
        )
        solr.commit()
    except Exception:
        return False

    return True
def handle(self, **options):
    """Open a connection to the configured solr server and commit any
    pending changes to the index."""
    solr_conn = SolrConnection(settings.SOLR)
    solr_conn.commit()
def solr_index(med):
    """Add or remove a medicine from the solr index.

    Inactive medicines are deleted from the index; active ones are
    (re)indexed with their translated names, pharmaceutical forms,
    categories, lists/countries and observation flags.

    Args:
        med: a Medicine model instance.

    Returns:
        True on success, False if the solr request fails.
    """
    lists = []
    countries = []
    pharma_form_list = []
    pharma_form_type_list = []
    category_list = []
    observation_list = []

    # If the medicine is not active, delete it from the solr index.
    if not med.active:
        try:
            solr = SolrConnection(settings.SOLR_URL)
            solr.delete(id=str(med.id))
            solr.commit()
        except Exception:
            # Best-effort: failure is reported via the return value.
            # NOTE(review): consider logging the exception for diagnosis.
            return False
        return True

    # Build the multilingual name field.
    # ex.: en^codeine|pt-br^codeína|es^codeína
    medicine_translations = MedicineLocal.objects.filter(medicine=med.id)
    medicine_list = ['en^%s' % med.name.strip()]
    for translation in medicine_translations:
        medicine_list.append('%s^%s' % (translation.language,
                                        translation.name.strip()))
    medicine_list = "|".join(medicine_list)

    # Retrieve the active pharmaceutical forms of the current medicine.
    pharm_forms = med.pharmaceuticalform_set.filter(active=True)
    for form in pharm_forms:
        # ex. en^Tablet|es^Tableta|pt-br^Comprimido
        pharma_form_type_translations = "|".join(
            form.pharmaceutical_form_type.get_translations())
        pharma_form_type_list.append(pharma_form_type_translations)
        # ex. en^Tablet|es^Tableta|pt-br^Comprimido|comp^15 mg/ml
        pharma_form_list.append(
            '%s|comp^%s' % (pharma_form_type_translations, form.composition))

        # Build category_list (section and subsection where the current
        # pharmaceutical form is used on lists).
        section_pharm_form_list = SectionPharmForm.objects.filter(
            pharmaceutical_form=form)
        for section_pharm_form in section_pharm_form_list:
            # Add observation flags of the current section_pharm_form.
            if section_pharm_form.only_for_children:
                observation_list.append('only_for_children')
            if section_pharm_form.specialist_care_for_children:
                observation_list.append('specialist_care_for_children')
            if section_pharm_form.restriction_age:
                observation_list.append('restriction_age')
            if section_pharm_form.best_evidence:
                observation_list.append('best_evidence')
            if section_pharm_form.observation:
                observation_list.append('observation')

            section = Section.objects.get(pk=section_pharm_form.section.id)
            section_translations = "|".join(section.get_translations())
            # Index every ancestor section as a category as well,
            # skipping duplicates.
            section_tree = section.get_ancestors()
            if section_tree:
                for sec in section_tree:
                    category_translations = "|".join(sec.get_translations())
                    if category_translations not in category_list:
                        category_list.append(category_translations)
            if section_translations not in category_list:
                category_list.append(section_translations)

            # Lists of type 'c' are country lists; everything else goes
            # into the plain lists facet.
            list_associated = "|".join(section.list.get_translations())
            if section.list.type == 'c':
                if list_associated not in countries:
                    countries.append(list_associated)
            else:
                if list_associated not in lists:
                    lists.append(list_associated)

    # Check if the current medicine has evidence summaries.
    has_evidence = None
    evidence_total = MedicineEvidenceSummary.objects.filter(
        medicine=med.id).count()
    if evidence_total > 0:
        has_evidence = "true"

    # Try to create a connection to the solr server and send the medicine.
    try:
        solr = SolrConnection(settings.SOLR_URL)
        solr.add(
            id=str(med.id),
            type="medicine",
            name=medicine_list,
            pharmaceutical_form=pharma_form_list,
            pharmaceutical_form_type=pharma_form_type_list,
            list=lists,
            country=countries,
            category=category_list,
            observation=observation_list,
            has_evidence=has_evidence,
        )
        solr.commit()
    except Exception:
        # Best-effort: failure is reported via the return value.
        # NOTE(review): consider logging the exception for diagnosis.
        return False
    return True
def index_pages(only_missing=False):
    """Index all the pages that are modeled in the database.

    Args:
        only_missing: when True, only index pages not yet flagged as
            indexed; otherwise the entire page index is rebuilt.
    """
    solr = SolrConnection(settings.SOLR)

    page_qs = models.Page.objects.order_by("pk")
    if only_missing:
        page_qs = page_qs.filter(indexed=False)
    else:
        # FIXME: we should not churn the index when documents have not been deleted:
        solr.delete_query("type:page")

    # To avoid MySQL limitations, we'll run two queries: the first will only
    # lookup the primary keys to allow MySQL to satisfy the ORDER BY / LIMIT
    # using only the index and then we'll use the primary keys to lookup the
    # full Page objects for each chunk which will actually be indexed.
    full_page_qs = page_qs.prefetch_related(
        Prefetch(
            "issue",
            queryset=models.Issue.objects.prefetch_related(
                "batch",
                "title",
                "title__languages",
                "title__alt_titles",
                "title__subjects",
                "title__notes",
                "title__places",
                "title__urls",
                "title__essays",
                "title__country",
                "title__holdings",
            ),
        ))

    count = 0
    for pk_chunk in sliced(page_qs.values_list("pk", flat=True), 100):
        # We have to force the PKs into a list to work around limitations in
        # MySQL preventing the use of a subquery which uses LIMIT.
        # Materialize once and reuse it for both the filter and the count.
        chunk_pks = list(pk_chunk)
        chunk = full_page_qs.filter(pk__in=chunk_pks)

        docs = []
        pks = []
        for page in chunk:
            try:
                docs.append(page.solr_doc)
                pks.append(page.pk)
            except Exception:
                LOGGER.warning("Unable to index page %s", page.url,
                               exc_info=True)
                continue

        if docs:
            solr.add_many(docs)
            solr.commit()
            models.Page.objects.filter(pk__in=pks).update(indexed=True)

        # Count only pages actually sent to solr — the original counted the
        # whole chunk, over-reporting when building a solr_doc failed.
        count += len(pks)
        reset_queries()
        LOGGER.info("indexed %d pages", count)

    solr.commit()