def incrementalIndexingJob(commitNonblocking=0):
    """The incremental indexing job scans for changed bibliographic records
    from Horizon and updates them accordingly in Solr."""
    bibsToUpdate = horizonItemStatus.getChangedBibs(doDelete=0)
    print "\n[%s] updating %s bibs" % (time.ctime(), len(bibsToUpdate))
    bibCount = 0
    recordBatch = []
    for bibOn in bibsToUpdate:
        bibCount += 1
        availAt = horizonItemStatus.availableAt(bibOn)
        # build the updated record but defer the POST so we can batch it
        newRecordOn = horizonItemStatus.updateSolrRecordAvailability(bibOn, availAt, doPost=0)
        recordBatch.append(newRecordOn)
        # now delete item from queue
        horizonItemStatus.deleteFromIndexQueue(bibOn)
        print "-",  # progress marker: one dash per bib
        if (bibCount % SOLR_INDEX_BATCH_SIZE) == 0:
            data = u''.join(recordBatch)
            print "*",  # progress marker: one star per batch POSTed
            resp = solrConnection.postURL(SOLR_UPDATE_URL, "<add>%s</add>" % data)
            recordBatch = []
            print "+%s+" % bibOn,
    # now do the last, partial batch
    if len(recordBatch) > 0:
        data = u''.join(recordBatch)
        resp = solrConnection.postURL(SOLR_UPDATE_URL, "<add>%s</add>" % data)
    if bibCount > 0:
        print "\n[%s] done updating bibs, now committing" % time.ctime()
        try:
            if commitNonblocking:
                solrConnection.commitNonblocking()
            else:
                solrConnection.commit()
        except IOError:
            print "Connection reset when talking to Solr, skipping this commit and sleeping 10 sec."
            time.sleep(10)
        print "[%s] done committing" % time.ctime()
    else:
        print "[%s] no bibs updated, exiting" % time.ctime()
    return bibCount
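# Hypothetical usage sketch, not part of the original module: run the
# incremental job in a simple polling loop (e.g. from a cron-style wrapper).
# The function name, sleep interval, and commitNonblocking=1 choice below are
# illustrative assumptions.
def runIncrementalLoop(sleepSeconds=60):
    """Poll Horizon for changed bibs and push them to Solr until interrupted."""
    while 1:
        try:
            updated = incrementalIndexingJob(commitNonblocking=1)
        except KeyboardInterrupt:
            break
        time.sleep(sleepSeconds)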
def updateSolrRecordAvailability(bibNum, availableLocations=[], doPost=1):
    """Update an already-indexed Solr record with new location information.

    It does this by grabbing the record in Python format (wt=python), evaling
    it as a Python object, updating that one attribute, and then
    re-serializing it as XML. WHEW/UGH! The method is so convoluted because
    Lucene does not allow you to update just one field in an indexed document
    -- you must delete the whole document and re-add it.
    """
    urlToGet = "%s?q=bib_num:%s&wt=python" % (SOLR_QUERY_URL, bibNum)
    # TODO: better error handling
    try:
        u = urllib.urlopen(urlToGet)
        data = u.read()
        u.close()
    except IOError:
        # connections sometimes reset; sleep a little while and try again --
        # if it fails again, bomb out with an error.
        print "IOError while trying to get URL %s" % urlToGet
        time.sleep(1.0)
        u = urllib.urlopen(urlToGet)
        data = u.read()
        u.close()
    try:
        evData = eval(data)
    except:
        print "error while trying to eval record for %s" % bibNum
    # if the eval failed, evData is unbound, the lookup below raises, and we
    # toss the raw response back to the caller
    try:
        docOn = evData['response']['docs'][0]
    except:
        return data
    docOn['available'] = availableLocations
    # now serialize the updated doc as XML
    ret = u""
    for kOn in docOn.keys():
        valueOn = docOn[kOn]
        if type(valueOn) == type([]):
            for subvalueOn in valueOn:
                sv = str(subvalueOn).replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
                ret += u"""<field name="%s">%s</field>""" % (kOn, sv)
        else:
            val = str(valueOn).replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
            ret += u"""<field name="%s">%s</field>""" % (kOn, val)
    # now POST the update if required
    if doPost:
        resp = solrConnection.postURL(SOLR_UPDATE_URL, "<add><doc>%s</doc></add>" % ret)
        return resp
    else:
        return "<doc>%s</doc>" % ret
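# Illustrative alternative (an assumption, not something the original module
# defines): the stdlib xml.sax.saxutils.escape() covers the &, < and >
# escaping that the hand-rolled replace() chain above performs, in one call.
# A minimal sketch:
from xml.sax.saxutils import escape

def xmlFieldValue(name, value):
    """Serialize one Solr <field> element with XML-escaped text content."""
    # str() coercion mirrors what updateSolrRecordAvailability does above
    return u'<field name="%s">%s</field>' % (name, escape(str(value)))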
def processFile(filename, anselUnicodeConverter=None, nonblocking=0, pid=-1):
    # if nonblocking == 0, all commits are blocking; if 1, they are nonblocking.
    inStream = FileInputStream(filename)
    print "processFile>> %s" % filename
    marcReader = MarcStreamReader(inStream)
    data = ""
    count = 0
    lastCommitTime = None
    import time
    startTime = time.time()
    lastRecord = None
    lastBibNum = None
    m4j = None
    # per-stage profiling counters
    marcReaderTime = 0
    marcRecordToDictTime = 0
    extractorCreateTime = 0
    extractionTime = 0
    extractMethodTime = 0
    marcRecordForSolrTime = 0
    commitTime = 0
    updateTime = 0
    marcSerializeTime = 0
    accession = 0  # TODO: try and load serialized accession # from somewhere
    serializedRecord = None
    recordBatch = []
    # get default properties file
    from loadPropsFile import *
    props = loadPropsFile(DEFAULT_INDEXING_PROPERTIES_FILE)
    while marcReader.hasNext() and count < NUM_RECORDS_TO_ADD:
        #if pid > -1:
        #    print (".%d" % pid),
        #else:
        #    print ".",  # CSDEBUG
        accession += 1
        count += 1
        # TODO: improve error handling here (the main problem is that Marc4J will
        # fall over at the sight of a bad record and there's no way to get it to
        # just skip over a bad record -- so there is little we can do, except
        # better error messages!)
        try:
            mrTimeStart = time.time()
            marc4jRecord = marcReader.next()
            marcReaderTime += time.time() - mrTimeStart
        except:
            print "last record indexed was bib# %s" % lastBibNum
            import sys
            print "sys.exc_info is %s" % str(sys.exc_info())
            sys.exit(1)
        mrsTime = time.time()
        #try:
        rec = solrIndexingUtils.recordForSolr(marc4jRecord, anselUnicodeConverter, propsObject=props)
        #except:
        #    print "exception processing record, skipping"  # TODO: error handling
        #    continue
        marcRecordForSolrTime += time.time() - mrsTime
        extractionTime += rec._extractionTime
        extractorCreateTime += rec._extractorCreateTime
        marcRecordToDictTime += rec._marcRecordToDictTime
        extractMethodTime += rec._extractMethodTime
        if hasattr(rec, "bib_num"):
            recordBatch.append(rec)
            lastBibNum = rec.bib_num
        else:
            print "not adding record %s; no bib_num present!" % rec
        if (count % SOLR_INDEX_BATCH_SIZE) == 0:
            # n.b. neither Apache Commons nor Python's urllib works right here --
            # Unicode gets mangled. Must use postURL.
            # fetch the item status info if required.
            if DO_ITEM_STATUS_INDEXING:
                bibs = [x.bib_num for x in recordBatch]
                avail = horizonItemStatus.availableAt(bibs)
                for x in recordBatch:
                    x.available = avail[x.bib_num]
            mrserTime = time.time()
            data = u''.join([x.serialize() for x in recordBatch])
            recordBatch = []
            marcSerializeTime += time.time() - mrserTime
            startUpdateTime = time.time()
            try:
                resp = solrConnection.postURL(SOLR_UPDATE_URL, "<add>%s</add>" % data)
            except IOError:
                print "Connection reset when talking to Solr, skipping this commit and sleeping 10 sec."
                time.sleep(10)
                # if it fails again here, we want to just bomb out.
                resp = solrConnection.postURL(SOLR_UPDATE_URL, "<add>%s</add>" % data)
            if resp.find('<result status="1"') > -1:
                print "\nError POSTing documents! Response from Solr was\n\n%s\n" % resp
            # TODO: put in retry/continue code here for failed updates/slowdowns on Solr
            # TODO: parse result status and do something if there is an error (like print a stacktrace)
            updateTime += time.time() - startUpdateTime
            if pid > -1:
                print "*%d" % pid,
            else:
                print "*",
            if PRINT_SOLR_POST_DATA:
                print "\n\n<add>%s</add>\n\n" % data
            data = ""
        if (count % SOLR_COMMIT_BATCH_SIZE) == 0:
            try:
                print "committing..."
                beginCommitTime = time.time()
                if nonblocking:
                    print "doing nonblocking commit"
                    solrConnection.commitNonblocking()
                else:
                    solrConnection.commit()
                commitTime += time.time() - beginCommitTime
            except IOError:
                print "Connection reset when talking to Solr, skipping this commit and sleeping 10 sec."
                time.sleep(10)
            if lastCommitTime:
                thisBatchRate = (0.0 + SOLR_COMMIT_BATCH_SIZE) / (time.time() - lastCommitTime)
                overallRate = (0.0 + count) / (time.time() - startTime)
                if pid > -1:
                    print "\n>>>>>>>>>>>>COMMIT for PID %s<<<<<<<<<<<<<<<\n" % pid  # csdebug
                print "[%s] %s records indexed\t| This Batch: %.4f records/sec|\tOverall: %.4f records/sec" % (
                    time.ctime(), count, thisBatchRate, overallRate)
                if PROFILE:
                    print "\nfile->MARC: %.4f\nMARC->py: %.4f\npy->XML: %.4f\n" % (
                        marcReaderTime, marcRecordForSolrTime, marcSerializeTime)
                    print "MARC to dict: %.4f\ncreate extractor: %.4f\nextraction: %.4f\n\textract method: %.4f" % (
                        marcRecordToDictTime, extractorCreateTime, extractionTime, extractMethodTime)
                    print "Solr Update: %.4f\nSolr Commit: %.4f\n" % (updateTime, commitTime)
            lastCommitTime = time.time()
        if (count % SOLR_OPTIMIZE_BATCH_SIZE) == 0:
            print "[%s] FORCING OPTIMIZE..." % time.ctime()
            solrConnection.optimize()
            print "[%s] OPTIMIZE done" % time.ctime()
            System.gc()
    # do the last, partial batch here
    if len(recordBatch) > 0:
        print "doing final POST"
        mrserTime = time.time()
        data = u''.join([x.serialize() for x in recordBatch])
        recordBatch = []
        resp = solrConnection.postURL(SOLR_UPDATE_URL, "<add>%s</add>" % data)
        if resp.find('<result status="1"') > -1:
            print "\nError POSTing documents! Response from Solr was\n\n%s\n\n" % resp
    # final commit covers the last POST and any uncommitted earlier batches
    print "committing..."
    if nonblocking:
        solrConnection.commitNonblocking()
    else:
        solrConnection.commit()
    inStream.close()
    return count
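# Hypothetical driver sketch, not in the original file: index every MARC file
# in a directory, one file at a time. The directory layout, the ".mrc"
# extension, and the nonblocking=1 choice are assumptions for illustration.
def processDirectory(dirName, anselUnicodeConverter=None):
    import os
    total = 0
    for name in os.listdir(dirName):
        if name.endswith(".mrc"):
            total += processFile(os.path.join(dirName, name),
                                 anselUnicodeConverter, nonblocking=1)
    print "[%s] indexed %s records from %s" % (time.ctime(), total, dirName)
    return total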