def incrementalIndexingJob(commitNonblocking=0):
    """the incremental indexing job scans for changed bibliographic records
    from Horizon and updates them accordingly in Solr."""
    bibsToUpdate = horizonItemStatus.getChangedBibs(doDelete=0)
    print "\n[%s] updating %s bibs" % (time.ctime(), len(bibsToUpdate))
    bibCount = 0
    recordBatch = []
    for bibOn in bibsToUpdate:
        bibCount += 1
        availAt = horizonItemStatus.availableAt(bibOn)
        newRecordOn = horizonItemStatus.updateSolrRecordAvailability(bibOn, availAt, doPost=0)
        recordBatch.append(newRecordOn)
        # now delete item from queue
        horizonItemStatus.deleteFromIndexQueue(bibOn)
        print "-",
        if (bibCount % SOLR_INDEX_BATCH_SIZE) == 0:
            data = u''.join(recordBatch)
            print "*",
            resp = solrConnection.postURL(SOLR_UPDATE_URL, "<add>%s</add>" % data)
            recordBatch = []
            print "+%s+" % bibOn,
    # now do last batch
    if len(recordBatch) > 0:
        data = u''.join(recordBatch)
        resp = solrConnection.postURL(SOLR_UPDATE_URL, "<add>%s</add>" % data)
    if bibCount > 0:
        print "\n[%s] done updating bibs, now committing" % time.ctime()
        try:
            if commitNonblocking:
                solrConnection.commitNonblocking()
            else:
                solrConnection.commit()
        except IOError:
            print "Connection reset when talking to Solr, skipping this commit and sleeping 10 sec."
            time.sleep(10)
        print "[%s] done committing" % time.ctime()
    else:
        print "[%s] no bibs updated, exiting" % time.ctime()
    return bibCount
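# A minimal sketch of how incrementalIndexingJob might be driven as a polling
# loop. This driver is hypothetical (the real scheduling/entry point is not
# shown in this file), and the 60-second sleep interval is an assumed value.
def runIncrementalIndexingLoop(sleepSeconds=60):
    while 1:
        updated = incrementalIndexingJob(commitNonblocking=1)
        print "[%s] incremental pass updated %s bibs, sleeping %s sec" % (
            time.ctime(), updated, sleepSeconds)
        time.sleep(sleepSeconds)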
def processFilesInDirectory(dirName, anselUnicodeConverter=None, commitNonblocking=0,
                            numThreads=1, deleteAfterIndexing=1):
    """processes MARC and PMS files in the indexer-queue directory.
    If numThreads > 1 it will try to parallelize the MARC record processing
    (but not PMS indexing -- no reason for that)"""
    pmsFiles = glob.glob("%s/PMS*.DAT" % dirName)
    updatedAnyRecords = 0
    count = 0
    for fileOn in pmsFiles:
        print "processing PMS file %s" % fileOn
        processedFilenameOn, deletedBibsOn = processPMSFile(fileOn)
        if processedFilenameOn:
            print "processing MARC file %s" % processedFilenameOn
            indexerDriver.processFile(processedFilenameOn, anselUnicodeConverter)
            # now that we are done processing the file, we delete it.
            print "deleting MARC file %s" % processedFilenameOn
            os.remove(processedFilenameOn)
            print "deleting PMS file %s" % fileOn
            os.remove(fileOn)
            if deletedBibsOn:
                print "processing deleted bibs from MARC file %s" % processedFilenameOn
                for bibOn in deletedBibsOn:
                    print "deleting bib %s" % bibOn
                    indexerDriver.deleteRecord(bibOn)
            updatedAnyRecords = 1
        else:
            print "no records to index"
            os.remove(fileOn)
    print "[%s] now checking for MARC files" % time.ctime()
    _marcFiles = glob.glob("%s/*.MARC" % dirName)
    _marcFiles += glob.glob("%s/*.marc" % dirName)
    _marcFiles += glob.glob("%s/*.dat" % dirName)
    _marcFiles += glob.glob("%s/*.DAT" % dirName)
    _marcFiles += glob.glob("%s/*scriblio*" % dirName)
    # dedupe _marcFiles here in case a file matches more than one glob;
    # using a dictionary is the fastest way to dedupe a list with Jython
    marcFileDict = {}
    for fileOn in _marcFiles:
        marcFileDict[fileOn] = None
    marcFiles = marcFileDict.keys()
    marcFiles.sort()
    numMarcFiles = len(marcFiles)
    print "[%s] found %d files to process." % (time.ctime(), numMarcFiles)
    if numThreads == 1:
        for fileOn in marcFiles:
            print "processing MARC file %s" % fileOn
            count = indexerDriver.processFile(fileOn, anselUnicodeConverter, nonblocking=1)  # csdebug: added nonblocking here
            updatedAnyRecords = 1
            if deleteAfterIndexing:
                os.remove(fileOn)
    elif numThreads >= numMarcFiles:
        # spin off a thread for each one.
        # was getting weird problems with multithreading (AttributeErrors when trying to
        # iterate over all controlFields in a MARC record) -- trying separate
        # anselUnicodeConverters to see if that's the issue.
        threads = []
        threadrefs = []
        i = 0
        for fileOn in marcFiles:
            convOn = AnselToUnicode()
            jobOn = indexerDriver.processFileJob(fileOn, convOn, nonblocking=1, pid=i)  # csdebug: handle nonblocking option
            _threadOn = Thread(jobOn, "process file job %s" % i)
            threads.append(_threadOn)
            threadrefs.append(jobOn)
            print "starting thread %s processing file %s" % (i, fileOn)
            _threadOn.start()
            i += 1
        updatedAnyRecords = 1
        print "joining threads"
        for i in range(len(threads)):
            threads[i].join()
            # TODO: make sure the thread was successful before nuking.
            if deleteAfterIndexing:
                print "deleting %s" % threadrefs[i].filename
                os.remove(threadrefs[i].filename)
    else:
        # do work queue here.
        print "not yet implemented"
    # finally, do a commit here.
    if updatedAnyRecords:
        print "[%s] starting final commit" % time.ctime()
        if commitNonblocking:
            solrConnection.commitNonblocking()
        else:
            solrConnection.commit()
        print "[%s] done committing" % time.ctime()
    return count
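# A minimal usage sketch for processFilesInDirectory (hypothetical driver code;
# the queue directory path below is an assumed example, not taken from this file).
def sweepIndexerQueue(queueDir="/data/indexer-queue"):
    conv = AnselToUnicode()  # one shared converter is fine for a single-threaded sweep
    return processFilesInDirectory(queueDir, anselUnicodeConverter=conv,
                                   commitNonblocking=0, numThreads=1,
                                   deleteAfterIndexing=1)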
def processFile(filename, anselUnicodeConverter=None, nonblocking=0, pid=-1):
    # if nonblocking == 0 then all commits are blocking; if 1 then they are nonblocking.
    inStream = FileInputStream(filename)
    print "processFile>> %s" % filename
    marcReader = MarcStreamReader(inStream)
    data = ""
    count = 0
    lastCommitTime = None
    import time
    startTime = time.time()
    lastRecord = None
    lastBibNum = None
    m4j = None
    marcReaderTime = 0
    marcRecordToDictTime = 0
    extractorCreateTime = 0
    extractionTime = 0
    extractMethodTime = 0
    marcRecordForSolrTime = 0
    commitTime = 0
    updateTime = 0
    marcSerializeTime = 0
    accession = 0  # TODO: try and load serialized accession from somewhere
    serializedRecord = None
    recordBatch = []
    # get default properties file
    from loadPropsFile import *
    props = loadPropsFile(DEFAULT_INDEXING_PROPERTIES_FILE)
    while marcReader.hasNext() and count < NUM_RECORDS_TO_ADD:
        # if pid > -1:
        #     print (".%d" % pid),
        # else:
        #     print ".",
        # CSDEBUG
        accession += 1
        count += 1
        # TODO: improve error handling here (main problem is that Marc4J will fall over
        # at the sight of a bad record and there's no way to get it to just skip over
        # a bad record -- so there is little we can do, except better error messages!)
        try:
            mrTimeStart = time.time()
            marc4jRecord = marcReader.next()
            marcReaderTime += time.time() - mrTimeStart
        except:
            print "last record indexed was bib# %s" % lastBibNum
            import sys
            print "sys.exc_info is %s" % str(sys.exc_info())
            sys.exit(1)
        mrsTime = time.time()
        # try:
        rec = solrIndexingUtils.recordForSolr(marc4jRecord, anselUnicodeConverter, propsObject=props)
        # except:
        #     print "exception processing record, skipping"  # TODO: error handling
        #     continue
        marcRecordForSolrTime += time.time() - mrsTime
        extractionTime += rec._extractionTime
        extractorCreateTime += rec._extractorCreateTime
        marcRecordToDictTime += rec._marcRecordToDictTime
        extractMethodTime += rec._extractMethodTime
        if hasattr(rec, "bib_num"):
            recordBatch.append(rec)
            lastBibNum = rec.bib_num
        else:
            print "not adding record %s; no bib_num present!" % rec
        if (count % SOLR_INDEX_BATCH_SIZE) == 0:
            # nb. neither apache commons nor python urllib works right here! Unicode gets
            # mangled. Must use postURL.
            # fetch the item status info if required.
            if DO_ITEM_STATUS_INDEXING:
                bibs = [x.bib_num for x in recordBatch]
                avail = horizonItemStatus.availableAt(bibs)
                for x in recordBatch:
                    x.available = avail[x.bib_num]
            mrserTime = time.time()
            data = u"".join([x.serialize() for x in recordBatch])
            recordBatch = []
            marcSerializeTime += time.time() - mrserTime
            startUpdateTime = time.time()
            try:
                resp = solrConnection.postURL(SOLR_UPDATE_URL, "<add>%s</add>" % data)
            except IOError:
                print "Connection reset when talking to Solr, skipping this commit and sleeping 10 sec."
                time.sleep(10)
                resp = solrConnection.postURL(SOLR_UPDATE_URL, "<add>%s</add>" % data)
                # if it fails again here, we want to just bomb out.
            if resp.find('<result status="1"') > -1:
                print "\nError POSTing documents! Response from Solr was\n\n%s\n" % resp
                # TODO: put in retry/continue code here for failed updates/slowdowns on Solr
                # TODO: parse result status and do something if there is an error (like print stacktrace)
            updateTime += time.time() - startUpdateTime
            if pid > -1:
                print ("*%d" % pid),
            else:
                print "*",
            if PRINT_SOLR_POST_DATA:
                print "\n\n<add>%s</add>\n\n" % data
            data = ""
        if (count % SOLR_COMMIT_BATCH_SIZE) == 0:
            try:
                print "committing..."
                beginCommitTime = time.time()
                if nonblocking:
                    print "doing nonblocking commit"
                    solrConnection.commitNonblocking()
                else:
                    solrConnection.commit()
                commitTime += time.time() - beginCommitTime
            except IOError:
                import time
                print "Connection reset when talking to Solr, skipping this commit and sleeping 10 sec."
                time.sleep(10)
            if lastCommitTime:
                thisBatchRate = (0.0 + SOLR_COMMIT_BATCH_SIZE) / (time.time() - lastCommitTime)
                overallRate = (0.0 + count) / (time.time() - startTime)
                if pid > -1:
                    print "\n>>>>>>>>>>>>COMMIT for PID %s<<<<<<<<<<<<<<<\n" % pid  # csdebug
                print "[%s] %s records indexed\t| This Batch: %.4f records/sec|\tOverall: %.4f records/sec" % (
                    time.ctime(), count, thisBatchRate, overallRate)
                if PROFILE:
                    print """\nfile->MARC: %.4f\nMARC->py: %.4f\npy->XML: %.4f\n""" % (
                        marcReaderTime, marcRecordForSolrTime, marcSerializeTime)
                    print """MARC to dict: %.4f\ncreate extractor: %.4f\nextraction: %.4f\n\textract method: %.4f""" % (
                        marcRecordToDictTime, extractorCreateTime, extractionTime, extractMethodTime)
                    print """Solr Update: %.4f\nSolr Commit: %.4f\n""" % (updateTime, commitTime)
            lastCommitTime = time.time()
        if (count % SOLR_OPTIMIZE_BATCH_SIZE) == 0:
            print "[%s] FORCING OPTIMIZE..." % time.ctime()
            solrConnection.optimize()
            print "[%s] OPTIMIZE done" % time.ctime()
            System.gc()
    # do last batch here
    if len(recordBatch) > 0:
        print "doing final POST"
        mrserTime = time.time()
        data = "".join([x.serialize() for x in recordBatch])
        recordBatch = []
        resp = solrConnection.postURL(SOLR_UPDATE_URL, "<add>%s</add>" % data)
        if resp.find('<result status="1"') > -1:
            print "\nError POSTing documents! Response from Solr was\n\n%s\n\n" % resp
    print "committing..."
    if nonblocking:
        solrConnection.commitNonblocking()
    else:
        solrConnection.commit()
    inStream.close()
    return count
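# The TODOs in processFile call for retry/continue handling around failed Solr
# updates. Below is a minimal sketch of one way that could look -- hypothetical
# helper code, assuming solrConnection.postURL raises IOError on connection
# problems; the retry count and sleep interval are assumed values.
def postWithRetry(url, body, attempts=3, sleepSeconds=10):
    for attemptOn in range(attempts):
        try:
            return solrConnection.postURL(url, body)
        except IOError:
            if attemptOn == attempts - 1:
                raise  # out of retries; let the caller decide whether to bomb out
            print "POST to Solr failed (attempt %s of %s), sleeping %s sec" % (
                attemptOn + 1, attempts, sleepSeconds)
            time.sleep(sleepSeconds)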