Example #1
def incrementalIndexingJob(commitNonblocking=0):
    """the incremental indexing job scans for changed bibliographic records from Horizon and updates
    them accordingly in Solr."""
    bibsToUpdate = horizonItemStatus.getChangedBibs(doDelete=0)
    print "\n[%s] updating %s bibs" % (time.ctime(), len(bibsToUpdate))
    bibCount = 0
    recordBatch = []
    for bibOn in bibsToUpdate:
        bibCount += 1
        availAt = horizonItemStatus.availableAt(bibOn)
        newRecordOn = horizonItemStatus.updateSolrRecordAvailability(bibOn,
                                                                     availAt,
                                                                     doPost=0)
        recordBatch.append(newRecordOn)
        # now delete the item from the queue (nb. this happens before the
        # batch is POSTed, so a failed POST loses these queue entries)
        horizonItemStatus.deleteFromIndexQueue(bibOn)
        print "-",
        if (bibCount % SOLR_INDEX_BATCH_SIZE) == 0:
            data = u''.join(recordBatch)
            print "*",
            resp = solrConnection.postURL(SOLR_UPDATE_URL,
                                          "<add>%s</add>" % data)
            recordBatch = []
        print("+%s+" % bibOn),
    # now do last batch
    if len(recordBatch) > 0:
        data = u''.join(recordBatch)
        resp = solrConnection.postURL(SOLR_UPDATE_URL, "<add>%s</add>" % data)
    if bibCount > 0:
        print "\n[%s] done updating bibs, now committing" % time.ctime()
        try:
            if commitNonblocking:
                solrConnection.commitNonblocking()
            else:
                solrConnection.commit()
            print "[%s] done committing" % time.ctime()
        except IOError:
            print "Connection reset when talking to Solr, skipping this commit and sleeping 10 sec."
            time.sleep(10)
    else:
        print "[%s] no bibs updated, exiting" % time.ctime()
    return bibCount
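A minimal, self-contained sketch of the batch-and-flush pattern the function above relies on: POST every BATCH_SIZE records, then flush whatever is left after the loop. BATCH_SIZE and the post callable are hypothetical stand-ins for SOLR_INDEX_BATCH_SIZE and solrConnection.postURL.

# Sketch of the batch-and-flush pattern; `post` stands in for
# solrConnection.postURL and BATCH_SIZE for SOLR_INDEX_BATCH_SIZE.
BATCH_SIZE = 100

def postInBatches(serializedRecords, post):
    batch = []
    for recOn in serializedRecords:
        batch.append(recOn)
        if len(batch) == BATCH_SIZE:
            post("<add>%s</add>" % u''.join(batch))
            batch = []
    if batch:
        # the loop above leaves up to BATCH_SIZE - 1 records unposted
        post("<add>%s</add>" % u''.join(batch))

posts = []
postInBatches([u"<doc>%d</doc>" % i for i in range(250)], posts.append)
print len(posts)  # 3 POSTs: two full batches of 100, then a final 50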
Example #4
def processFilesInDirectory(dirName,
                            anselUnicodeConverter=None,
                            commitNonblocking=0,
                            numThreads=1,
                            deleteAfterIndexing=1):
    """processes MARC and PMS files in the indexer-queue directory.  If numThreads > 1
    it will try to parallelize the MARC record processing (but not PMS indexing -- no reason for that)     
    """
    pmsFiles = glob.glob("%s/PMS*.DAT" % dirName)
    updatedAnyRecords = 0
    count = 0
    for fileOn in pmsFiles:
        print "processing PMS file %s" % fileOn
        processedFilenameOn, deletedBibsOn = processPMSFile(fileOn)
        if processedFilenameOn:
            print "processing MARC file %s" % processedFilenameOn
            indexerDriver.processFile(processedFilenameOn,
                                      anselUnicodeConverter)

            # now that we are done processing the file, we delete it.
            print "deleting MARC file %s " % processedFilenameOn
            os.remove(processedFilenameOn)
            print "deleting PMS file %s" % fileOn
            os.remove(fileOn)

            if deletedBibsOn:
                print "processing deleted bibs from MARC file %s" % processedFilenameOn
                for bibOn in deletedBibsOn:
                    print "deleting bib %s" % bibOn
                    indexerDriver.deleteRecord(bibOn)
            updatedAnyRecords = 1
        else:
            print "no records to index"
            os.remove(fileOn)

    print "[%s] now checking for MARC files" % time.ctime()
    _marcFiles = glob.glob("%s/*.MARC" % dirName)
    _marcFiles += glob.glob("%s/*.marc" % dirName)
    _marcFiles += glob.glob("%s/*.dat" % dirName)
    _marcFiles += glob.glob("%s/*.DAT" % dirName)
    _marcFiles += glob.glob("%s/*scriblio*" % dirName)
    # dedupe _marcFiles here in case a file matches more than one glob;
    # using a dictionary is the fastest way to dedupe a list with Jython
    marcFileDict = {}
    for fileOn in _marcFiles:
        marcFileDict[fileOn] = None
    marcFiles = marcFileDict.keys()
    marcFiles.sort()

    numMarcFiles = len(marcFiles)
    print "[%s] found %d files to process." % (time.ctime(), numMarcFiles)
    if numThreads == 1:
        for fileOn in marcFiles:
            print "processing MARC file %s" % fileOn
            count = indexerDriver.processFile(
                fileOn, anselUnicodeConverter,
                nonblocking=1)  # csdebug: added nonblocking here
            # nb. count is overwritten each time through, so only the last
            # file's record count is returned from this function
            updatedAnyRecords = 1
            if deleteAfterIndexing:
                os.remove(fileOn)
    elif numThreads >= numMarcFiles:
        # spin off a thread for each file.
        # was getting weird problems with multithreading (AttributeErrors when
        # trying to iterate over all controlFields in a MARC record); trying
        # separate anselUnicodeConverters to see if that's the issue.
        threads = []
        threadrefs = []
        i = 0
        for fileOn in marcFiles:
            convOn = AnselToUnicode()
            jobOn = indexerDriver.processFileJob(
                fileOn, convOn, nonblocking=1,
                pid=i)  # csdebug: handle nonblocking option
            _threadOn = Thread(jobOn, "process file job %s" % i)
            threads.append(_threadOn)
            threadrefs.append(jobOn)
            print "starting thread %s processing file %s" % (i, fileOn)
            _threadOn.start()
            i += 1
            updatedAnyRecords = 1
        print "joining threads"
        for i in range(len(threads)):
            threads[i].join()
            # TODO: make sure the thread was successful before nuking.
            if deleteAfterIndexing:
                print "deleting %s" % threadrefs[i].filename
                os.remove(threadrefs[i].filename)

    else:
        # TODO: implement a work queue for 1 < numThreads < numMarcFiles
        print "not yet implemented"
    # finally, do a commit here.
    if updatedAnyRecords:
        print "[%s] starting final commit" % time.ctime()
        if commitNonblocking:
            solrConnection.commitNonblocking()
        else:
            solrConnection.commit()
        print "[%s] done committing" % time.ctime()
    return count
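For reference, the threading in the elif branch above works because Jython lets a Python object that implements java.lang.Runnable be passed straight to the java.lang.Thread(Runnable, String) constructor. A minimal sketch of that pattern, runnable under Jython only; FileJob is a hypothetical stand-in for indexerDriver.processFileJob.

from java.lang import Thread, Runnable

class FileJob(Runnable):
    # hypothetical stand-in for indexerDriver.processFileJob
    def __init__(self, filename, pid):
        self.filename = filename
        self.pid = pid

    def run(self):
        print "[job %s] processing %s" % (self.pid, self.filename)

jobs = [FileJob(filename, i)
        for i, filename in enumerate(["a.marc", "b.marc"])]
threads = [Thread(jobOn, "process file job %s" % i)
           for i, jobOn in enumerate(jobs)]
for threadOn in threads:
    threadOn.start()
for threadOn in threads:
    threadOn.join()  # join before touching jobOn.filename, as the code above does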
Example #6
def processFile(filename, anselUnicodeConverter=None, nonblocking=0, pid=-1):
    """Index the MARC records in filename into Solr.

    If nonblocking == 0 then all commits are blocking; if 1 they are
    nonblocking. When pid > -1 it tags this job's progress output in
    multithreaded runs.
    """
    inStream = FileInputStream(filename)
    print "processFile>> %s" % filename
    marcReader = MarcStreamReader(inStream)
    data = ""
    count = 0
    lastCommitTime = None
    import time
    startTime = time.time()
    lastRecord = None
    lastBibNum = None
    m4j = None
    marcReaderTime = 0
    marcRecordToDictTime = 0
    extractorCreateTime = 0
    extractionTime = 0
    extractMethodTime = 0
    marcRecordForSolrTime = 0
    commitTime = 0
    updateTime = 0
    marcSerializeTime = 0
    accession = 0  # TODO: try and load serialized accession # from somewhere
    serializedRecord = None
    recordBatch = []
    # get default properties file
    from loadPropsFile import loadPropsFile
    props = loadPropsFile(DEFAULT_INDEXING_PROPERTIES_FILE)

    while marcReader.hasNext() and count < NUM_RECORDS_TO_ADD:
        #if pid > -1:
        #    print (".%d" % pid),
        #else:
        #    print ".",
        # CSDEBUG
        accession += 1
        count += 1
        # TODO: improve error handling here (the main problem is that Marc4J
        # will fall over at the sight of a bad record and there's no way to
        # get it to just skip over a bad record, so there is little we can
        # do except print better error messages!)
        try:
            mrTimeStart = time.time()
            marc4jRecord = marcReader.next()
            marcReaderTime += (time.time() - mrTimeStart)
        except:
            print "last record indexed was bib# %s " % lastBibNum
            import sys
            print "sys.exc_info is %s" % str(sys.exc_info())
            sys.exit(1)

        mrsTime = time.time()
        #try:
        rec = solrIndexingUtils.recordForSolr(marc4jRecord,
                                              anselUnicodeConverter,
                                              propsObject=props)
        #except:
        #    print "exception processing record, skipping"    # TODO: error handling
        #    continue
        marcRecordForSolrTime += (time.time() - mrsTime)
        extractionTime += rec._extractionTime
        extractorCreateTime += rec._extractorCreateTime
        marcRecordToDictTime += rec._marcRecordToDictTime
        extractMethodTime += rec._extractMethodTime

        if hasattr(rec, "bib_num"):
            recordBatch.append(rec)
            lastBibNum = rec.bib_num
        else:
            print "not adding record %s; no bib_num present!" % rec

        if (count % SOLR_INDEX_BATCH_SIZE) == 0:
            # nb. neither apache commons nor python urllib works right here!
            # Unicode gets mangled; must use postURL.

            # fetch the item status info if required.
            if DO_ITEM_STATUS_INDEXING:
                bibs = [x.bib_num for x in recordBatch]
                avail = horizonItemStatus.availableAt(bibs)
                for x in recordBatch:
                    x.available = avail[x.bib_num]

            mrserTime = time.time()
            data = u''.join([x.serialize() for x in recordBatch])
            recordBatch = []
            marcSerializeTime += (time.time() - mrserTime)

            startUpdateTime = time.time()
            try:
                resp = solrConnection.postURL(SOLR_UPDATE_URL,
                                              "<add>%s</add>" % data)

            except IOError:
                print "Connection reset when talking to Solr, skipping this commit and sleeping 10 sec."
                time.sleep(10)
                resp = solrConnection.postURL(SOLR_UPDATE_URL,
                                              "<add>%s</add>" % data)
                # if it fails again here, we want to just bomb out.
            if resp.find('<result status="1"') > -1:
                print "\nError POSTing documents!  Response from Solr was\n\n%s\n" % resp
            # TODO: put in retry/continue code here for failed updates/slowdowns on Solr
            # TODO: parse result status and do something if there is an error (like print stacktrace)
            updateTime += (time.time() - startUpdateTime)
            if pid > -1:
                print("*%d" % pid),
            else:
                print "*",
            if PRINT_SOLR_POST_DATA:
                print "\n\n<add>%s</add>\n\n" % data
            data = ""
        if (count % SOLR_COMMIT_BATCH_SIZE) == 0:
            try:
                print "committing..."
                beginCommitTime = time.time()
                if nonblocking:
                    print "doing nonblocking commit"
                    solrConnection.commitNonblocking()
                else:
                    solrConnection.commit()
                commitTime += (time.time() - beginCommitTime)
            except IOError:
                print "Connection reset when talking to Solr, skipping this commit and sleeping 10 sec."
                time.sleep(10)
            if lastCommitTime:
                # 0.0 + forces float division under Python 2
                thisBatchRate = ((0.0 + SOLR_COMMIT_BATCH_SIZE) /
                                 (time.time() - lastCommitTime))
                overallRate = ((0.0 + count) / (time.time() - startTime))
                if pid > -1:
                    print "\n>>>>>>>>>>>>COMMIT for PID %s<<<<<<<<<<<<<<<\n" % pid  # csdebug
                print "[%s] %s records indexed\t| This Batch: %.4f records/sec|\tOverall: %.4f records/sec" % (
                    time.ctime(), count, thisBatchRate, overallRate)
                if PROFILE:
                    print """\nfile->MARC: %.4f\nMARC->py: %.4f\npy->XML: %.4f\n""" % (
                        marcReaderTime, marcRecordForSolrTime,
                        marcSerializeTime)
                    print """MARC to dict: %.4f\ncreate extractor: %.4f\nextraction: %.4f\n\textract method: %.4f""" % (
                        marcRecordToDictTime, extractorCreateTime,
                        extractionTime, extractMethodTime)
                    print """Solr Update: %.4f\nSolr Commit: %.4f\n""" % (
                        updateTime, commitTime)
            lastCommitTime = time.time()
        if (count % SOLR_OPTIMIZE_BATCH_SIZE) == 0:
            print "[%s] FORCING OPTIMIZE..." % time.ctime()
            solrConnection.optimize()
            print "[%s] OPTIMIZE done" % time.ctime()
            System.gc()
    # do last batch here
    if len(recordBatch) > 0:
        print "doing final POST"
        mrserTime = time.time()
        data = u''.join([x.serialize() for x in recordBatch])  # u'' to match the in-loop join
        recordBatch = []

        resp = solrConnection.postURL(SOLR_UPDATE_URL, "<add>%s</add>" % data)
        if resp.find('<result status="1"') > -1:
            print "\nError POSTing documents!  Response from Solr was\n\n%s\n\n" % resp
    print "committing..."
    if nonblocking:
        solrConnection.commitNonblocking()
    else:
        solrConnection.commit()
    inStream.close()
    return count
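The POST logic above retries exactly once on IOError: sleep ten seconds, resend, and let a second failure propagate (the "bomb out" comment). A sketch of that policy factored into a helper, where postURL is assumed to be any callable with the same (url, payload) signature as solrConnection.postURL.

import time

def postWithOneRetry(postURL, url, payload, waitSecs=10):
    # retry exactly once on a connection reset; a second IOError propagates
    try:
        return postURL(url, payload)
    except IOError:
        print "Connection reset when talking to Solr, sleeping %s sec and retrying." % waitSecs
        time.sleep(waitSecs)
        return postURL(url, payload)

# usage sketch:
# resp = postWithOneRetry(solrConnection.postURL, SOLR_UPDATE_URL,
#                         "<add>%s</add>" % data)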