print "[%s] done processing indexer-queue contents" % time.ctime()
     
     # c. do status changes.
     if doItemStatusIndexing:
         count += horizonIncrementalIndexer.incrementalIndexingJob(commitNonblocking=1)    
     lastRunTime = time.time()
     firstTime = 0
 else:
     print "not time to run yet, last ran %.4f ago" % (now-lastRunTime)
 ## 2nd do optimize if nec.
 now = time.time()
 justOptimized =0
 if optimizeInterval:
     if( (now - lastOptimizeTime) >= optimizeInterval ):
         print "[%s] optimizing" % time.ctime()
         solrConnection.optimize()
         lastOptimizeTime = time.time()
         justOptimized = 1
 if count > optimizeCountInterval and (not justOptimized):
     print "count is %s, forcing OPTIMIZE" % count
     solrConnection.optimize()
     count = 0
 ## 3rd do facet warm if necessary.
 now = time.time()
 if facetWarmInterval:
     if( (now - lastFacetWarmTime) >= facetWarmInterval ):
         print "[%s] facets haven't been warmed in %.4f; warming them" % (time.ctime(), ( now-lastFacetWarmTime) )
         facetWarmer.warmFacets()
         lastFacetWarmTime = time.time()
 ## 4th do complete refresh of yesterday's modified records, if necessary.
 if doItemStatusYesterdayRefresh:
            recordBatch = []
        print ("+%s+" % bibOn) ,
    # now do last batch
    if len(recordBatch) > 0:
        data = u''.join( recordBatch )
        resp = solrConnection.postURL( SOLR_UPDATE_URL, "<add>%s</add>" % data )
    if bibCount > 0:
        print "\n[%s] done updating bibs, now committing" % time.ctime()
        try:
            if commitNonblocking:
                solrConnection.commitNonblocking()
            else:
                solrConnection.commit()
        except IOError:
            print "Connection reset when talking to Solr, skipping this commit and sleeping 10 sec."
            time.sleep(10)
        print "[%s] done committing" % time.ctime()
    else:
        print "[%s] no bibs updated, exiting" % time.ctime()
    return bibCount


# Script entry point: hands HORIZON_BASE_DIR to processFilesInDirectory, then,
# when DO_OPTIMIZE is set, runs a Solr optimize followed by a facet warm.
if __name__ == '__main__':

    processFilesInDirectory(HORIZON_BASE_DIR)
    # finally, do an optimize here
    if DO_OPTIMIZE:
        print "starting final optimize"
        # optimize first, then warm the facets (same order as written below)
        solrConnection.optimize()    # csdebug
        facetWarmer.warmFacets()
    
# Example #3
# 0
            recordBatch = []
        print("+%s+" % bibOn),
    # now do last batch
    if len(recordBatch) > 0:
        data = u''.join(recordBatch)
        resp = solrConnection.postURL(SOLR_UPDATE_URL, "<add>%s</add>" % data)
    if bibCount > 0:
        print "\n[%s] done updating bibs, now committing" % time.ctime()
        try:
            if commitNonblocking:
                solrConnection.commitNonblocking()
            else:
                solrConnection.commit()
        except IOError:
            print "Connection reset when talking to Solr, skipping this commit and sleeping 10 sec."
            time.sleep(10)
        print "[%s] done committing" % time.ctime()
    else:
        print "[%s] no bibs updated, exiting" % time.ctime()
    return bibCount


# Script entry point: hands HORIZON_BASE_DIR to processFilesInDirectory, then,
# when DO_OPTIMIZE is set, runs a Solr optimize followed by a facet warm.
if __name__ == '__main__':

    processFilesInDirectory(HORIZON_BASE_DIR)
    # finally, do an optimize here
    if DO_OPTIMIZE:
        print "starting final optimize"
        # optimize first, then warm the facets (same order as written below)
        solrConnection.optimize()  # csdebug
        facetWarmer.warmFacets()
def processFile(filename, anselUnicodeConverter=None, nonblocking=0, pid=-1):
    # if nonblocking == 0 then all commits are blocking; if 1 then they are nonblocking.
    inStream = FileInputStream(filename)
    print "processFile>> %s" % filename
    marcReader = MarcStreamReader(inStream)
    data = ""
    count = 0
    lastCommitTime = None
    import time

    startTime = time.time()
    lastRecord = None
    lastBibNum = None
    m4j = None
    marcReaderTime = 0
    marcRecordToDictTime = 0
    extractorCreateTime = 0
    extractionTime = 0
    extractMethodTime = 0
    marcRecordForSolrTime = 0
    commitTime = 0
    updateTime = 0
    marcSerializeTime = 0
    accession = 0  # TODO: try and load serialized accession # from somewhere
    serializedRecord = None
    recordBatch = []
    # get default properties file
    from loadPropsFile import *

    props = loadPropsFile(DEFAULT_INDEXING_PROPERTIES_FILE)

    while marcReader.hasNext() and count < NUM_RECORDS_TO_ADD:
        # if pid > -1:
        #    print (".%d" % pid),
        # else:
        #    print ".",
        # CSDEBUG
        accession += 1
        count += 1
        # TODO: improve error handling here (main problem is that Marc4J will fall over
        # at the sight of a bad record and there's no way to get it to just skip over
        # a bad record -- so there is little we can do, except better error messages!
        try:
            mrTimeStart = time.time()
            marc4jRecord = marcReader.next()
            marcReaderTime += time.time() - mrTimeStart
        except:
            print "last record indexed was bib# %s " % lastBibNum
            import sys

            print "sys.exc_info is %s" % str(sys.exc_info())
            sys.exit(1)

        mrsTime = time.time()
        # try:
        rec = solrIndexingUtils.recordForSolr(marc4jRecord, anselUnicodeConverter, propsObject=props)
        # except:
        #    print "exception processing record, skipping"    # TODO: error handling
        #    continue
        marcRecordForSolrTime += time.time() - mrsTime
        extractionTime += rec._extractionTime
        extractorCreateTime += rec._extractorCreateTime
        marcRecordToDictTime += rec._marcRecordToDictTime
        extractMethodTime += rec._extractMethodTime

        if hasattr(rec, "bib_num"):
            recordBatch.append(rec)
            lastBibNum = rec.bib_num
        else:
            print "not adding record %s; no bib_num present!" % rec

        if (count % SOLR_INDEX_BATCH_SIZE) == 0:
            # nb. neither apache commons nor python urllib works right here!  Unicode gets mangled.
            # Must use postURL

            # fetch the item status info if required.
            if DO_ITEM_STATUS_INDEXING:
                bibs = [x.bib_num for x in recordBatch]
                avail = horizonItemStatus.availableAt(bibs)
                for x in recordBatch:
                    x.available = avail[x.bib_num]

            mrserTime = time.time()
            data = u"".join([x.serialize() for x in recordBatch])
            recordBatch = []
            marcSerializeTime += time.time() - mrserTime

            startUpdateTime = time.time()
            try:
                resp = solrConnection.postURL(SOLR_UPDATE_URL, "<add>%s</add>" % data)

            except IOError:
                print "Connection reset when talking to Solr, skipping this commit and sleeping 10 sec."
                time.sleep(10)
                resp = solrConnection.postURL(SOLR_UPDATE_URL, "<add>%s</add>" % data)
                # if it fails again here, we want to just bomb out.
            if resp.find('<result status="1"') > -1:
                print "\nError POSTing documents!  Response from Solr was\n\n%s\n" % resp
            # TODO: put in retry/continue code here for failed updates/slowdowns on Solr
            # TODO: parse result status and do something if there is an error (like print stacktrace)
            updateTime += time.time() - startUpdateTime
            if pid > -1:
                print ("*%d" % pid),
            else:
                print "*",
            if PRINT_SOLR_POST_DATA:
                print "\n\n<add>%s</add>\n\n" % data
            data = ""
        if (count % SOLR_COMMIT_BATCH_SIZE) == 0:
            try:
                print "committing..."
                beginCommitTime = time.time()
                if nonblocking:
                    print "doing nonblocking commit"
                    solrConnection.commitNonblocking()
                else:
                    solrConnection.commit()
                commitTime += time.time() - beginCommitTime
            except IOError:
                import time

                print "Connection reset when talking to Solr, skipping this commit and sleeping 10 sec."
                time.sleep(10)
            if lastCommitTime:
                thisBatchRate = (0.0 + SOLR_COMMIT_BATCH_SIZE) / (time.time() - lastCommitTime)
                overallRate = (0.0 + count) / (time.time() - startTime)
                if pid > -1:
                    print "\n>>>>>>>>>>>>COMMIT for PID %s<<<<<<<<<<<<<<<\n" % pid  # csdebug
                print "[%s] %s records indexed\t| This Batch: %.4f records/sec|\tOverall: %.4f records/sec" % (
                    time.ctime(),
                    count,
                    thisBatchRate,
                    overallRate,
                )
                if PROFILE:
                    print """\nfile->MARC: %.4f\nMARC->py: %.4f\npy->XML: %.4f\n""" % (
                        marcReaderTime,
                        marcRecordForSolrTime,
                        marcSerializeTime,
                    )
                    print """MARC to dict: %.4f\ncreate extractor: %.4f\nextraction: %.4f\n\textract method: %.4f""" % (
                        marcRecordToDictTime,
                        extractorCreateTime,
                        extractionTime,
                        extractMethodTime,
                    )
                    print """Solr Update: %.4f\nSolr Commit: %.4f\n""" % (updateTime, commitTime)
            lastCommitTime = time.time()
        if (count % SOLR_OPTIMIZE_BATCH_SIZE) == 0:
            print "[%s] FORCING OPTIMIZE..." % time.ctime()
            solrConnection.optimize()
            print "[%s] OPTIMIZE done" % time.ctime()
            System.gc()
    # do last batch here
    if len(recordBatch) > 0:
        print "doing final POST"
        mrserTime = time.time()
        data = "".join([x.serialize() for x in recordBatch])
        recordBatch = []

        resp = solrConnection.postURL(SOLR_UPDATE_URL, "<add>%s</add>" % data)
        if resp.find('<result status="1"') > -1:
            print "\nError POSTing documents!  Response from Solr was\n\n%s\n\n" % resp
    print "committing..."
    if nonblocking:
        solrConnection.commitNonblocking()
    else:
        solrConnection.commit()
    inStream.close()
    return count
     # c. do status changes.
     if doItemStatusIndexing:
         count += horizonIncrementalIndexer.incrementalIndexingJob(
             commitNonblocking=1)
     lastRunTime = time.time()
     firstTime = 0
 else:
     print "not time to run yet, last ran %.4f ago" % (now -
                                                       lastRunTime)
 ## 2nd do optimize if nec.
 now = time.time()
 justOptimized = 0
 if optimizeInterval:
     if ((now - lastOptimizeTime) >= optimizeInterval):
         print "[%s] optimizing" % time.ctime()
         solrConnection.optimize()
         lastOptimizeTime = time.time()
         justOptimized = 1
 if count > optimizeCountInterval and (not justOptimized):
     print "count is %s, forcing OPTIMIZE" % count
     solrConnection.optimize()
     count = 0
 ## 3rd do facet warm if necessary.
 now = time.time()
 if facetWarmInterval:
     if ((now - lastFacetWarmTime) >= facetWarmInterval):
         print "[%s] facets haven't been warmed in %.4f; warming them" % (
             time.ctime(), (now - lastFacetWarmTime))
         facetWarmer.warmFacets()
         lastFacetWarmTime = time.time()
 ## 4th do complete refresh of yesterday's modified records, if necessary.
# Example #6
# 0
def processFile(filename, anselUnicodeConverter=None, nonblocking=0, pid=-1):
    # if nonblocking == 0 then all commits are blocking; if 1 then they are nonblocking.
    inStream = FileInputStream(filename)
    print "processFile>> %s" % filename
    marcReader = MarcStreamReader(inStream)
    data = ""
    count = 0
    lastCommitTime = None
    import time
    startTime = time.time()
    lastRecord = None
    lastBibNum = None
    m4j = None
    marcReaderTime = 0
    marcRecordToDictTime = 0
    extractorCreateTime = 0
    extractionTime = 0
    extractMethodTime = 0
    marcRecordForSolrTime = 0
    commitTime = 0
    updateTime = 0
    marcSerializeTime = 0
    accession = 0  # TODO: try and load serialized accession # from somewhere
    serializedRecord = None
    recordBatch = []
    # get default properties file
    from loadPropsFile import *
    props = loadPropsFile(DEFAULT_INDEXING_PROPERTIES_FILE)

    while marcReader.hasNext() and count < NUM_RECORDS_TO_ADD:
        #if pid > -1:
        #    print (".%d" % pid),
        #else:
        #    print ".",
        # CSDEBUG
        accession += 1
        count += 1
        # TODO: improve error handling here (main problem is that Marc4J will fall over
        # at the sight of a bad record and there's no way to get it to just skip over
        # a bad record -- so there is little we can do, except better error messages!
        try:
            mrTimeStart = time.time()
            marc4jRecord = marcReader.next()
            marcReaderTime += (time.time() - mrTimeStart)
        except:
            print "last record indexed was bib# %s " % lastBibNum
            import sys
            print "sys.exc_info is %s" % str(sys.exc_info())
            sys.exit(1)

        mrsTime = time.time()
        #try:
        rec = solrIndexingUtils.recordForSolr(marc4jRecord,
                                              anselUnicodeConverter,
                                              propsObject=props)
        #except:
        #    print "exception processing record, skipping"    # TODO: error handling
        #    continue
        marcRecordForSolrTime += (time.time() - mrsTime)
        extractionTime += rec._extractionTime
        extractorCreateTime += rec._extractorCreateTime
        marcRecordToDictTime += rec._marcRecordToDictTime
        extractMethodTime += rec._extractMethodTime

        if hasattr(rec, "bib_num"):
            recordBatch.append(rec)
            lastBibNum = rec.bib_num
        else:
            print "not adding record %s; no bib_num present!" % rec

        if ((count % SOLR_INDEX_BATCH_SIZE) == 0):
            # nb. neither apache commons nor python urllib works right here!  Unicode gets mangled.
            #Must use postURL

            # fetch the item status info if required.
            if DO_ITEM_STATUS_INDEXING:
                bibs = [x.bib_num for x in recordBatch]
                avail = horizonItemStatus.availableAt(bibs)
                for x in recordBatch:
                    x.available = avail[x.bib_num]

            mrserTime = time.time()
            data = u''.join([x.serialize() for x in recordBatch])
            recordBatch = []
            marcSerializeTime += (time.time() - mrserTime)

            startUpdateTime = time.time()
            try:
                resp = solrConnection.postURL(SOLR_UPDATE_URL,
                                              "<add>%s</add>" % data)

            except IOError:
                print "Connection reset when talking to Solr, skipping this commit and sleeping 10 sec."
                time.sleep(10)
                resp = solrConnection.postURL(SOLR_UPDATE_URL,
                                              "<add>%s</add>" % data)
                # if it fails again here, we want to just bomb out.
            if resp.find('<result status="1"') > -1:
                print "\nError POSTing documents!  Response from Solr was\n\n%s\n" % resp
            # TODO: put in retry/continue code here for failed updates/slowdowns on Solr
            # TODO: parse result status and do something if there is an error (like print stacktrace)
            updateTime += (time.time() - startUpdateTime)
            if pid > -1:
                print("*%d" % pid),
            else:
                print "*",
            if PRINT_SOLR_POST_DATA:
                print "\n\n<add>%s</add>\n\n" % data
            data = ""
        if ((count % SOLR_COMMIT_BATCH_SIZE) == 0):
            try:
                print "committing..."
                beginCommitTime = time.time()
                if nonblocking:
                    print "doing nonblocking commit"
                    solrConnection.commitNonblocking()
                else:
                    solrConnection.commit()
                commitTime += (time.time() - beginCommitTime)
            except IOError:
                import time
                print "Connection reset when talking to Solr, skipping this commit and sleeping 10 sec."
                time.sleep(10)
            if lastCommitTime:
                thisBatchRate = ((0.0 + SOLR_COMMIT_BATCH_SIZE) /
                                 (time.time() - lastCommitTime))
                overallRate = ((0.0 + count) / (time.time() - startTime))
                if pid > -1:
                    print "\n>>>>>>>>>>>>COMMIT for PID %s<<<<<<<<<<<<<<<\n" % pid  # csdebug
                print "[%s] %s records indexed\t| This Batch: %.4f records/sec|\tOverall: %.4f records/sec" % (
                    time.ctime(), count, thisBatchRate, overallRate)
                if PROFILE:
                    print """\nfile->MARC: %.4f\nMARC->py: %.4f\npy->XML: %.4f\n""" % (
                        marcReaderTime, marcRecordForSolrTime,
                        marcSerializeTime)
                    print """MARC to dict: %.4f\ncreate extractor: %.4f\nextraction: %.4f\n\textract method: %.4f""" % (
                        marcRecordToDictTime, extractorCreateTime,
                        extractionTime, extractMethodTime)
                    print """Solr Update: %.4f\nSolr Commit: %.4f\n""" % (
                        updateTime, commitTime)
            lastCommitTime = time.time()
        if ((count % SOLR_OPTIMIZE_BATCH_SIZE) == 0):
            print "[%s] FORCING OPTIMIZE..." % time.ctime()
            solrConnection.optimize()
            print "[%s] OPTIMIZE done" % time.ctime()
            System.gc()
    # do last batch here
    if len(recordBatch) > 0:
        print "doing final POST"
        mrserTime = time.time()
        data = ''.join([x.serialize() for x in recordBatch])
        recordBatch = []

        resp = solrConnection.postURL(SOLR_UPDATE_URL, "<add>%s</add>" % data)
        if resp.find('<result status="1"') > -1:
            print "\nError POSTing documents!  Response from Solr was\n\n%s\n\n" % resp
    print "committing..."
    if nonblocking:
        solrConnection.commitNonblocking()
    else:
        solrConnection.commit()
    inStream.close()
    return count