Python commit Examples

Programming Language: Python

Namespace/Package Name: solrConnection

Method/Function: commit

Examples at hotexamples.com: 6

Python commit - 6 examples found. These are the top-rated real-world Python examples of solrConnection.commit extracted from open source projects. You can rate examples to help us improve the quality of examples.

Example #1

Show file

def incrementalIndexingJob(commitNonblocking=0):
    """the incremental indexing job scans for changed bibliographic records from Horizon and updates
    them accordingly in Solr."""
    bibsToUpdate = horizonItemStatus.getChangedBibs(doDelete=0)
    print "\n[%s] updating %s bibs" % (time.ctime(), len(bibsToUpdate))
    bibCount = 0
    recordBatch = []
    for bibOn in bibsToUpdate:
        bibCount += 1
        availAt = horizonItemStatus.availableAt(bibOn)
        newRecordOn = horizonItemStatus.updateSolrRecordAvailability(bibOn,
                                                                     availAt,
                                                                     doPost=0)
        recordBatch.append(newRecordOn)
        # now delete item from queue
        horizonItemStatus.deleteFromIndexQueue(bibOn)
        print "-",
        if ((bibCount % SOLR_INDEX_BATCH_SIZE) == 0):
            data = u''.join(recordBatch)
            print "*",
            resp = solrConnection.postURL(SOLR_UPDATE_URL,
                                          "<add>%s</add>" % data)
            recordBatch = []
        print("+%s+" % bibOn),
    # now do last batch
    if len(recordBatch) > 0:
        data = u''.join(recordBatch)
        resp = solrConnection.postURL(SOLR_UPDATE_URL, "<add>%s</add>" % data)
    if bibCount > 0:
        print "\n[%s] done updating bibs, now committing" % time.ctime()
        try:
            if commitNonblocking:
                solrConnection.commitNonblocking()
            else:
                solrConnection.commit()
        except IOError:
            print "Connection reset when talking to Solr, skipping this commit and sleeping 10 sec."
            time.sleep(10)
        print "[%s] done committing" % time.ctime()
    else:
        print "[%s] no bibs updated, exiting" % time.ctime()
    return bibCount

Example #2

Show file

File: horizonIncrementalIndexer.py Project: BGCX067/fac-back-opac-svn-to-git

def incrementalIndexingJob(commitNonblocking=0):
    """the incremental indexing job scans for changed bibliographic records from Horizon and updates
    them accordingly in Solr."""
    bibsToUpdate = horizonItemStatus.getChangedBibs(doDelete=0) 
    print "\n[%s] updating %s bibs" % ( time.ctime(), len(bibsToUpdate) )
    bibCount = 0
    recordBatch = []
    for bibOn in bibsToUpdate:
        bibCount +=1
        availAt = horizonItemStatus.availableAt( bibOn )
        newRecordOn = horizonItemStatus.updateSolrRecordAvailability( bibOn, availAt, doPost = 0)
        recordBatch.append( newRecordOn )
        # now delete item from queue
        horizonItemStatus.deleteFromIndexQueue( bibOn )
        print "-",
        if ( (bibCount % SOLR_INDEX_BATCH_SIZE) == 0):
            data = u''.join( recordBatch )
            print "*",
            resp = solrConnection.postURL( SOLR_UPDATE_URL, "<add>%s</add>" % data )
            recordBatch = []
        print ("+%s+" % bibOn) ,
    # now do last batch
    if len(recordBatch) > 0:
        data = u''.join( recordBatch )
        resp = solrConnection.postURL( SOLR_UPDATE_URL, "<add>%s</add>" % data )
    if bibCount > 0:
        print "\n[%s] done updating bibs, now committing" % time.ctime()
        try:
            if commitNonblocking:
                solrConnection.commitNonblocking()
            else:
                solrConnection.commit()
        except IOError:
            print "Connection reset when talking to Solr, skipping this commit and sleeping 10 sec."
            time.sleep(10)
        print "[%s] done committing" % time.ctime()
    else:
        print "[%s] no bibs updated, exiting" % time.ctime()
    return bibCount

Example #3

Show file

File: horizonIncrementalIndexer.py Project: BGCX067/fac-back-opac-svn-to-git

def processFilesInDirectory(dirName, anselUnicodeConverter = None, commitNonblocking =0, numThreads = 1, deleteAfterIndexing = 1):
    """processes MARC and PMS files in the indexer-queue directory.  If numThreads > 1
    it will try to parallelize the MARC record processing (but not PMS indexing -- no reason for that)     
    """
    pmsFiles = glob.glob( "%s/PMS*.DAT" % dirName )
    updatedAnyRecords = 0
    count= 0
    for fileOn in pmsFiles:
        print "processing PMS file %s" % fileOn
        processedFilenameOn, deletedBibsOn = processPMSFile(fileOn)
        if processedFilenameOn:
            print "processing MARC file %s" % processedFilenameOn
            indexerDriver.processFile( processedFilenameOn, anselUnicodeConverter )
            
            # now that we are done processing the file, we delete it.
            print "deleting MARC file %s " % processedFilenameOn
            os.remove( processedFilenameOn )
            print "deleting PMS file %s" % fileOn
            os.remove( fileOn )
            
            if deletedBibsOn:
                print "processing deleted bibs from MARC file %s" % processedFilenameOn
                for bibOn in deletedBibsOn:
                    print "deleting bib %s" % bibOn
                    indexerDriver.deleteRecord( bibOn )
            updatedAnyRecords = 1
        else:
            print "no records to index"
            os.remove( fileOn )

    print "[%s] now checking for MARC files" % time.ctime()
    _marcFiles = glob.glob( "%s/*.MARC" % dirName)
    _marcFiles += glob.glob ("%s/*.marc" % dirName )
    _marcFiles += glob.glob ("%s/*.dat" % dirName )
    _marcFiles += glob.glob ("%s/*.DAT" % dirName )
    _marcFiles += glob.glob( "%s/*scriblio*" % dirName )
    # dedupe _marcFiles here incase a file matches more than one glob
    # using a dictionary is the fastest way to dedupe a list with Jython
    marcFileDict = {}
    for fileOn in _marcFiles:
        marcFileDict[fileOn] = None
    marcFiles = marcFileDict.keys()
    marcFiles.sort()
    
    numMarcFiles = len(marcFiles)
    print "[%s] found %d files to process." % (time.ctime(), numMarcFiles )
    if numThreads == 1:
        for fileOn in marcFiles:
            print "processing MARC file %s" % fileOn
            count = indexerDriver.processFile( fileOn, anselUnicodeConverter, nonblocking=1 ) # csdebug: added nonblocking here
            updatedAnyRecords = 1
            if deleteAfterIndexing:
                os.remove( fileOn )
    elif numThreads >= numMarcFiles:
        # spin off a thread for each one
        # was getting weird problems with multithreading (AttributeErrors when trying to iterate
        # over all controlFields in MARC record -- trying separate anselUnicodeConverters to see if that's the issue.
        threads = []
        threadrefs = []
        i = 0
        for fileOn in marcFiles:
            convOn = AnselToUnicode()
            jobOn = indexerDriver.processFileJob( fileOn, convOn, nonblocking = 1, pid = i)# csdebug: handle nonblocking option
            _threadOn = Thread( jobOn, "process file job %s" %i )
            threads.append( _threadOn )
            threadrefs.append( jobOn )
            print "starting thread %s processing file %s" % (i, fileOn)
            _threadOn.start() 
            i += 1
            updatedAnyRecords = 1
        print "joining threads"
        for i in range( len(threads) ):
            threads[i].join()
            # TODO: make sure the thread was successful before nuking.
            if deleteAfterIndexing:
                print "deleting %s" % threadrefs[i].filename
                os.remove( threadrefs[i].filename )
            
    else:
        # do work queue here.
        print "not yet implemented"
    # finally, do a commit here.
    if updatedAnyRecords:
        print "[%s] starting final commit" % time.ctime()
        if commitNonblocking:
            solrConnection.commitNonblocking()
        else:
            solrConnection.commit()
        print "[%s] done committing" % time.ctime()
    return count

Example #4

Show file

def processFilesInDirectory(dirName,
                            anselUnicodeConverter=None,
                            commitNonblocking=0,
                            numThreads=1,
                            deleteAfterIndexing=1):
    """processes MARC and PMS files in the indexer-queue directory.  If numThreads > 1
    it will try to parallelize the MARC record processing (but not PMS indexing -- no reason for that)     
    """
    pmsFiles = glob.glob("%s/PMS*.DAT" % dirName)
    updatedAnyRecords = 0
    count = 0
    for fileOn in pmsFiles:
        print "processing PMS file %s" % fileOn
        processedFilenameOn, deletedBibsOn = processPMSFile(fileOn)
        if processedFilenameOn:
            print "processing MARC file %s" % processedFilenameOn
            indexerDriver.processFile(processedFilenameOn,
                                      anselUnicodeConverter)

            # now that we are done processing the file, we delete it.
            print "deleting MARC file %s " % processedFilenameOn
            os.remove(processedFilenameOn)
            print "deleting PMS file %s" % fileOn
            os.remove(fileOn)

            if deletedBibsOn:
                print "processing deleted bibs from MARC file %s" % processedFilenameOn
                for bibOn in deletedBibsOn:
                    print "deleting bib %s" % bibOn
                    indexerDriver.deleteRecord(bibOn)
            updatedAnyRecords = 1
        else:
            print "no records to index"
            os.remove(fileOn)

    print "[%s] now checking for MARC files" % time.ctime()
    _marcFiles = glob.glob("%s/*.MARC" % dirName)
    _marcFiles += glob.glob("%s/*.marc" % dirName)
    _marcFiles += glob.glob("%s/*.dat" % dirName)
    _marcFiles += glob.glob("%s/*.DAT" % dirName)
    _marcFiles += glob.glob("%s/*scriblio*" % dirName)
    # dedupe _marcFiles here incase a file matches more than one glob
    # using a dictionary is the fastest way to dedupe a list with Jython
    marcFileDict = {}
    for fileOn in _marcFiles:
        marcFileDict[fileOn] = None
    marcFiles = marcFileDict.keys()
    marcFiles.sort()

    numMarcFiles = len(marcFiles)
    print "[%s] found %d files to process." % (time.ctime(), numMarcFiles)
    if numThreads == 1:
        for fileOn in marcFiles:
            print "processing MARC file %s" % fileOn
            count = indexerDriver.processFile(
                fileOn, anselUnicodeConverter,
                nonblocking=1)  # csdebug: added nonblocking here
            updatedAnyRecords = 1
            if deleteAfterIndexing:
                os.remove(fileOn)
    elif numThreads >= numMarcFiles:
        # spin off a thread for each one
        # was getting weird problems with multithreading (AttributeErrors when trying to iterate
        # over all controlFields in MARC record -- trying separate anselUnicodeConverters to see if that's the issue.
        threads = []
        threadrefs = []
        i = 0
        for fileOn in marcFiles:
            convOn = AnselToUnicode()
            jobOn = indexerDriver.processFileJob(
                fileOn, convOn, nonblocking=1,
                pid=i)  # csdebug: handle nonblocking option
            _threadOn = Thread(jobOn, "process file job %s" % i)
            threads.append(_threadOn)
            threadrefs.append(jobOn)
            print "starting thread %s processing file %s" % (i, fileOn)
            _threadOn.start()
            i += 1
            updatedAnyRecords = 1
        print "joining threads"
        for i in range(len(threads)):
            threads[i].join()
            # TODO: make sure the thread was successful before nuking.
            if deleteAfterIndexing:
                print "deleting %s" % threadrefs[i].filename
                os.remove(threadrefs[i].filename)

    else:
        # do work queue here.
        print "not yet implemented"
    # finally, do a commit here.
    if updatedAnyRecords:
        print "[%s] starting final commit" % time.ctime()
        if commitNonblocking:
            solrConnection.commitNonblocking()
        else:
            solrConnection.commit()
        print "[%s] done committing" % time.ctime()
    return count

Example #5

Show file

File: indexerDriver.py Project: BGCX067/fac-back-opac-svn-to-git

def processFile(filename, anselUnicodeConverter=None, nonblocking=0, pid=-1):
    # if nonblocking == 0 then all commits are blocking; if 1 then they are nonblocking.
    inStream = FileInputStream(filename)
    print "processFile>> %s" % filename
    marcReader = MarcStreamReader(inStream)
    data = ""
    count = 0
    lastCommitTime = None
    import time

    startTime = time.time()
    lastRecord = None
    lastBibNum = None
    m4j = None
    marcReaderTime = 0
    marcRecordToDictTime = 0
    extractorCreateTime = 0
    extractionTime = 0
    extractMethodTime = 0
    marcRecordForSolrTime = 0
    commitTime = 0
    updateTime = 0
    marcSerializeTime = 0
    accession = 0  # TODO: try and load serialized accession # from somewhere
    serializedRecord = None
    recordBatch = []
    # get default properties file
    from loadPropsFile import *

    props = loadPropsFile(DEFAULT_INDEXING_PROPERTIES_FILE)

    while marcReader.hasNext() and count < NUM_RECORDS_TO_ADD:
        # if pid > -1:
        #    print (".%d" % pid),
        # else:
        #    print ".",
        # CSDEBUG
        accession += 1
        count += 1
        # TODO: improve error handling here (main problem is that Marc4J will fall over
        # at the sight of a bad record and there's no way to get it to just skip over
        # a bad record -- so there is little we can do, except better error messages!
        try:
            mrTimeStart = time.time()
            marc4jRecord = marcReader.next()
            marcReaderTime += time.time() - mrTimeStart
        except:
            print "last record indexed was bib# %s " % lastBibNum
            import sys

            print "sys.exc_info is %s" % str(sys.exc_info())
            sys.exit(1)

        mrsTime = time.time()
        # try:
        rec = solrIndexingUtils.recordForSolr(marc4jRecord, anselUnicodeConverter, propsObject=props)
        # except:
        #    print "exception processing record, skipping"    # TODO: error handling
        #    continue
        marcRecordForSolrTime += time.time() - mrsTime
        extractionTime += rec._extractionTime
        extractorCreateTime += rec._extractorCreateTime
        marcRecordToDictTime += rec._marcRecordToDictTime
        extractMethodTime += rec._extractMethodTime

        if hasattr(rec, "bib_num"):
            recordBatch.append(rec)
            lastBibNum = rec.bib_num
        else:
            print "not adding record %s; no bib_num present!" % rec

        if (count % SOLR_INDEX_BATCH_SIZE) == 0:
            # nb. neither apache commons nor python urllib works right here!  Unicode gets mangled.
            # Must use postURL

            # fetch the item status info if required.
            if DO_ITEM_STATUS_INDEXING:
                bibs = [x.bib_num for x in recordBatch]
                avail = horizonItemStatus.availableAt(bibs)
                for x in recordBatch:
                    x.available = avail[x.bib_num]

            mrserTime = time.time()
            data = u"".join([x.serialize() for x in recordBatch])
            recordBatch = []
            marcSerializeTime += time.time() - mrserTime

            startUpdateTime = time.time()
            try:
                resp = solrConnection.postURL(SOLR_UPDATE_URL, "<add>%s</add>" % data)

            except IOError:
                print "Connection reset when talking to Solr, skipping this commit and sleeping 10 sec."
                time.sleep(10)
                resp = solrConnection.postURL(SOLR_UPDATE_URL, "<add>%s</add>" % data)
                # if it fails again here, we want to just bomb out.
            if resp.find('<result status="1"') > -1:
                print "\nError POSTing documents!  Response from Solr was\n\n%s\n" % resp
            # TODO: put in retry/continue code here for failed updates/slowdowns on Solr
            # TODO: parse result status and do something if there is an error (like print stacktrace)
            updateTime += time.time() - startUpdateTime
            if pid > -1:
                print ("*%d" % pid),
            else:
                print "*",
            if PRINT_SOLR_POST_DATA:
                print "\n\n<add>%s</add>\n\n" % data
            data = ""
        if (count % SOLR_COMMIT_BATCH_SIZE) == 0:
            try:
                print "committing..."
                beginCommitTime = time.time()
                if nonblocking:
                    print "doing nonblocking commit"
                    solrConnection.commitNonblocking()
                else:
                    solrConnection.commit()
                commitTime += time.time() - beginCommitTime
            except IOError:
                import time

                print "Connection reset when talking to Solr, skipping this commit and sleeping 10 sec."
                time.sleep(10)
            if lastCommitTime:
                thisBatchRate = (0.0 + SOLR_COMMIT_BATCH_SIZE) / (time.time() - lastCommitTime)
                overallRate = (0.0 + count) / (time.time() - startTime)
                if pid > -1:
                    print "\n>>>>>>>>>>>>COMMIT for PID %s<<<<<<<<<<<<<<<\n" % pid  # csdebug
                print "[%s] %s records indexed\t| This Batch: %.4f records/sec|\tOverall: %.4f records/sec" % (
                    time.ctime(),
                    count,
                    thisBatchRate,
                    overallRate,
                )
                if PROFILE:
                    print """\nfile->MARC: %.4f\nMARC->py: %.4f\npy->XML: %.4f\n""" % (
                        marcReaderTime,
                        marcRecordForSolrTime,
                        marcSerializeTime,
                    )
                    print """MARC to dict: %.4f\ncreate extractor: %.4f\nextraction: %.4f\n\textract method: %.4f""" % (
                        marcRecordToDictTime,
                        extractorCreateTime,
                        extractionTime,
                        extractMethodTime,
                    )
                    print """Solr Update: %.4f\nSolr Commit: %.4f\n""" % (updateTime, commitTime)
            lastCommitTime = time.time()
        if (count % SOLR_OPTIMIZE_BATCH_SIZE) == 0:
            print "[%s] FORCING OPTIMIZE..." % time.ctime()
            solrConnection.optimize()
            print "[%s] OPTIMIZE done" % time.ctime()
            System.gc()
    # do last batch here
    if len(recordBatch) > 0:
        print "doing final POST"
        mrserTime = time.time()
        data = "".join([x.serialize() for x in recordBatch])
        recordBatch = []

        resp = solrConnection.postURL(SOLR_UPDATE_URL, "<add>%s</add>" % data)
        if resp.find('<result status="1"') > -1:
            print "\nError POSTing documents!  Response from Solr was\n\n%s\n\n" % resp
    print "committing..."
    if nonblocking:
        solrConnection.commitNonblocking()
    else:
        solrConnection.commit()
    inStream.close()
    return count

Example #6

Show file

def processFile(filename, anselUnicodeConverter=None, nonblocking=0, pid=-1):
    # if nonblocking == 0 then all commits are blocking; if 1 then they are nonblocking.
    inStream = FileInputStream(filename)
    print "processFile>> %s" % filename
    marcReader = MarcStreamReader(inStream)
    data = ""
    count = 0
    lastCommitTime = None
    import time
    startTime = time.time()
    lastRecord = None
    lastBibNum = None
    m4j = None
    marcReaderTime = 0
    marcRecordToDictTime = 0
    extractorCreateTime = 0
    extractionTime = 0
    extractMethodTime = 0
    marcRecordForSolrTime = 0
    commitTime = 0
    updateTime = 0
    marcSerializeTime = 0
    accession = 0  # TODO: try and load serialized accession # from somewhere
    serializedRecord = None
    recordBatch = []
    # get default properties file
    from loadPropsFile import *
    props = loadPropsFile(DEFAULT_INDEXING_PROPERTIES_FILE)

    while marcReader.hasNext() and count < NUM_RECORDS_TO_ADD:
        #if pid > -1:
        #    print (".%d" % pid),
        #else:
        #    print ".",
        # CSDEBUG
        accession += 1
        count += 1
        # TODO: improve error handling here (main problem is that Marc4J will fall over
        # at the sight of a bad record and there's no way to get it to just skip over
        # a bad record -- so there is little we can do, except better error messages!
        try:
            mrTimeStart = time.time()
            marc4jRecord = marcReader.next()
            marcReaderTime += (time.time() - mrTimeStart)
        except:
            print "last record indexed was bib# %s " % lastBibNum
            import sys
            print "sys.exc_info is %s" % str(sys.exc_info())
            sys.exit(1)

        mrsTime = time.time()
        #try:
        rec = solrIndexingUtils.recordForSolr(marc4jRecord,
                                              anselUnicodeConverter,
                                              propsObject=props)
        #except:
        #    print "exception processing record, skipping"    # TODO: error handling
        #    continue
        marcRecordForSolrTime += (time.time() - mrsTime)
        extractionTime += rec._extractionTime
        extractorCreateTime += rec._extractorCreateTime
        marcRecordToDictTime += rec._marcRecordToDictTime
        extractMethodTime += rec._extractMethodTime

        if hasattr(rec, "bib_num"):
            recordBatch.append(rec)
            lastBibNum = rec.bib_num
        else:
            print "not adding record %s; no bib_num present!" % rec

        if ((count % SOLR_INDEX_BATCH_SIZE) == 0):
            # nb. neither apache commons nor python urllib works right here!  Unicode gets mangled.
            #Must use postURL

            # fetch the item status info if required.
            if DO_ITEM_STATUS_INDEXING:
                bibs = [x.bib_num for x in recordBatch]
                avail = horizonItemStatus.availableAt(bibs)
                for x in recordBatch:
                    x.available = avail[x.bib_num]

            mrserTime = time.time()
            data = u''.join([x.serialize() for x in recordBatch])
            recordBatch = []
            marcSerializeTime += (time.time() - mrserTime)

            startUpdateTime = time.time()
            try:
                resp = solrConnection.postURL(SOLR_UPDATE_URL,
                                              "<add>%s</add>" % data)

            except IOError:
                print "Connection reset when talking to Solr, skipping this commit and sleeping 10 sec."
                time.sleep(10)
                resp = solrConnection.postURL(SOLR_UPDATE_URL,
                                              "<add>%s</add>" % data)
                # if it fails again here, we want to just bomb out.
            if resp.find('<result status="1"') > -1:
                print "\nError POSTing documents!  Response from Solr was\n\n%s\n" % resp
            # TODO: put in retry/continue code here for failed updates/slowdowns on Solr
            # TODO: parse result status and do something if there is an error (like print stacktrace)
            updateTime += (time.time() - startUpdateTime)
            if pid > -1:
                print("*%d" % pid),
            else:
                print "*",
            if PRINT_SOLR_POST_DATA:
                print "\n\n<add>%s</add>\n\n" % data
            data = ""
        if ((count % SOLR_COMMIT_BATCH_SIZE) == 0):
            try:
                print "committing..."
                beginCommitTime = time.time()
                if nonblocking:
                    print "doing nonblocking commit"
                    solrConnection.commitNonblocking()
                else:
                    solrConnection.commit()
                commitTime += (time.time() - beginCommitTime)
            except IOError:
                import time
                print "Connection reset when talking to Solr, skipping this commit and sleeping 10 sec."
                time.sleep(10)
            if lastCommitTime:
                thisBatchRate = ((0.0 + SOLR_COMMIT_BATCH_SIZE) /
                                 (time.time() - lastCommitTime))
                overallRate = ((0.0 + count) / (time.time() - startTime))
                if pid > -1:
                    print "\n>>>>>>>>>>>>COMMIT for PID %s<<<<<<<<<<<<<<<\n" % pid  # csdebug
                print "[%s] %s records indexed\t| This Batch: %.4f records/sec|\tOverall: %.4f records/sec" % (
                    time.ctime(), count, thisBatchRate, overallRate)
                if PROFILE:
                    print """\nfile->MARC: %.4f\nMARC->py: %.4f\npy->XML: %.4f\n""" % (
                        marcReaderTime, marcRecordForSolrTime,
                        marcSerializeTime)
                    print """MARC to dict: %.4f\ncreate extractor: %.4f\nextraction: %.4f\n\textract method: %.4f""" % (
                        marcRecordToDictTime, extractorCreateTime,
                        extractionTime, extractMethodTime)
                    print """Solr Update: %.4f\nSolr Commit: %.4f\n""" % (
                        updateTime, commitTime)
            lastCommitTime = time.time()
        if ((count % SOLR_OPTIMIZE_BATCH_SIZE) == 0):
            print "[%s] FORCING OPTIMIZE..." % time.ctime()
            solrConnection.optimize()
            print "[%s] OPTIMIZE done" % time.ctime()
            System.gc()
    # do last batch here
    if len(recordBatch) > 0:
        print "doing final POST"
        mrserTime = time.time()
        data = ''.join([x.serialize() for x in recordBatch])
        recordBatch = []

        resp = solrConnection.postURL(SOLR_UPDATE_URL, "<add>%s</add>" % data)
        if resp.find('<result status="1"') > -1:
            print "\nError POSTing documents!  Response from Solr was\n\n%s\n\n" % resp
    print "committing..."
    if nonblocking:
        solrConnection.commitNonblocking()
    else:
        solrConnection.commit()
    inStream.close()
    return count