Ejemplo n.º 1
0
def incrementalIndexingJob(commitNonblocking=0):
    """the incremental indexing job scans for changed bibliographic records from Horizon and updates
    them accordingly in Solr."""
    bibsToUpdate = horizonItemStatus.getChangedBibs(doDelete=0)
    print "\n[%s] updating %s bibs" % (time.ctime(), len(bibsToUpdate))
    bibCount = 0
    recordBatch = []
    for bibOn in bibsToUpdate:
        bibCount += 1
        availAt = horizonItemStatus.availableAt(bibOn)
        newRecordOn = horizonItemStatus.updateSolrRecordAvailability(bibOn,
                                                                     availAt,
                                                                     doPost=0)
        recordBatch.append(newRecordOn)
        # now delete item from queue
        horizonItemStatus.deleteFromIndexQueue(bibOn)
        print "-",
        if ((bibCount % SOLR_INDEX_BATCH_SIZE) == 0):
            data = u''.join(recordBatch)
            print "*",
            resp = solrConnection.postURL(SOLR_UPDATE_URL,
                                          "<add>%s</add>" % data)
            recordBatch = []
        print("+%s+" % bibOn),
    # now do last batch
    if len(recordBatch) > 0:
        data = u''.join(recordBatch)
        resp = solrConnection.postURL(SOLR_UPDATE_URL, "<add>%s</add>" % data)
    if bibCount > 0:
        print "\n[%s] done updating bibs, now committing" % time.ctime()
        try:
            if commitNonblocking:
                solrConnection.commitNonblocking()
            else:
                solrConnection.commit()
        except IOError:
            print "Connection reset when talking to Solr, skipping this commit and sleeping 10 sec."
            time.sleep(10)
        print "[%s] done committing" % time.ctime()
    else:
        print "[%s] no bibs updated, exiting" % time.ctime()
    return bibCount
def incrementalIndexingJob(commitNonblocking=0):
    """the incremental indexing job scans for changed bibliographic records from Horizon and updates
    them accordingly in Solr."""
    bibsToUpdate = horizonItemStatus.getChangedBibs(doDelete=0) 
    print "\n[%s] updating %s bibs" % ( time.ctime(), len(bibsToUpdate) )
    bibCount = 0
    recordBatch = []
    for bibOn in bibsToUpdate:
        bibCount +=1
        availAt = horizonItemStatus.availableAt( bibOn )
        newRecordOn = horizonItemStatus.updateSolrRecordAvailability( bibOn, availAt, doPost = 0)
        recordBatch.append( newRecordOn )
        # now delete item from queue
        horizonItemStatus.deleteFromIndexQueue( bibOn )
        print "-",
        if ( (bibCount % SOLR_INDEX_BATCH_SIZE) == 0):
            data = u''.join( recordBatch )
            print "*",
            resp = solrConnection.postURL( SOLR_UPDATE_URL, "<add>%s</add>" % data )
            recordBatch = []
        print ("+%s+" % bibOn) ,
    # now do last batch
    if len(recordBatch) > 0:
        data = u''.join( recordBatch )
        resp = solrConnection.postURL( SOLR_UPDATE_URL, "<add>%s</add>" % data )
    if bibCount > 0:
        print "\n[%s] done updating bibs, now committing" % time.ctime()
        try:
            if commitNonblocking:
                solrConnection.commitNonblocking()
            else:
                solrConnection.commit()
        except IOError:
            print "Connection reset when talking to Solr, skipping this commit and sleeping 10 sec."
            time.sleep(10)
        print "[%s] done committing" % time.ctime()
    else:
        print "[%s] no bibs updated, exiting" % time.ctime()
    return bibCount
def updateSolrRecordAvailability(bibNum, availableLocations=[], doPost=1):
    """this function updates an already-indexed record from Solr with new location information.
    It does this by grabbing the record in python format, evaling it as a python object, updating
    that one attribute and then re-serializing it as XML.  WHEW/UGH!  The method is so convoluted
    because Lucene does not allow you to update just one field in an indexed document -- you must
    delete the whole document and re-add it.
    """
    urlToGet = "%s?q=bib_num:%s&wt=python" % (SOLR_QUERY_URL, bibNum)
    # TODO: better error handling
    try:
        u = urllib.urlopen(urlToGet)
        data = u.read()
        u.close()
    except IOError:
        # connections sometimes reset; sleep a little while and try again -- if fails again, bomb out with error.
        print "IOError while trying to get URL %s" % urlToGet
        time.sleep(1.0)
        u = urllib.urlopen(urlToGet)
        data = u.read()
        u.close()
    try:
        evData = eval(data)
    except:
        print "error while trying to eval record for %s" % bibNum
    try:
        docOn = evData['response']['docs'][0]
    except:  # some error was reached; just toss it back
        return data
    docOn['available'] = availableLocations

    # now serialize the updated doc as XML
    ret = u""
    for kOn in docOn.keys():
        valueOn = docOn[kOn]
        if type(valueOn) == type([]):
            for subvalueOn in valueOn:
                sv = str(subvalueOn).replace("&", "&amp;").replace(
                    "<", "&amp;lt;").replace(">", "&amp;gt;")
                ret += u"""<field name="%s">%s</field>""" % (kOn, sv)
        else:
            val = str(valueOn).replace("&", "&amp;").replace(
                "<", "&amp;lt;").replace(">", "&amp;gt;")
            ret += u"""<field name="%s">%s</field>""" % (kOn, val)

    # now POST update if required
    if doPost:
        resp = solrConnection.postURL(SOLR_UPDATE_URL,
                                      "<add><doc>%s</doc></add>" % ret)
        return resp
    else:
        return "<doc>%s</doc>" % ret
def updateSolrRecordAvailability( bibNum, availableLocations = [], doPost =1 ):
    """this function updates an already-indexed record from Solr with new location information.
    It does this by grabbing the record in python format, evaling it as a python object, updating
    that one attribute and then re-serializing it as XML.  WHEW/UGH!  The method is so convoluted
    because Lucene does not allow you to update just one field in an indexed document -- you must
    delete the whole document and re-add it.
    """
    urlToGet = "%s?q=bib_num:%s&wt=python" % ( SOLR_QUERY_URL, bibNum )
    # TODO: better error handling
    try:
        u = urllib.urlopen(urlToGet)
        data = u.read()
        u.close()
    except IOError:
        # connections sometimes reset; sleep a little while and try again -- if fails again, bomb out with error.
        print "IOError while trying to get URL %s" % urlToGet
        time.sleep(1.0)
        u = urllib.urlopen(urlToGet)
        data = u.read()
        u.close()
    try:
        evData = eval(data)
    except:
        print "error while trying to eval record for %s" % bibNum 
    try:
        docOn = evData['response']['docs'][0]
    except:    # some error was reached; just toss it back
        return data    
    docOn['available'] = availableLocations
    
    # now serialize the updated doc as XML
    ret = u""
    for kOn in docOn.keys():
        valueOn = docOn[kOn]
        if type(valueOn) == type([]):
            for subvalueOn in valueOn:
                sv = str(subvalueOn).replace("&", "&amp;").replace("<", "&amp;lt;").replace(">", "&amp;gt;")
                ret  += u"""<field name="%s">%s</field>""" % ( kOn, sv )
        else:
            val = str(valueOn).replace("&", "&amp;").replace("<", "&amp;lt;").replace(">", "&amp;gt;")
            ret += u"""<field name="%s">%s</field>""" % ( kOn, val )
    
    # now POST update if required
    if doPost:
        resp = solrConnection.postURL( SOLR_UPDATE_URL, "<add><doc>%s</doc></add>" % ret )
        return resp
    else:
        return "<doc>%s</doc>" % ret
def processFile(filename, anselUnicodeConverter=None, nonblocking=0, pid=-1):
    # if nonblocking == 0 then all commits are blocking; if 1 then they are nonblocking.
    inStream = FileInputStream(filename)
    print "processFile>> %s" % filename
    marcReader = MarcStreamReader(inStream)
    data = ""
    count = 0
    lastCommitTime = None
    import time

    startTime = time.time()
    lastRecord = None
    lastBibNum = None
    m4j = None
    marcReaderTime = 0
    marcRecordToDictTime = 0
    extractorCreateTime = 0
    extractionTime = 0
    extractMethodTime = 0
    marcRecordForSolrTime = 0
    commitTime = 0
    updateTime = 0
    marcSerializeTime = 0
    accession = 0  # TODO: try and load serialized accession # from somewhere
    serializedRecord = None
    recordBatch = []
    # get default properties file
    from loadPropsFile import *

    props = loadPropsFile(DEFAULT_INDEXING_PROPERTIES_FILE)

    while marcReader.hasNext() and count < NUM_RECORDS_TO_ADD:
        # if pid > -1:
        #    print (".%d" % pid),
        # else:
        #    print ".",
        # CSDEBUG
        accession += 1
        count += 1
        # TODO: improve error handling here (main problem is that Marc4J will fall over
        # at the sight of a bad record and there's no way to get it to just skip over
        # a bad record -- so there is little we can do, except better error messages!
        try:
            mrTimeStart = time.time()
            marc4jRecord = marcReader.next()
            marcReaderTime += time.time() - mrTimeStart
        except:
            print "last record indexed was bib# %s " % lastBibNum
            import sys

            print "sys.exc_info is %s" % str(sys.exc_info())
            sys.exit(1)

        mrsTime = time.time()
        # try:
        rec = solrIndexingUtils.recordForSolr(marc4jRecord, anselUnicodeConverter, propsObject=props)
        # except:
        #    print "exception processing record, skipping"    # TODO: error handling
        #    continue
        marcRecordForSolrTime += time.time() - mrsTime
        extractionTime += rec._extractionTime
        extractorCreateTime += rec._extractorCreateTime
        marcRecordToDictTime += rec._marcRecordToDictTime
        extractMethodTime += rec._extractMethodTime

        if hasattr(rec, "bib_num"):
            recordBatch.append(rec)
            lastBibNum = rec.bib_num
        else:
            print "not adding record %s; no bib_num present!" % rec

        if (count % SOLR_INDEX_BATCH_SIZE) == 0:
            # nb. neither apache commons nor python urllib works right here!  Unicode gets mangled.
            # Must use postURL

            # fetch the item status info if required.
            if DO_ITEM_STATUS_INDEXING:
                bibs = [x.bib_num for x in recordBatch]
                avail = horizonItemStatus.availableAt(bibs)
                for x in recordBatch:
                    x.available = avail[x.bib_num]

            mrserTime = time.time()
            data = u"".join([x.serialize() for x in recordBatch])
            recordBatch = []
            marcSerializeTime += time.time() - mrserTime

            startUpdateTime = time.time()
            try:
                resp = solrConnection.postURL(SOLR_UPDATE_URL, "<add>%s</add>" % data)

            except IOError:
                print "Connection reset when talking to Solr, skipping this commit and sleeping 10 sec."
                time.sleep(10)
                resp = solrConnection.postURL(SOLR_UPDATE_URL, "<add>%s</add>" % data)
                # if it fails again here, we want to just bomb out.
            if resp.find('<result status="1"') > -1:
                print "\nError POSTing documents!  Response from Solr was\n\n%s\n" % resp
            # TODO: put in retry/continue code here for failed updates/slowdowns on Solr
            # TODO: parse result status and do something if there is an error (like print stacktrace)
            updateTime += time.time() - startUpdateTime
            if pid > -1:
                print ("*%d" % pid),
            else:
                print "*",
            if PRINT_SOLR_POST_DATA:
                print "\n\n<add>%s</add>\n\n" % data
            data = ""
        if (count % SOLR_COMMIT_BATCH_SIZE) == 0:
            try:
                print "committing..."
                beginCommitTime = time.time()
                if nonblocking:
                    print "doing nonblocking commit"
                    solrConnection.commitNonblocking()
                else:
                    solrConnection.commit()
                commitTime += time.time() - beginCommitTime
            except IOError:
                import time

                print "Connection reset when talking to Solr, skipping this commit and sleeping 10 sec."
                time.sleep(10)
            if lastCommitTime:
                thisBatchRate = (0.0 + SOLR_COMMIT_BATCH_SIZE) / (time.time() - lastCommitTime)
                overallRate = (0.0 + count) / (time.time() - startTime)
                if pid > -1:
                    print "\n>>>>>>>>>>>>COMMIT for PID %s<<<<<<<<<<<<<<<\n" % pid  # csdebug
                print "[%s] %s records indexed\t| This Batch: %.4f records/sec|\tOverall: %.4f records/sec" % (
                    time.ctime(),
                    count,
                    thisBatchRate,
                    overallRate,
                )
                if PROFILE:
                    print """\nfile->MARC: %.4f\nMARC->py: %.4f\npy->XML: %.4f\n""" % (
                        marcReaderTime,
                        marcRecordForSolrTime,
                        marcSerializeTime,
                    )
                    print """MARC to dict: %.4f\ncreate extractor: %.4f\nextraction: %.4f\n\textract method: %.4f""" % (
                        marcRecordToDictTime,
                        extractorCreateTime,
                        extractionTime,
                        extractMethodTime,
                    )
                    print """Solr Update: %.4f\nSolr Commit: %.4f\n""" % (updateTime, commitTime)
            lastCommitTime = time.time()
        if (count % SOLR_OPTIMIZE_BATCH_SIZE) == 0:
            print "[%s] FORCING OPTIMIZE..." % time.ctime()
            solrConnection.optimize()
            print "[%s] OPTIMIZE done" % time.ctime()
            System.gc()
    # do last batch here
    if len(recordBatch) > 0:
        print "doing final POST"
        mrserTime = time.time()
        data = "".join([x.serialize() for x in recordBatch])
        recordBatch = []

        resp = solrConnection.postURL(SOLR_UPDATE_URL, "<add>%s</add>" % data)
        if resp.find('<result status="1"') > -1:
            print "\nError POSTing documents!  Response from Solr was\n\n%s\n\n" % resp
    print "committing..."
    if nonblocking:
        solrConnection.commitNonblocking()
    else:
        solrConnection.commit()
    inStream.close()
    return count
Ejemplo n.º 6
0
def processFile(filename, anselUnicodeConverter=None, nonblocking=0, pid=-1):
    # if nonblocking == 0 then all commits are blocking; if 1 then they are nonblocking.
    inStream = FileInputStream(filename)
    print "processFile>> %s" % filename
    marcReader = MarcStreamReader(inStream)
    data = ""
    count = 0
    lastCommitTime = None
    import time
    startTime = time.time()
    lastRecord = None
    lastBibNum = None
    m4j = None
    marcReaderTime = 0
    marcRecordToDictTime = 0
    extractorCreateTime = 0
    extractionTime = 0
    extractMethodTime = 0
    marcRecordForSolrTime = 0
    commitTime = 0
    updateTime = 0
    marcSerializeTime = 0
    accession = 0  # TODO: try and load serialized accession # from somewhere
    serializedRecord = None
    recordBatch = []
    # get default properties file
    from loadPropsFile import *
    props = loadPropsFile(DEFAULT_INDEXING_PROPERTIES_FILE)

    while marcReader.hasNext() and count < NUM_RECORDS_TO_ADD:
        #if pid > -1:
        #    print (".%d" % pid),
        #else:
        #    print ".",
        # CSDEBUG
        accession += 1
        count += 1
        # TODO: improve error handling here (main problem is that Marc4J will fall over
        # at the sight of a bad record and there's no way to get it to just skip over
        # a bad record -- so there is little we can do, except better error messages!
        try:
            mrTimeStart = time.time()
            marc4jRecord = marcReader.next()
            marcReaderTime += (time.time() - mrTimeStart)
        except:
            print "last record indexed was bib# %s " % lastBibNum
            import sys
            print "sys.exc_info is %s" % str(sys.exc_info())
            sys.exit(1)

        mrsTime = time.time()
        #try:
        rec = solrIndexingUtils.recordForSolr(marc4jRecord,
                                              anselUnicodeConverter,
                                              propsObject=props)
        #except:
        #    print "exception processing record, skipping"    # TODO: error handling
        #    continue
        marcRecordForSolrTime += (time.time() - mrsTime)
        extractionTime += rec._extractionTime
        extractorCreateTime += rec._extractorCreateTime
        marcRecordToDictTime += rec._marcRecordToDictTime
        extractMethodTime += rec._extractMethodTime

        if hasattr(rec, "bib_num"):
            recordBatch.append(rec)
            lastBibNum = rec.bib_num
        else:
            print "not adding record %s; no bib_num present!" % rec

        if ((count % SOLR_INDEX_BATCH_SIZE) == 0):
            # nb. neither apache commons nor python urllib works right here!  Unicode gets mangled.
            #Must use postURL

            # fetch the item status info if required.
            if DO_ITEM_STATUS_INDEXING:
                bibs = [x.bib_num for x in recordBatch]
                avail = horizonItemStatus.availableAt(bibs)
                for x in recordBatch:
                    x.available = avail[x.bib_num]

            mrserTime = time.time()
            data = u''.join([x.serialize() for x in recordBatch])
            recordBatch = []
            marcSerializeTime += (time.time() - mrserTime)

            startUpdateTime = time.time()
            try:
                resp = solrConnection.postURL(SOLR_UPDATE_URL,
                                              "<add>%s</add>" % data)

            except IOError:
                print "Connection reset when talking to Solr, skipping this commit and sleeping 10 sec."
                time.sleep(10)
                resp = solrConnection.postURL(SOLR_UPDATE_URL,
                                              "<add>%s</add>" % data)
                # if it fails again here, we want to just bomb out.
            if resp.find('<result status="1"') > -1:
                print "\nError POSTing documents!  Response from Solr was\n\n%s\n" % resp
            # TODO: put in retry/continue code here for failed updates/slowdowns on Solr
            # TODO: parse result status and do something if there is an error (like print stacktrace)
            updateTime += (time.time() - startUpdateTime)
            if pid > -1:
                print("*%d" % pid),
            else:
                print "*",
            if PRINT_SOLR_POST_DATA:
                print "\n\n<add>%s</add>\n\n" % data
            data = ""
        if ((count % SOLR_COMMIT_BATCH_SIZE) == 0):
            try:
                print "committing..."
                beginCommitTime = time.time()
                if nonblocking:
                    print "doing nonblocking commit"
                    solrConnection.commitNonblocking()
                else:
                    solrConnection.commit()
                commitTime += (time.time() - beginCommitTime)
            except IOError:
                import time
                print "Connection reset when talking to Solr, skipping this commit and sleeping 10 sec."
                time.sleep(10)
            if lastCommitTime:
                thisBatchRate = ((0.0 + SOLR_COMMIT_BATCH_SIZE) /
                                 (time.time() - lastCommitTime))
                overallRate = ((0.0 + count) / (time.time() - startTime))
                if pid > -1:
                    print "\n>>>>>>>>>>>>COMMIT for PID %s<<<<<<<<<<<<<<<\n" % pid  # csdebug
                print "[%s] %s records indexed\t| This Batch: %.4f records/sec|\tOverall: %.4f records/sec" % (
                    time.ctime(), count, thisBatchRate, overallRate)
                if PROFILE:
                    print """\nfile->MARC: %.4f\nMARC->py: %.4f\npy->XML: %.4f\n""" % (
                        marcReaderTime, marcRecordForSolrTime,
                        marcSerializeTime)
                    print """MARC to dict: %.4f\ncreate extractor: %.4f\nextraction: %.4f\n\textract method: %.4f""" % (
                        marcRecordToDictTime, extractorCreateTime,
                        extractionTime, extractMethodTime)
                    print """Solr Update: %.4f\nSolr Commit: %.4f\n""" % (
                        updateTime, commitTime)
            lastCommitTime = time.time()
        if ((count % SOLR_OPTIMIZE_BATCH_SIZE) == 0):
            print "[%s] FORCING OPTIMIZE..." % time.ctime()
            solrConnection.optimize()
            print "[%s] OPTIMIZE done" % time.ctime()
            System.gc()
    # do last batch here
    if len(recordBatch) > 0:
        print "doing final POST"
        mrserTime = time.time()
        data = ''.join([x.serialize() for x in recordBatch])
        recordBatch = []

        resp = solrConnection.postURL(SOLR_UPDATE_URL, "<add>%s</add>" % data)
        if resp.find('<result status="1"') > -1:
            print "\nError POSTing documents!  Response from Solr was\n\n%s\n\n" % resp
    print "committing..."
    if nonblocking:
        solrConnection.commitNonblocking()
    else:
        solrConnection.commit()
    inStream.close()
    return count