Example #1
# the enclosing class statement is missing from this snippet; the name
# recordForSolr is inferred from the call sites in processFile below
class recordForSolr:
    def __init__(self, marcRecord, anselUnicodeConverter = None, accession = None, config = None, indexerProperties="config/indexes.properties"):
        start = time.time()
        self._marcRecordToDictTime = 0
        self._extractionTime = 0
        self._extractorCreateTime = 0
        self.bib_num = None
        
        if anselUnicodeConverter is None: 
            anselUnicodeConverter = AnselToUnicode()
        record = marcRecordToDict( marcRecord, anselUnicodeConverter )
        self._marcRecordToDictTime = time.time() - start
        
        start = time.time()
        
        if config is None:
            config = loadPropsFile(indexerProperties)
        extractor = MarcExtractor( record )
        self._extractorCreateTime = (time.time() - start)
        
        self.marc_record = str( record )
        
        fieldsToDo = [x.strip() for x in config['active.fields'].split(",")]
        _processors = __import__('processors', {},{},[''])
        for fieldOn in fieldsToDo:
            start = time.time()
            processorNameOn = config.get( "%s.processor" % fieldOn, "standardProcessor" )
            marcMapOn = config.get("%s.marcMap" % fieldOn, None)
            # do processing
            if processorNameOn == "standardProcessor":    # then just use the MARC extractor
                separatorOn = config.get("%s.marcMap.separator" % fieldOn, " ")
                stripTrailingPunctuation = int( config.get("%s.stripTrailingPunctuation" % fieldOn, "0") )
                processedResult = extractor.extract( marcMapOn, separator = separatorOn, stripTrailingPunctuation = stripTrailingPunctuation )
                if (processedResult is None or len(processedResult) == 0) and config.has_key("%s.marcMap.lastResort" % fieldOn):
                    marcMapOn = config.get("%s.marcMap.lastResort" % fieldOn, None)
                    processedResult = extractor.extract( marcMapOn, separator = separatorOn )
            else:
                processorOn = getattr( _processors, processorNameOn )    
                processedResult = processorOn( record, marcMap=marcMapOn, extractor=extractor )
            # do post-processing based upon type
            typeOn = config.get("%s.type" % fieldOn, "multi")
            if typeOn == "single" and ( type(processedResult) == type([])) and len(processedResult) > 0:
                postProcessedResult = processedResult[0]
            elif typeOn == "singleTranslation":
                if( type(processedResult) == type([]) ):
                    if len(processedResult) >= 1:
                        processedResult = processedResult[0]
                    else:
                        processedResult = None
                translationMapName = config.get("%s.translationMap" % fieldOn, None)
                if translationMapName is not None:
                    _translationMapModule = __import__( "config.codes" , {},{},[''] )
                    _translationMap = getattr( _translationMapModule, translationMapName)
                    postProcessedResult = _translationMap.get( processedResult, None)
                else:
                    # no translation map configured: fall back to the raw value so
                    # postProcessedResult is never left unbound below
                    postProcessedResult = processedResult
            else:
                postProcessedResult = processedResult
            # set own attribute
            if postProcessedResult is not None and len(postProcessedResult) > 0:
                setattr( self, fieldOn, postProcessedResult )
            self._extractionTime += ( time.time() - start )
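
# A hypothetical sketch of the mapping loadPropsFile returns and the loop
# above consumes. Key names mirror the "%s.marcMap"-style lookups in
# __init__; the field names and MARC tags are illustrative, not from the
# original properties file:
exampleIndexesProps = {
    "active.fields": "bib_num, title, language",
    "bib_num.marcMap": "907a",                 # handled by standardProcessor
    "bib_num.type": "single",                  # keep only the first hit
    "title.marcMap": "245ab",
    "title.marcMap.separator": " ",
    "title.stripTrailingPunctuation": "1",     # values arrive as strings
    "language.marcMap": "041a",
    "language.type": "singleTranslation",      # routed through config.codes
    "language.translationMap": "languageMap",
}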
Example #2
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Helios.  If not, see <http://www.gnu.org/licenses/>.

## indexes MARC records into Solr
from java.io import *
from java.net import *
from org.marc4j import *
from org.marc4j.converter.impl import *    # uncommented: AnselToUnicode() is used below
import time, sys, urllib
#import csv
from indexer import *    # uncommented: marcIndexingUtils / MarcExtractor are referenced below
from loadPropsFile import *

indexerConfig = loadPropsFile("config/indexer.properties")
SOLR_COMMIT_MESSAGE = indexerConfig.get("SOLR_COMMIT_MESSAGE")
SOLR_OPTIMIZE_MESSAGE = indexerConfig.get("SOLR_OPTIMIZE_MESSAGE")
SOLR_DELETE_ID_MESSAGE = indexerConfig.get("SOLR_DELETE_ID_MESSAGE")
SOLR_BASE_URL = indexerConfig.get("SOLR_BASE_URL",
                                  "http://localhost:8983/solr")
SOLR_UPDATE_URL = indexerConfig.get("SOLR_UPDATE_URL")
SOLR_QUERY_URL = indexerConfig.get("SOLR_QUERY_URL")
MAX_RECORDS_TO_ADD = indexerConfig.get("MAX_RECORDS_TO_ADD")
SOLR_INDEX_BATCH_SIZE = indexerConfig.get("SOLR_INDEX_BATCH_SIZE")
SOLR_COMMIT_BATCH_SIZE = indexerConfig.get("SOLR_COMMIT_BATCH_SIZE")
PRINT_SOLR_POST_DATA = indexerConfig.get("PRINT_SOLR_POST_DATA")
PRINT_SOLR_RESP_DATA = indexerConfig.get("PRINT_SOLR_RESP_DATA")
PROFILE = indexerConfig.get("PROFILE")
DO_ACCESSION = indexerConfig.get("DO_ACCESSION")  ## currently not used!
# the enclosing class statement is again missing; restored with the name
# inferred from the solrIndexingUtils.recordForSolr(...) call below
class recordForSolr:
 def __init__(self, marcRecord, anselUnicodeConverter = None, accession = None, profile=0, propsObject = None, indexerProperties="config/indexer.properties"):
     start = time.time()
     self._marcRecordToDictTime = 0
     self._extractionTime = 0
     self._extractorCreateTime = 0
     self._extractMethodTime = 0
     if profile:
         self._marcRecordToDictProfiling = {}
     
     if anselUnicodeConverter is None: 
         print "creating ansel -> unicode converter"    # csdebug
         anselUnicodeConverter = AnselToUnicode()
     if profile:
         record, _perfData = marcIndexingUtils.marcRecordToDict( marcRecord, anselUnicodeConverter )
         self._marcRecordToDictProfiling = _perfData
     else:
         record = marcIndexingUtils.marcRecordToDict( marcRecord, anselUnicodeConverter )
     self._marcRecordToDictTime = time.time() - start
     
     start = time.time()
     if not propsObject:
         config = loadPropsFile(indexerProperties)
     else:
         config = propsObject
     extractor = MarcExtractor( record )
     self._extractorCreateTime = (time.time() - start)
     
     # TODO: decide if this should be turn-offable or not
     self.marc_record = str( record )
     
     fieldsToDo = [x.strip() for x in config['active.fields'].split(",")]
     _processors = __import__('processors', {},{},[''])
     for fieldOn in fieldsToDo:
         start = time.time()
         processorNameOn = config.get( "%s.processor" % fieldOn, "standardProcessor" )
         marcMapOn = config.get("%s.marcMap" % fieldOn, None)
         # do processing
         if processorNameOn == "standardProcessor":    
             # then just use the MARC extractor
             separatorOn = config.get("%s.marcMap.separator" % fieldOn, " ")
             stripTrailingCommas = int( config.get("%s.stripTrailingCommas" % fieldOn, "0") )
             if stripTrailingCommas:
                 extractMethodStart = time.time()
                 processedResult = extractor.extract( marcMapOn, separator = separatorOn, trailingPunctuationToStrip = [","], stripTrailingPunctuation = 1 )
                 self._extractMethodTime += ( time.time() - extractMethodStart )
             else:
                 stripTrailingPunctuation = int( config.get("%s.stripTrailingPunctuation" % fieldOn, "0") )
                 try:
                     extractMethodStart = time.time()
                     processedResult = extractor.extract( marcMapOn, separator = separatorOn, stripTrailingPunctuation = stripTrailingPunctuation )
                     self._extractMethodTime += ( time.time() - extractMethodStart )
                 except AttributeError:
                     print "You do not have a correct marc mapping set up for field %s" % fieldOn
                     processedResult = None    # keep the name bound for the lastResort check below
             if (processedResult is None or len(processedResult) == 0) and config.has_key("%s.marcMap.lastResort" % fieldOn):
                 marcMapOn = config.get("%s.marcMap.lastResort" % fieldOn, None)
                 extractMethodStart = time.time()
                 processedResult = extractor.extract( marcMapOn, separator = separatorOn )
                 self._extractMethodTime += ( time.time() - extractMethodStart )
         else:
             # get and run custom processor
             processorOn = getattr( _processors, processorNameOn )    
             processedResult = processorOn( record, marcMap=marcMapOn, extractor=extractor )
         # do post-processing based upon type
         typeOn = config.get("%s.type" % fieldOn, "multi")
         if typeOn == "single" and ( type(processedResult) == type([])) and len(processedResult) > 0:
             postProcessedResult = processedResult[0]
         elif typeOn == "singleTranslation":
             if( type(processedResult) == type([]) ):
                 if len(processedResult) >= 1:
                     processedResult = processedResult[0]
                 else:
                     processedResult = None
             translationMapName = config.get("%s.translationMap" % fieldOn, None)
             if translationMapName is not None:
                 _translationMapModule = __import__( "config.codes" , {},{},[''] )
                 _translationMap = getattr( _translationMapModule, translationMapName)
                 postProcessedResult = _translationMap.get( processedResult, None)
             else:
                 # no translation map configured: fall back so postProcessedResult
                 # is never left unbound below
                 postProcessedResult = processedResult
         else:
             postProcessedResult = processedResult
         # deal with stripWhitespace after all other text manipulations
         stripWhitespace = int( config.get("%s.stripWhitespace" % fieldOn, "0") )
         if stripWhitespace:
             if isinstance(postProcessedResult, basestring):    # str or unicode
                 postProcessedResult = postProcessedResult.strip()
             elif isinstance(postProcessedResult, list):
                 postProcessedResult = [x.strip() for x in postProcessedResult]
         # FINALLY, set own attribute
         if postProcessedResult is not None and len(postProcessedResult) > 0:
             setattr( self, fieldOn, postProcessedResult )
         self._extractionTime += ( time.time() - start )
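
# The singleTranslation branch expects config/codes.py to expose plain dicts
# keyed by the raw MARC value. A hypothetical illustration (names and entries
# are examples, not from the original source):
languageMap = {
    "eng": "English",
    "fre": "French",
    "spa": "Spanish",
}
# wired up in the properties file with, e.g.:
#   language.type = singleTranslation
#   language.translationMap = languageMap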
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with Helios.  If not, see <http://www.gnu.org/licenses/>.

## indexes MARC records into Solr
from java.lang import *    # needed for System.gc() below
from java.io import *
from java.net import *
from org.marc4j import *
from org.marc4j.converter.impl import *
import time, sys, urllib

from indexer import *
from loadPropsFile import *

indexerConfig = loadPropsFile("config/indexer.properties")

SOLR_COMMIT_MESSAGE = indexerConfig.get( "SOLR_COMMIT_MESSAGE" )
SOLR_OPTIMIZE_MESSAGE = indexerConfig.get( "SOLR_OPTIMIZE_MESSAGE" )
SOLR_DELETE_ID_MESSAGE = indexerConfig.get("SOLR_DELETE_ID_MESSAGE" )
SOLR_BASE_URL = indexerConfig.get("SOLR_BASE_URL", "http://localhost:8888/solr" ) 
SOLR_UPDATE_URL = indexerConfig.get( "SOLR_UPDATE_URL" )
SOLR_QUERY_URL = indexerConfig.get( "SOLR_QUERY_URL" ) 
MAX_RECORDS_TO_ADD = indexerConfig.get("MAX_RECORDS_TO_ADD")
RECORDS_TO_SKIP = indexerConfig.get("RECORDS_TO_SKIP")
SOLR_INDEX_BATCH_SIZE = indexerConfig.get("SOLR_INDEX_BATCH_SIZE")
SOLR_COMMIT_BATCH_SIZE = indexerConfig.get("SOLR_COMMIT_BATCH_SIZE")
PRINT_SOLR_POST_DATA = indexerConfig.get("PRINT_SOLR_POST_DATA")
PRINT_SOLR_RESP_DATA = indexerConfig.get("PRINT_SOLR_RESP_DATA")
PROFILE = indexerConfig.get("PROFILE") 
DO_ACCESSION = indexerConfig.get("DO_ACCESSION") ## currently not used!
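
# For reference, these properties usually hold the classic Solr XML update
# messages; a sketch of plausible values (the real ones live in
# config/indexer.properties):
#   SOLR_COMMIT_MESSAGE      <commit/>
#   SOLR_OPTIMIZE_MESSAGE    <optimize/>
#   SOLR_DELETE_ID_MESSAGE   <delete><id>%s</id></delete>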
def processFile( filename, anselUnicodeConverter = None ):
    inStream = FileInputStream(filename)
    print "processing file <<%s>>" % filename
    marcReader = MarcStreamReader( inStream )
    data = ""
    count = 0
    lastCommitTime = None
    import time
    startTime = time.time()
    lastRecord = None
    m4j = None
    marcReaderTime = 0
    marcRecordToDictTime = 0
    extractorCreateTime = 0
    extractionTime = 0
    marcRecordForSolrTime = 0
    commitTime = 0
    updateTime = 0
    marcSerializeTime = 0
    
    indexesConfig = loadPropsFile("config/indexes.properties")
    
    accession = 0   # TODO: try and load pickled accession # from somewhere
    serializedRecord = None
    while marcReader.hasNext() and count < MAX_RECORDS_TO_ADD:
        print ".",
        accession += 1  
        count += 1
        try:
            mrTimeStart = time.time()                
            marc4jRecord = marcReader.next()
            marcReaderTime += ( time.time() - mrTimeStart )                
        except:
            print "last record indexed was %s " % serializedRecord
            import sys
            print "sys.exc_info is %s" % str(sys.exc_info())
            try:
                marc4jRecord = marcReader.next()    # unlikely to work but what the hey
            except:
                print "tried parsing again and failed. The lesson is, never try."
                sys.exit(1)
        mrsTime = time.time()
        if count < RECORDS_TO_SKIP:
            continue
        rec = recordForSolr( marc4jRecord, anselUnicodeConverter, config = indexesConfig )
        marcRecordForSolrTime += ( time.time() - mrsTime )
        extractionTime += rec._extractionTime
        extractorCreateTime += rec._extractorCreateTime
        marcRecordToDictTime += rec._marcRecordToDictTime
        mrserTime = time.time()            
        serializedRecord = rec.serialize()
        marcSerializeTime += ( time.time() - mrserTime )
        if rec.bib_num is not None:
            data += serializedRecord
            
        if( (count % SOLR_INDEX_BATCH_SIZE ) == 0):
            # nb. neither apache commons nor python urllib works right here!  Unicode gets mangled.  Must use postURL
            startUpdateTime = time.time()
            resp = postURL( SOLR_UPDATE_URL, "<add>%s</add>" % data)
            # TODO: put in retry/continue code here for failed updates/slowdowns on Solr
            updateTime += ( time.time() - startUpdateTime )
            print "*",
            if PRINT_SOLR_POST_DATA:
                print "\n\n<add>%s</add>\n\n" % data
            data = ""
        if( ( count % SOLR_COMMIT_BATCH_SIZE) == 0):
            try:
                print "committing..."
                beginCommitTime = time.time()
                resp = postURL( SOLR_UPDATE_URL, SOLR_COMMIT_MESSAGE)
                if PRINT_SOLR_RESP_DATA:
                    print resp
                commitTime += ( time.time() - beginCommitTime )
            except IOError:
                import time
                print "Connection reset when talking to Solr, skipping this commit and sleeping 10 sec."
                time.sleep(10)

            if lastCommitTime:
                thisBatchRate = ( ( 0.0 + SOLR_COMMIT_BATCH_SIZE) / (time.time() - lastCommitTime) )
                overallRate = ( ( 0.0 + count ) / ( time.time() - startTime) )
                print "[%s] %s records indexed\t| This Batch: %.4f records/sec|\tOverall: %.4f records/sec" % (time.ctime(), count, thisBatchRate, overallRate)
                if PROFILE:
                    print """\nfile->MARC: %.4f\nMARC->py: %.4f\npy->XML: %.4f\n""" % ( marcReaderTime, marcRecordForSolrTime, marcSerializeTime )
                    print """MARC to dict: %.4f\ncreate extractor: %.4f\nextraction: %.4f\n""" % ( marcRecordToDictTime, extractorCreateTime, extractionTime )
                    print """Solr Update: %.4f\nSolr Commit: %.4f\n""" % ( updateTime, commitTime )                          
            lastCommitTime = time.time()
            System.gc()
    # do last batch here
    if len(data) > 0:
        print "doing final POST"
        resp = postURL( SOLR_UPDATE_URL, "<add>%s</add>" % data)
        if PRINT_SOLR_RESP_DATA:
            print resp
    print "committing..."
    commit()
          
    inStream.close()
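
# The "must use postURL" note above is about charset handling: whatever does
# the POST has to encode the body as UTF-8 and declare that in the
# Content-type header, or Solr mangles non-ASCII characters. A minimal sketch
# of such a helper (hypothetical name; the real one comes from the indexer /
# solrConnection modules, and this assumes the Python 2 stdlib urllib2):
def postURLSketch(url, data):
    import urllib2
    if isinstance(data, unicode):
        data = data.encode("utf-8")    # encode explicitly before POSTing
    headers = {"Content-type": "text/xml; charset=utf-8"}
    return urllib2.urlopen(urllib2.Request(url, data, headers)).read()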
def processFile(filename, anselUnicodeConverter=None, nonblocking=0, pid=-1):
    # if nonblocking == 0 then all commits are blocking; if 1 then they are nonblocking.
    inStream = FileInputStream(filename)
    print "processFile>> %s" % filename
    marcReader = MarcStreamReader(inStream)
    data = ""
    count = 0
    lastCommitTime = None
    import time

    startTime = time.time()
    lastRecord = None
    lastBibNum = None
    m4j = None
    marcReaderTime = 0
    marcRecordToDictTime = 0
    extractorCreateTime = 0
    extractionTime = 0
    extractMethodTime = 0
    marcRecordForSolrTime = 0
    commitTime = 0
    updateTime = 0
    marcSerializeTime = 0
    accession = 0  # TODO: try and load serialized accession # from somewhere
    serializedRecord = None
    recordBatch = []
    # get default properties file
    from loadPropsFile import *

    props = loadPropsFile(DEFAULT_INDEXING_PROPERTIES_FILE)

    while marcReader.hasNext() and count < NUM_RECORDS_TO_ADD:
        # if pid > -1:
        #    print (".%d" % pid),
        # else:
        #    print ".",
        # CSDEBUG
        accession += 1
        count += 1
        # TODO: improve error handling here (main problem is that Marc4J will fall over
        # at the sight of a bad record and there's no way to get it to just skip over
        # a bad record -- so there is little we can do, except better error messages!)
        try:
            mrTimeStart = time.time()
            marc4jRecord = marcReader.next()
            marcReaderTime += time.time() - mrTimeStart
        except:
            print "last record indexed was bib# %s " % lastBibNum
            import sys

            print "sys.exc_info is %s" % str(sys.exc_info())
            sys.exit(1)

        mrsTime = time.time()
        # try:
        rec = solrIndexingUtils.recordForSolr(marc4jRecord, anselUnicodeConverter, propsObject=props)
        # except:
        #    print "exception processing record, skipping"    # TODO: error handling
        #    continue
        marcRecordForSolrTime += time.time() - mrsTime
        extractionTime += rec._extractionTime
        extractorCreateTime += rec._extractorCreateTime
        marcRecordToDictTime += rec._marcRecordToDictTime
        extractMethodTime += rec._extractMethodTime

        if hasattr(rec, "bib_num"):
            recordBatch.append(rec)
            lastBibNum = rec.bib_num
        else:
            print "not adding record %s; no bib_num present!" % rec

        if (count % SOLR_INDEX_BATCH_SIZE) == 0:
            # nb. neither apache commons nor python urllib works right here!  Unicode gets mangled.
            # Must use postURL

            # fetch the item status info if required.
            if DO_ITEM_STATUS_INDEXING:
                bibs = [x.bib_num for x in recordBatch]
                avail = horizonItemStatus.availableAt(bibs)
                for x in recordBatch:
                    x.available = avail[x.bib_num]

            mrserTime = time.time()
            data = u"".join([x.serialize() for x in recordBatch])
            recordBatch = []
            marcSerializeTime += time.time() - mrserTime

            startUpdateTime = time.time()
            try:
                resp = solrConnection.postURL(SOLR_UPDATE_URL, "<add>%s</add>" % data)

            except IOError:
                print "Connection reset when talking to Solr, skipping this commit and sleeping 10 sec."
                time.sleep(10)
                resp = solrConnection.postURL(SOLR_UPDATE_URL, "<add>%s</add>" % data)
                # if it fails again here, we want to just bomb out.
            if resp.find('<result status="1"') > -1:
                print "\nError POSTing documents!  Response from Solr was\n\n%s\n" % resp
            # TODO: put in retry/continue code here for failed updates/slowdowns on Solr
            # TODO: parse result status and do something if there is an error (like print stacktrace)
            updateTime += time.time() - startUpdateTime
            if pid > -1:
                print ("*%d" % pid),
            else:
                print "*",
            if PRINT_SOLR_POST_DATA:
                print "\n\n<add>%s</add>\n\n" % data
            data = ""
        if (count % SOLR_COMMIT_BATCH_SIZE) == 0:
            try:
                print "committing..."
                beginCommitTime = time.time()
                if nonblocking:
                    print "doing nonblocking commit"
                    solrConnection.commitNonblocking()
                else:
                    solrConnection.commit()
                commitTime += time.time() - beginCommitTime
            except IOError:
                import time

                print "Connection reset when talking to Solr, skipping this commit and sleeping 10 sec."
                time.sleep(10)
            if lastCommitTime:
                thisBatchRate = (0.0 + SOLR_COMMIT_BATCH_SIZE) / (time.time() - lastCommitTime)
                overallRate = (0.0 + count) / (time.time() - startTime)
                if pid > -1:
                    print "\n>>>>>>>>>>>>COMMIT for PID %s<<<<<<<<<<<<<<<\n" % pid  # csdebug
                print "[%s] %s records indexed\t| This Batch: %.4f records/sec|\tOverall: %.4f records/sec" % (
                    time.ctime(),
                    count,
                    thisBatchRate,
                    overallRate,
                )
                if PROFILE:
                    print """\nfile->MARC: %.4f\nMARC->py: %.4f\npy->XML: %.4f\n""" % (
                        marcReaderTime,
                        marcRecordForSolrTime,
                        marcSerializeTime,
                    )
                    print """MARC to dict: %.4f\ncreate extractor: %.4f\nextraction: %.4f\n\textract method: %.4f""" % (
                        marcRecordToDictTime,
                        extractorCreateTime,
                        extractionTime,
                        extractMethodTime,
                    )
                    print """Solr Update: %.4f\nSolr Commit: %.4f\n""" % (updateTime, commitTime)
            lastCommitTime = time.time()
        if (count % SOLR_OPTIMIZE_BATCH_SIZE) == 0:
            print "[%s] FORCING OPTIMIZE..." % time.ctime()
            solrConnection.optimize()
            print "[%s] OPTIMIZE done" % time.ctime()
            System.gc()
    # do last batch here
    if len(recordBatch) > 0:
        print "doing final POST"
        mrserTime = time.time()
        data = "".join([x.serialize() for x in recordBatch])
        recordBatch = []

        resp = solrConnection.postURL(SOLR_UPDATE_URL, "<add>%s</add>" % data)
        if resp.find('<result status="1"') > -1:
            print "\nError POSTing documents!  Response from Solr was\n\n%s\n\n" % resp
    print "committing..."
    if nonblocking:
        solrConnection.commitNonblocking()
    else:
        solrConnection.commit()
    inStream.close()
    return count
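
# Sketch of the retry/continue behaviour the TODOs above ask for (hypothetical
# helper, not part of the original code): retry the POST a few times with a
# pause, then give up on the batch instead of killing the whole run.
def postWithRetry(url, data, attempts=3, pauseSecs=10):
    for attempt in range(attempts):
        try:
            resp = solrConnection.postURL(url, data)
            if resp.find('<result status="1"') == -1:
                return resp    # Solr accepted the batch
            print "Solr reported an error, will retry: %s" % resp
        except IOError:
            print "connection reset, sleeping %d sec before retry" % pauseSecs
        time.sleep(pauseSecs)
    return None    # caller should skip this batch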
"""
from java.lang import *
from java.io import *
from java.sql import *
from java.util import *

import sys, urllib, time

from loadPropsFile import *

from config.solr import *
import solrConnection

#1. get connection to the database

props = loadPropsFile("config/ILSConn.properties")

# TODO: decide if this should use splcommons.HorizonConn instead
dbType = props['db.type']
 
if dbType != "sybase":
    print "database type %s not yet supported." % dbType
    sys.exit(1)
else:
    connString = "jdbc:jtds:%(db.type)s://%(db.server)s:%(db.port)s/%(db.name)s" % props
    Class.forName( props['db.driver'] ).newInstance()
    conn = DriverManager.getConnection( connString, props['db.user'], props['db.password'] )


def updateSolrRecordAvailability( bibNum, availableLocations = [], doPost = 1 ):
    """this function updates an already-indexed record from Solr with new location information.
"""

# the enclosing class statement is again missing; restored with the name
# inferred from the recordForSolr(...) call sites in processFile
class recordForSolr:
    def __init__(self,
                 marcRecord,
                 anselUnicodeConverter=None,
                 accession=None,
                 profile=0,
                 propsObject=None,
                 indexerProperties="config/indexer.properties"):
        start = time.time()
        self._marcRecordToDictTime = 0
        self._extractionTime = 0
        self._extractorCreateTime = 0
        self._extractMethodTime = 0
        if profile:
            self._marcRecordToDictProfiling = {}

        if anselUnicodeConverter is None:
            print "creating ansel -> unicode converter"  # csdebug
            anselUnicodeConverter = AnselToUnicode()
        if profile:
            record, _perfData = marcIndexingUtils.marcRecordToDict(
                marcRecord, anselUnicodeConverter)
            self._marcRecordToDictProfiling = _perfData
        else:
            record = marcIndexingUtils.marcRecordToDict(
                marcRecord, anselUnicodeConverter)
        self._marcRecordToDictTime = time.time() - start

        start = time.time()
        if not propsObject:
            config = loadPropsFile(indexerProperties)
        else:
            config = propsObject
        extractor = MarcExtractor(record)
        self._extractorCreateTime = (time.time() - start)

        # TODO: decide if this should be turn-offable or not
        self.marc_record = str(record)

        fieldsToDo = [x.strip() for x in config['active.fields'].split(",")]
        _processors = __import__('processors', {}, {}, [''])
        for fieldOn in fieldsToDo:
            start = time.time()
            processorNameOn = config.get("%s.processor" % fieldOn,
                                         "standardProcessor")
            marcMapOn = config.get("%s.marcMap" % fieldOn, None)
            # do processing
            if processorNameOn == "standardProcessor":
                # then just use the MARC extractor
                separatorOn = config.get("%s.marcMap.separator" % fieldOn, " ")
                stripTrailingCommas = int(
                    config.get("%s.stripTrailingCommas" % fieldOn, "0"))
                if stripTrailingCommas:
                    extractMethodStart = time.time()
                    processedResult = extractor.extract(
                        marcMapOn,
                        separator=separatorOn,
                        trailingPunctuationToStrip=[","],
                        stripTrailingPunctuation=1)
                    self._extractMethodTime += (time.time() -
                                                extractMethodStart)
                else:
                    stripTrailingPunctuation = int(
                        config.get("%s.stripTrailingPunctuation" % fieldOn,
                                   "0"))
                    try:
                        extractMethodStart = time.time()
                        processedResult = extractor.extract(
                            marcMapOn,
                            separator=separatorOn,
                            stripTrailingPunctuation=stripTrailingPunctuation)
                        self._extractMethodTime += (time.time() -
                                                    extractMethodStart)
                    except AttributeError:
                        print "You do not have a correct marc mapping set up for field %s" % fieldOn
                        processedResult = None  # keep the name bound for the lastResort check below
                if (processedResult is None
                        or len(processedResult) == 0) and config.has_key(
                            "%s.marcMap.lastResort" % fieldOn):
                    marcMapOn = config.get("%s.marcMap.lastResort" % fieldOn,
                                           None)
                    extractMethodStart = time.time()
                    processedResult = extractor.extract(marcMapOn,
                                                        separator=separatorOn)
                    self._extractMethodTime += (time.time() -
                                                extractMethodStart)
            else:
                # get and run custom processor
                processorOn = getattr(_processors, processorNameOn)
                processedResult = processorOn(record,
                                              marcMap=marcMapOn,
                                              extractor=extractor)
            # do post-processing based upon type
            typeOn = config.get("%s.type" % fieldOn, "multi")
            if typeOn == "single" and (type(processedResult) == type(
                [])) and len(processedResult) > 0:
                postProcessedResult = processedResult[0]
            elif typeOn == "singleTranslation":
                if (type(processedResult) == type([])):
                    if len(processedResult) >= 1:
                        processedResult = processedResult[0]
                    else:
                        processedResult = None
                translationMapName = config.get("%s.translationMap" % fieldOn,
                                                None)
                if translationMapName is not None:
                    _translationMapModule = __import__("config.codes", {}, {},
                                                       [''])
                    _translationMap = getattr(_translationMapModule,
                                              translationMapName)
                    postProcessedResult = _translationMap.get(
                        processedResult, None)
                else:
                    # no translation map configured: fall back so
                    # postProcessedResult is never left unbound below
                    postProcessedResult = processedResult
            else:
                postProcessedResult = processedResult
            # deal with stripWhitespace after all other text manipulations
            stripWhitespace = int(
                config.get("%s.stripWhitespace" % fieldOn, "0"))
            if stripWhitespace:
                if isinstance(postProcessedResult, basestring):  # str or unicode
                    postProcessedResult = postProcessedResult.strip()
                elif isinstance(postProcessedResult, list):
                    postProcessedResult = [
                        x.strip() for x in postProcessedResult
                    ]
            # FINALLY, set own attribute
            if postProcessedResult is not None and len(
                    postProcessedResult) > 0:
                setattr(self, fieldOn, postProcessedResult)
            self._extractionTime += (time.time() - start)
Example #10
def processFile(filename, anselUnicodeConverter=None, nonblocking=0, pid=-1):
    # if nonblocking == 0 then all commits are blocking; if 1 then they are nonblocking.
    inStream = FileInputStream(filename)
    print "processFile>> %s" % filename
    marcReader = MarcStreamReader(inStream)
    data = ""
    count = 0
    lastCommitTime = None
    import time
    startTime = time.time()
    lastRecord = None
    lastBibNum = None
    m4j = None
    marcReaderTime = 0
    marcRecordToDictTime = 0
    extractorCreateTime = 0
    extractionTime = 0
    extractMethodTime = 0
    marcRecordForSolrTime = 0
    commitTime = 0
    updateTime = 0
    marcSerializeTime = 0
    accession = 0  # TODO: try and load serialized accession # from somewhere
    serializedRecord = None
    recordBatch = []
    # get default properties file
    from loadPropsFile import *
    props = loadPropsFile(DEFAULT_INDEXING_PROPERTIES_FILE)

    while marcReader.hasNext() and count < NUM_RECORDS_TO_ADD:
        #if pid > -1:
        #    print (".%d" % pid),
        #else:
        #    print ".",
        # CSDEBUG
        accession += 1
        count += 1
        # TODO: improve error handling here (main problem is that Marc4J will fall over
        # at the sight of a bad record and there's no way to get it to just skip over
        # a bad record -- so there is little we can do, except better error messages!)
        try:
            mrTimeStart = time.time()
            marc4jRecord = marcReader.next()
            marcReaderTime += (time.time() - mrTimeStart)
        except:
            print "last record indexed was bib# %s " % lastBibNum
            import sys
            print "sys.exc_info is %s" % str(sys.exc_info())
            sys.exit(1)

        mrsTime = time.time()
        #try:
        rec = solrIndexingUtils.recordForSolr(marc4jRecord,
                                              anselUnicodeConverter,
                                              propsObject=props)
        #except:
        #    print "exception processing record, skipping"    # TODO: error handling
        #    continue
        marcRecordForSolrTime += (time.time() - mrsTime)
        extractionTime += rec._extractionTime
        extractorCreateTime += rec._extractorCreateTime
        marcRecordToDictTime += rec._marcRecordToDictTime
        extractMethodTime += rec._extractMethodTime

        if hasattr(rec, "bib_num"):
            recordBatch.append(rec)
            lastBibNum = rec.bib_num
        else:
            print "not adding record %s; no bib_num present!" % rec

        if ((count % SOLR_INDEX_BATCH_SIZE) == 0):
            # nb. neither apache commons nor python urllib works right here!  Unicode gets mangled.
            #Must use postURL

            # fetch the item status info if required.
            if DO_ITEM_STATUS_INDEXING:
                bibs = [x.bib_num for x in recordBatch]
                avail = horizonItemStatus.availableAt(bibs)
                for x in recordBatch:
                    x.available = avail[x.bib_num]

            mrserTime = time.time()
            data = u''.join([x.serialize() for x in recordBatch])
            recordBatch = []
            marcSerializeTime += (time.time() - mrserTime)

            startUpdateTime = time.time()
            try:
                resp = solrConnection.postURL(SOLR_UPDATE_URL,
                                              "<add>%s</add>" % data)

            except IOError:
                print "Connection reset when talking to Solr, skipping this commit and sleeping 10 sec."
                time.sleep(10)
                resp = solrConnection.postURL(SOLR_UPDATE_URL,
                                              "<add>%s</add>" % data)
                # if it fails again here, we want to just bomb out.
            if resp.find('<result status="1"') > -1:
                print "\nError POSTing documents!  Response from Solr was\n\n%s\n" % resp
            # TODO: put in retry/continue code here for failed updates/slowdowns on Solr
            # TODO: parse result status and do something if there is an error (like print stacktrace)
            updateTime += (time.time() - startUpdateTime)
            if pid > -1:
                print("*%d" % pid),
            else:
                print "*",
            if PRINT_SOLR_POST_DATA:
                print "\n\n<add>%s</add>\n\n" % data
            data = ""
        if ((count % SOLR_COMMIT_BATCH_SIZE) == 0):
            try:
                print "committing..."
                beginCommitTime = time.time()
                if nonblocking:
                    print "doing nonblocking commit"
                    solrConnection.commitNonblocking()
                else:
                    solrConnection.commit()
                commitTime += (time.time() - beginCommitTime)
            except IOError:
                import time
                print "Connection reset when talking to Solr, skipping this commit and sleeping 10 sec."
                time.sleep(10)
            if lastCommitTime:
                thisBatchRate = ((0.0 + SOLR_COMMIT_BATCH_SIZE) /
                                 (time.time() - lastCommitTime))
                overallRate = ((0.0 + count) / (time.time() - startTime))
                if pid > -1:
                    print "\n>>>>>>>>>>>>COMMIT for PID %s<<<<<<<<<<<<<<<\n" % pid  # csdebug
                print "[%s] %s records indexed\t| This Batch: %.4f records/sec|\tOverall: %.4f records/sec" % (
                    time.ctime(), count, thisBatchRate, overallRate)
                if PROFILE:
                    print """\nfile->MARC: %.4f\nMARC->py: %.4f\npy->XML: %.4f\n""" % (
                        marcReaderTime, marcRecordForSolrTime,
                        marcSerializeTime)
                    print """MARC to dict: %.4f\ncreate extractor: %.4f\nextraction: %.4f\n\textract method: %.4f""" % (
                        marcRecordToDictTime, extractorCreateTime,
                        extractionTime, extractMethodTime)
                    print """Solr Update: %.4f\nSolr Commit: %.4f\n""" % (
                        updateTime, commitTime)
            lastCommitTime = time.time()
        if ((count % SOLR_OPTIMIZE_BATCH_SIZE) == 0):
            print "[%s] FORCING OPTIMIZE..." % time.ctime()
            solrConnection.optimize()
            print "[%s] OPTIMIZE done" % time.ctime()
            System.gc()
    # do last batch here
    if len(recordBatch) > 0:
        print "doing final POST"
        mrserTime = time.time()
        data = ''.join([x.serialize() for x in recordBatch])
        recordBatch = []

        resp = solrConnection.postURL(SOLR_UPDATE_URL, "<add>%s</add>" % data)
        if resp.find('<result status="1"') > -1:
            print "\nError POSTing documents!  Response from Solr was\n\n%s\n\n" % resp
    print "committing..."
    if nonblocking:
        solrConnection.commitNonblocking()
    else:
        solrConnection.commit()
    inStream.close()
    return count
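
# A hypothetical driver for the processFile variants above (assumes the
# marc4j AnselToUnicode converter imported in the module headers):
if __name__ == "__main__":
    converter = AnselToUnicode()
    total = processFile(sys.argv[1], anselUnicodeConverter=converter)
    print "indexed %d records" % total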