def processFilesInDirectory(dirName, anselUnicodeConverter = None, commitNonblocking =0, numThreads = 1, deleteAfterIndexing = 1):
    """Index every PMS and MARC file found in dirName, then commit once.

    Stage 1 -- PMS files (always sequential): every "PMS*.DAT" file is handed
    to processPMSFile(), which returns (marcFilename, deletedBibs).  When a
    MARC filename comes back it is indexed with indexerDriver.processFile(),
    both the generated MARC file and the PMS file are removed, and every bib
    in deletedBibs is passed to indexerDriver.deleteRecord().  When no
    filename comes back the PMS file is simply removed.

    Stage 2 -- MARC files: everything matching *.MARC, *.marc, *.dat, *.DAT
    or *scriblio* in dirName is de-duplicated, sorted and indexed.

    Parameters:
        dirName               -- directory to scan.
        anselUnicodeConverter -- converter passed through to
                                 indexerDriver.processFile() for sequential
                                 work; threaded jobs each build their own
                                 AnselToUnicode() instead.
        commitNonblocking     -- if true the final commit uses
                                 solrConnection.commitNonblocking(),
                                 otherwise solrConnection.commit().
        numThreads            -- <= 1 indexes MARC files one at a time; larger
                                 values index them in batches of at most
                                 numThreads concurrent threads.
        deleteAfterIndexing   -- if true each MARC file is removed once it has
                                 been indexed.  PMS files (and the MARC files
                                 generated from them) are always removed.

    Returns the sum of the values returned by indexerDriver.processFile()
    for the sequentially indexed MARC files (presumably record counts --
    TODO confirm against indexerDriver).  Threaded jobs and the PMS stage do
    not contribute, so the result is 0 in those cases.
    """
    updatedAnyRecords = 0
    count = 0

    # ---- stage 1: PMS files ------------------------------------------------
    for pmsFileOn in glob.glob( "%s/PMS*.DAT" % dirName ):
        print("processing PMS file %s" % pmsFileOn)
        marcFilenameOn, deletedBibsOn = processPMSFile( pmsFileOn )
        if not marcFilenameOn:
            print("no records to index")
            os.remove( pmsFileOn )
            continue

        print("processing MARC file %s" % marcFilenameOn)
        indexerDriver.processFile( marcFilenameOn, anselUnicodeConverter )

        # now that we are done processing the file, we delete it.
        print("deleting MARC file %s " % marcFilenameOn)
        os.remove( marcFilenameOn )
        print("deleting PMS file %s" % pmsFileOn)
        os.remove( pmsFileOn )

        if deletedBibsOn:
            print("processing deleted bibs from MARC file %s" % marcFilenameOn)
            for bibOn in deletedBibsOn:
                print("deleting bib %s" % bibOn)
                indexerDriver.deleteRecord( bibOn )
        updatedAnyRecords = 1

    # ---- stage 2: MARC files -----------------------------------------------
    print("[%s] now checking for MARC files" % time.ctime())
    # A file can match more than one pattern, so collect the matches as
    # dictionary keys to de-duplicate them (works on old Jython, which has no
    # built-in set type).
    marcFileDict = {}
    for patternOn in ( "*.MARC", "*.marc", "*.dat", "*.DAT", "*scriblio*" ):
        for fileOn in glob.glob( "%s/%s" % (dirName, patternOn) ):
            marcFileDict[fileOn] = None
    # list() keeps this working where keys() is not a sortable list.
    marcFiles = list( marcFileDict.keys() )
    marcFiles.sort()

    numMarcFiles = len( marcFiles )
    print("[%s] found %d files to process." % (time.ctime(), numMarcFiles))

    if numThreads <= 1:
        for fileOn in marcFiles:
            print("processing MARC file %s" % fileOn)
            countOn = indexerDriver.processFile( fileOn, anselUnicodeConverter, nonblocking=1 ) # csdebug: added nonblocking here
            # Accumulate instead of overwriting, so the result covers every
            # file; "or 0" tolerates a processFile() that returns None.
            count += countOn or 0
            updatedAnyRecords = 1
            if deleteAfterIndexing:
                os.remove( fileOn )
    else:
        # Index in batches of at most numThreads files.  When
        # numThreads >= numMarcFiles this is a single batch with one thread
        # per file; otherwise later batches start after the previous one has
        # been joined, so no file is silently left unprocessed.
        for batchStart in range( 0, numMarcFiles, numThreads ):
            running = []
            pidOn = batchStart
            for fileOn in marcFiles[batchStart:batchStart + numThreads]:
                # was getting weird problems with multithreading
                # (AttributeErrors when iterating over controlFields in a
                # MARC record), hence a separate converter per job.
                convOn = AnselToUnicode()
                jobOn = indexerDriver.processFileJob( fileOn, convOn, nonblocking = 1, pid = pidOn ) # csdebug: handle nonblocking option
                threadOn = Thread( jobOn, "process file job %s" % pidOn )
                running.append( (threadOn, jobOn) )
                print("starting thread %s processing file %s" % (pidOn, fileOn))
                threadOn.start()
                pidOn += 1
                updatedAnyRecords = 1
            print("joining threads")
            for threadOn, jobOn in running:
                threadOn.join()
                # TODO: make sure the thread was successful before nuking.
                if deleteAfterIndexing:
                    print("deleting %s" % jobOn.filename)
                    os.remove( jobOn.filename )

    # finally, do a single commit covering everything indexed above.
    if updatedAnyRecords:
        print("[%s] starting final commit" % time.ctime())
        if commitNonblocking:
            solrConnection.commitNonblocking()
        else:
            solrConnection.commit()
        print("[%s] done committing" % time.ctime())
    return count
        fOut = open( fOutName, "w" )
        fOut.write( data )
        fOut.flush()
        fOut.close()
    else:
        fOutName = None
    return fOutName, deletedBibs

if __name__ == '__main__':
    # Script entry point: index every PMS*.DAT file found in HORIZON_BASE_DIR.
    pmsFiles = glob.glob( "%s/PMS*.DAT" % HORIZON_BASE_DIR )
    for fileOn in pmsFiles:
        print("processing PMS file %s" % fileOn)
        processedFilenameOn, deletedBibsOn = processPMSFile(fileOn)

        if not processedFilenameOn:
            # Nothing was produced from this PMS file; just discard it.
            print("no records to index")
            os.remove( fileOn )
            continue

        print("processing MARC file %s" % processedFilenameOn)
        indexerDriver.processFile( processedFilenameOn )

        # Indexing is finished, so both the generated MARC file and the
        # PMS file it came from are removed.
        print("deleting MARC file %s " % processedFilenameOn)
        os.remove( processedFilenameOn )
        print("deleting PMS file %s" % fileOn)
        os.remove( fileOn )

        # Apply the deletions reported alongside the MARC file, if any.
        if deletedBibsOn:
            print("processing deleted bibs from MARC file %s" % processedFilenameOn)
            for bibOn in deletedBibsOn:
                print("deleting bib %s" % bibOn)
                indexerDriver.deleteRecord( bibOn )
# Example #3
# 0
def processFilesInDirectory(dirName,
                            anselUnicodeConverter=None,
                            commitNonblocking=0,
                            numThreads=1,
                            deleteAfterIndexing=1):
    """Process the PMS and MARC files queued in dirName and commit the result.

    PMS files ("PMS*.DAT") are handled first and always one at a time:
    processPMSFile() yields (marcFilename, deletedBibs); a returned MARC file
    is indexed via indexerDriver.processFile(), the MARC and PMS files are
    removed, and each entry of deletedBibs goes to
    indexerDriver.deleteRecord().  A PMS file that yields no MARC filename is
    removed without indexing.

    MARC files (*.MARC, *.marc, *.dat, *.DAT, *scriblio*) are then
    de-duplicated, sorted and indexed.

    Parameters:
        dirName               -- directory to scan.
        anselUnicodeConverter -- forwarded to indexerDriver.processFile() on
                                 the sequential paths; each threaded job gets
                                 a fresh AnselToUnicode() instead.
        commitNonblocking     -- selects solrConnection.commitNonblocking()
                                 over solrConnection.commit() for the final
                                 commit.
        numThreads            -- <= 1 means sequential MARC indexing; a larger
                                 value runs batches of at most numThreads
                                 threads.
        deleteAfterIndexing   -- remove each MARC file after indexing it.
                                 Files from the PMS stage are removed
                                 regardless of this flag.

    Returns the running total of whatever indexerDriver.processFile() returns
    for sequentially indexed MARC files (presumably the number of records --
    TODO confirm).  The PMS stage and threaded jobs add nothing to it.
    """
    updatedAnyRecords = 0
    count = 0

    # ---- PMS stage ---------------------------------------------------------
    for pmsPath in glob.glob("%s/PMS*.DAT" % dirName):
        print("processing PMS file %s" % pmsPath)
        marcPath, deletedBibs = processPMSFile(pmsPath)
        if not marcPath:
            print("no records to index")
            os.remove(pmsPath)
            continue

        print("processing MARC file %s" % marcPath)
        indexerDriver.processFile(marcPath, anselUnicodeConverter)

        # now that we are done processing the file, we delete it.
        print("deleting MARC file %s " % marcPath)
        os.remove(marcPath)
        print("deleting PMS file %s" % pmsPath)
        os.remove(pmsPath)

        if deletedBibs:
            print("processing deleted bibs from MARC file %s" % marcPath)
            for bibOn in deletedBibs:
                print("deleting bib %s" % bibOn)
                indexerDriver.deleteRecord(bibOn)
        updatedAnyRecords = 1

    # ---- MARC stage --------------------------------------------------------
    print("[%s] now checking for MARC files" % time.ctime())
    # dedupe here in case a file matches more than one glob; dictionary keys
    # are used because that also works on Jython versions without sets.
    uniqueMarcFiles = {}
    for pattern in ("*.MARC", "*.marc", "*.dat", "*.DAT", "*scriblio*"):
        for path in glob.glob("%s/%s" % (dirName, pattern)):
            uniqueMarcFiles[path] = None
    # list() so the sort also works where keys() is a view, not a list.
    marcFiles = list(uniqueMarcFiles.keys())
    marcFiles.sort()

    numMarcFiles = len(marcFiles)
    print("[%s] found %d files to process." % (time.ctime(), numMarcFiles))

    if numThreads <= 1:
        for path in marcFiles:
            print("processing MARC file %s" % path)
            fileCount = indexerDriver.processFile(
                path, anselUnicodeConverter,
                nonblocking=1)  # csdebug: added nonblocking here
            # Sum over all files rather than keeping only the last value;
            # "or 0" guards against a None return.
            count += fileCount or 0
            updatedAnyRecords = 1
            if deleteAfterIndexing:
                os.remove(path)
    else:
        # Run the files through in batches no larger than numThreads.  With
        # numThreads >= numMarcFiles that is one batch, one thread per file;
        # with fewer threads than files the remaining files are picked up by
        # later batches instead of being skipped.
        for batchStart in range(0, numMarcFiles, numThreads):
            started = []
            pid = batchStart
            for path in marcFiles[batchStart:batchStart + numThreads]:
                # separate converter per job: sharing one was suspected of
                # causing AttributeErrors while iterating controlFields.
                converter = AnselToUnicode()
                job = indexerDriver.processFileJob(
                    path, converter, nonblocking=1,
                    pid=pid)  # csdebug: handle nonblocking option
                worker = Thread(job, "process file job %s" % pid)
                started.append((worker, job))
                print("starting thread %s processing file %s" % (pid, path))
                worker.start()
                pid += 1
                updatedAnyRecords = 1
            print("joining threads")
            for worker, job in started:
                worker.join()
                # TODO: make sure the thread was successful before nuking.
                if deleteAfterIndexing:
                    print("deleting %s" % job.filename)
                    os.remove(job.filename)

    # finally, do a commit here.
    if updatedAnyRecords:
        print("[%s] starting final commit" % time.ctime())
        if commitNonblocking:
            solrConnection.commitNonblocking()
        else:
            solrConnection.commit()
        print("[%s] done committing" % time.ctime())
    return count
# Example #4
# 0
        fOut.write(data)
        fOut.flush()
        fOut.close()
    else:
        fOutName = None
    return fOutName, deletedBibs


if __name__ == '__main__':
    # Run as a script: work through every PMS*.DAT file in HORIZON_BASE_DIR.
    pmsFiles = glob.glob("%s/PMS*.DAT" % HORIZON_BASE_DIR)
    for fileOn in pmsFiles:
        print("processing PMS file %s" % fileOn)
        processedFilenameOn, deletedBibsOn = processPMSFile(fileOn)

        if not processedFilenameOn:
            # No MARC output for this PMS file, so it is only cleaned up.
            print("no records to index")
            os.remove(fileOn)
            continue

        print("processing MARC file %s" % processedFilenameOn)
        indexerDriver.processFile(processedFilenameOn)

        # The MARC file has been indexed; remove it and its PMS source.
        print("deleting MARC file %s " % processedFilenameOn)
        os.remove(processedFilenameOn)
        print("deleting PMS file %s" % fileOn)
        os.remove(fileOn)

        # Then delete whichever bibs were reported as deleted, if any.
        if deletedBibsOn:
            print("processing deleted bibs from MARC file %s" % processedFilenameOn)
            for bibOn in deletedBibsOn:
                print("deleting bib %s" % bibOn)
                indexerDriver.deleteRecord(bibOn)