Code example #1
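# Note: this example is taken from a larger loader module; it assumes
# module-level imports of os, sys, and string, the PyOOSQL bindings, the
# bulkloading-mode constants (INITIAL_BULKLOADING, SMALL_APPEND_BULKLOADING,
# MEDIUM_APPEND_BULKLOADING, LARGE_APPEND_BULKLOADING), and helper functions
# such as ParseArgument and CheckIfSortedPostingExist defined elsewhere.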
def Loader(argv):
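    # Bulk-load objects from a data file into an OOSQL class, map the
    # extracted posting files to OIDs, and build text indexes for the
    # class's text attributes.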
    (databaseName, volumeName, temporaryDatabaseName, temporaryVolumeName,
     dataFileName, pagerankFileName, pagerankMode) = ParseArgument(argv)

    # mount database
    print "-" * 79
    print "Mount database '%s'" % (databaseName)
    oosqlSystem = PyOOSQL.OOSQL_System()
    databaseID = oosqlSystem.MountDB(databaseName)
    volumeID = oosqlSystem.GetVolumeID(databaseID, volumeName)
    temporaryVolumeID = oosqlSystem.MountVolumeByVolumeName(
        temporaryDatabaseName, temporaryVolumeName)

    # set bulk flush mode
    oosqlSystem.SetCfgParam("USE_BULKFLUSH", "TRUE")

    # begin transaction
    print "-" * 79
    print "Transaction begin"
    oosqlSystem.TransBegin(PyOOSQL.X_RR_RR)

    # get className and attribute list in the datafile
    f = open(dataFileName, "r")
    classLine = f.readline()
    f.close()
    # replace '(' and ')' with spaces
    classLine = string.join(string.split(classLine, '('), ' ')
    classLine = string.join(string.split(classLine, ')'), ' ')
    # retrieve className and attributesInDatafile
    classLineSplitted = string.split(classLine)
    className = classLineSplitted[1]
    attributesInDatafile = classLineSplitted[2:]

    # get the text attributes in the database schema that also appear in
    # the data file
    (className, classId, attributeInfos,
     indexInfos) = oosqlSystem.GetTableDescription(volumeID, className)
    attributes = []
    for attributeInfo in attributeInfos:
        if attributeInfo[2] == PyOOSQL.OOSQL_TYPE_TEXT:
            if attributeInfo[0] in attributesInDatafile:
                attributes.append(attributeInfo[0])

    # prepare temporary path
    env_ODYS_TEMP_PATH = os.environ["ODYS_TEMP_PATH"]

    if sys.platform == "win32":
        dirSeparator = "\\"
    else:
        dirSeparator = "/"

    # determine loadingMode: INITIAL_BULKLOADING, SMALL_APPEND_BULKLOADING,
    # MEDIUM_APPEND_BULKLOADING, or LARGE_APPEND_BULKLOADING
    numObjectsInDatabase = oosqlSystem.GetNumObjectsInClass(
        volumeID, className)
    numObjectsInFile = PyOOSQL.CountObjectsInLoadDbFile(dataFileName)
    print "Objects in the class '%s' is %d" % (className, numObjectsInDatabase)
    print "Objects in the file '%s' is %d" % (dataFileName, numObjectsInFile)
    if numObjectsInDatabase == 0:
        loadingMode = INITIAL_BULKLOADING
    elif numObjectsInFile < 2000:
        loadingMode = SMALL_APPEND_BULKLOADING
    elif numObjectsInDatabase * 0.1 > numObjectsInFile:
        loadingMode = MEDIUM_APPEND_BULKLOADING
    else:
        loadingMode = LARGE_APPEND_BULKLOADING
    print "Loading Mode :", loadingMode

    # loaddb
    print "-" * 79
    print "Load data from '%s'" % (dataFileName)
    isDeferredTextIndexMode = 1
    useBulkloading = 1
    useDescriptorUpdating = 1
    if loadingMode in (SMALL_APPEND_BULKLOADING, MEDIUM_APPEND_BULKLOADING):
        smallUpdateFlag = 1
    else:
        smallUpdateFlag = 0
    oosqlSystem.Tool_LoadDB(volumeID, temporaryVolumeID,
                            isDeferredTextIndexMode, smallUpdateFlag,
                            useBulkloading, useDescriptorUpdating,
                            dataFileName, pagerankFileName, pagerankMode)

    # map the extracted posting files to database OIDs
    env_ODYS_TEMP_PATH = os.environ["ODYS_TEMP_PATH"]
    if sys.platform == "win32":
        dirSeparator = "\\"
    else:
        dirSeparator = "/"

    for attrName in attributes:
        print "-" * 79
        print "Map posting for class '%s', attribute '%s'" % (className,
                                                              attrName)
        sortedPostingExist = CheckIfSortedPostingExist(dataFileName, className,
                                                       attrName)
        if not sortedPostingExist:
            postingFileName = "%s_TEXT_%s_%s_Posting" % (dataFileName,
                                                         className, attrName)
            postingFileName = string.join(
                string.split(postingFileName, dirSeparator), '_')

            oosqlSystem.Tool_MapPosting(
                volumeID, className, attrName, [postingFileName],
                "TEXT_%s_%s_Posting_Mapped" % (className, attrName),
                "TEXT_%s_OID" % (className), 0, pagerankFileName, pagerankMode)

            # rename the mapped posting file back to the expected posting name
            srcName = "%s%sTEXT_%s_%s_Posting_Mapped" % (
                env_ODYS_TEMP_PATH, dirSeparator, className, attrName)
            destName = "%s%sTEXT_%s_%s_Posting" % (
                env_ODYS_TEMP_PATH, dirSeparator, className, attrName)
            try:
                os.unlink(destName)
            except OSError:
                pass
            os.rename(srcName, destName)
        else:
            postingFileName = "%s_TEXT_%s_%s_SortedPosting" % (
                dataFileName, className, attrName)
            postingFileName = string.join(
                string.split(postingFileName, dirSeparator), '_')

            oosqlSystem.Tool_MapPosting(
                volumeID, className, attrName, [postingFileName],
                "TEXT_%s_%s_SortedPosting_Mapped" % (className, attrName),
                "TEXT_%s_OID" % (className), 0, pagerankFileName, pagerankMode)

            # rename the mapped posting file back to the expected posting name
            srcName = "%s%sTEXT_%s_%s_SortedPosting_Mapped" % (
                env_ODYS_TEMP_PATH, dirSeparator, className, attrName)
            destName = "%s%sTEXT_%s_%s_SortedPosting" % (
                env_ODYS_TEMP_PATH, dirSeparator, className, attrName)
            try:
                os.unlink(destName)
            except OSError:
                pass
            os.rename(srcName, destName)

    # build text index
    config = PyOOSQL.lom_Text_ConfigForInvertedIndexBuild()
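    # use lighter index bulkloading for small/medium appends and full
    # bulkloading (including keyword and reverse-keyword indexes) for
    # initial and large loads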
    if loadingMode == SMALL_APPEND_BULKLOADING:
        config.isUsingBulkLoading = 0
        config.isUsingKeywordIndexBulkLoading = 0
        config.isUsingReverseKeywordIndexBulkLoading = 0
    elif loadingMode == MEDIUM_APPEND_BULKLOADING:
        config.isUsingBulkLoading = 1
        config.isUsingKeywordIndexBulkLoading = 0
        config.isUsingReverseKeywordIndexBulkLoading = 0
    elif loadingMode in (INITIAL_BULKLOADING, LARGE_APPEND_BULKLOADING):
        config.isUsingBulkLoading = 1
        config.isUsingKeywordIndexBulkLoading = 1
        config.isUsingReverseKeywordIndexBulkLoading = 1
    config.isBuildingExternalReverseKeywordFile = 0
    config.isBuildingDocIdIndex = 1
    config.isSortingPostingFile = 1
    config.isUsingStoredPosting = 0

    for attrName in attributes:
        print "-" * 79
        print "Build text index for class '%s', attribute '%s'" % (className,
                                                                   attrName)
        sortedPostingExist = CheckIfSortedPostingExist(dataFileName, className,
                                                       attrName)
        if sortedPostingExist:
            config.isSortingPostingFile = 0
        else:
            config.isSortingPostingFile = 1
        oosqlSystem.Tool_BuildTextIndex(volumeID, temporaryVolumeID, className,
                                        attrName, config)

    # commit transaction
    print "-" * 79
    print "Transaction commit"
    oosqlSystem.TransCommit()

    # dismount database
    print "-" * 79
    print "Dismount database"
    oosqlSystem.Dismount(temporaryVolumeID)
    oosqlSystem.DismountDB(databaseID)

    # unlink posting file and sorted posting file
    for attrName in attributes:
        postingFileName = "TEXT_%s_%s_Posting" % (className, attrName)
        destName = "%s%s%s" % (env_ODYS_TEMP_PATH, dirSeparator,
                               postingFileName)
        try:
            os.unlink(destName)
        except OSError:
            pass
        postingFileName = "%s_TEXT_%s_%s_Posting" % (dataFileName, className,
                                                     attrName)
        postingFileName = string.join(
            string.split(postingFileName, dirSeparator), '_')
        destName = "%s%s%s" % (env_ODYS_TEMP_PATH, dirSeparator,
                               postingFileName)
        try:
            os.unlink(destName)
        except OSError:
            pass
        sortedPostingFileName = "TEXT_%s_%s_SortedPosting" % (className,
                                                              attrName)
        sortedPostingFileName = string.join(
            string.split(sortedPostingFileName, dirSeparator), '_')
        destName = "%s%s%s" % (env_ODYS_TEMP_PATH, dirSeparator,
                               sortedPostingFileName)
        try:
            os.unlink(destName)
        except OSError:
            pass

        sortedPostingFileName = "%s_TEXT_%s_%s_SortedPosting" % (
            dataFileName, className, attrName)
        sortedPostingFileName = string.join(
            string.split(sortedPostingFileName, dirSeparator), '_')
        destName = "%s%s%s" % (env_ODYS_TEMP_PATH, dirSeparator,
                               sortedPostingFileName)
        try:
            os.unlink(destName)
        except OSError:
            pass

    # unlink oid file
    oidFileName = "TEXT_%s_OID" % (className)
    destName = "%s%s%s" % (env_ODYS_TEMP_PATH, dirSeparator, oidFileName)
    try:
        os.unlink(destName)
    except OSError:
        pass
Code example #2
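# Note: as with the first example, this function assumes module-level imports
# of os, thread, and time, the PyOOSQL bindings, and helpers defined elsewhere
# (ParseArgument, MonitorProcess, LoaderMergePosting, and the shared counter
# nExitedProcesses).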
def LoaderParallelExtract(argv):
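    # Run keyword extraction in parallel: merge the input data files if
    # needed, split the objects into 'divideNumber' ranges, and fork one
    # extractor process per range, each watched by a monitoring thread.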
    (databaseName, volumeName, temporaryDatabaseName, temporaryVolumeName,
     dataFileNameList, mergedFileName, divideNumber,
     mergedPostingFlag) = ParseArgument(argv)

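    # if several input data files were given, merge them into a single file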
    if len(dataFileNameList) == 1:
        dataFileName = dataFileNameList[0]
    else:
        numSourceDataFiles = PyOOSQL.MergeDataInLoadDbFiles(
            dataFileNameList, mergedFileName)
        dataFileName = mergedFileName

    numObjectsInFile = PyOOSQL.CountObjectsInLoadDbFile(dataFileName)
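    # divide the objects evenly among the extractor processes; the last
    # process takes any remainder (endObjectNo of -1, set below)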
    numObjectsToExtract = numObjectsInFile / divideNumber

    extractorName = '_ExtractKeyword.py'

    startObjectNo = 0
    endObjectNo = numObjectsToExtract - 1

    for i in range(0, divideNumber):

        if i == divideNumber - 1:
            endObjectNo = -1

        arguments = []
        arguments.append(extractorName)  # program
        arguments.append(databaseName)  # database name
        arguments.append(volumeName)  # volume name
        arguments.append('-temporary')  # -temporary
        arguments.append(temporaryDatabaseName)  # temporary database name
        arguments.append(temporaryVolumeName)  # temporary volume name
        arguments.append(dataFileName)  # data file name
        arguments.append(str(startObjectNo))  # start object no
        arguments.append(str(endObjectNo))  # end object no
        arguments.append(str(i))  # process no

        pid = os.fork()
        if not pid:
            # child process execution part
            # execute keyword extractor
            os.execvp(extractorName, arguments)
        else:
            # parent process execution part
            # execute monitoring thread
            thread.start_new_thread(MonitorProcess, (
                i,
                pid,
                extractorName,
                arguments,
                divideNumber,
            ))

            # adjust start and end object no.
            startObjectNo = endObjectNo + 1
            endObjectNo = startObjectNo + numObjectsToExtract - 1

    # execute polling
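    # 'nExitedProcesses' is assumed to be a module-level counter that
    # MonitorProcess increments when its child process exits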
    while 1:
        # if all child processes have exited
        if nExitedProcesses == divideNumber:
            break
        else:
            time.sleep(1)

    # handle -storeMergedPosting argument
    if mergedPostingFlag:
        # merge sorted posting files which are extracted in parallel
        LoaderMergePosting(databaseName, volumeName, temporaryDatabaseName,
                           temporaryVolumeName, dataFileName)
        print "Parallel keyword extraction is done"
    else:
        # nothing to do
        print "Parallel keyword extraction is done"