def findProjects(sCorpusPath, dConfig):
    """Walk sCorpusPath and enqueue every project-root directory into redis.

    A "project root" is any directory exactly 11 path separators deep; the
    walk is pruned so nothing below that depth is visited.  In debug mode
    only the first 10 roots are queued.

    Fix: the original logged the 'projects-root' debug line twice -- once
    guarded by dConfig['debug'] and once unconditionally.  Only the guarded
    call is kept.
    """

    qRedis = RedisQueue(dConfig['redis-queue-project-paths'], namespace='queue', host=dConfig['redis-loc'], port=dConfig['redis-port'])

    iCount = 0

    for sRoot, lDirs, lFiles in os.walk(sCorpusPath):

        iLevel = sRoot.count(os.sep)

        if iLevel >= 11:

            # prune: stop descending below the project-root depth
            del lDirs[:]

        if iLevel == 11:

            if dConfig['debug']: debug('func: findProjects()', 'projects-root:', sRoot, iLevel)

            qRedis.put(sRoot)

            iCount += 1

            # debug runs only queue a small sample
            if dConfig['debug'] and iCount >= 10: break

    printMsg('func: findProjects()', str(iCount), 'projects loaded into queue for processing')
def writeBuildSummaries(dConfig):
    """Drain the redis json queue and write each project's build summary to
    <build-dir>/build.json.

    The summary's corpus 'sourcePath' is mapped onto the build output tree
    '/data/builder_SAN/outputCyber' depending on which corpus half
    ('_8tof' or '_0to7') the project came from.

    Fixes: (1) a sourcePath matching neither corpus half previously raised
    NameError (or silently reused the previous iteration's sBuildPath) --
    such entries are now logged and skipped; (2) the shell
    'mkdir -p <path>' call was replaced with os.makedirs, removing a shell
    metacharacter injection risk.
    """

    qRedis = RedisQueue(dConfig['redis-queue-json'],
                        namespace='queue',
                        host=dConfig['redis-loc'],
                        port=dConfig['redis-port'])

    while 1:

        # get next project summary to process; falsy after 30s means drained
        sProjectSummary = qRedis.get(block=True, timeout=30)

        if sProjectSummary:

            dProjectSummary = json.loads(sProjectSummary)

            # map the corpus source path onto the build output tree; exactly
            # one of the two corpus halves is expected to match
            sBuildPath = None
            if "_8tof" in dProjectSummary['sourcePath']:
                sBuildPath = os.path.relpath(dProjectSummary['sourcePath'],
                                             '/data/corpus_8tof')
                sBuildPath = os.path.join('/data/builder_SAN/outputCyber',
                                          sBuildPath)
            if "_0to7" in dProjectSummary['sourcePath']:
                sBuildPath = os.path.relpath(dProjectSummary['sourcePath'],
                                             '/data/corpus_0to7')
                sBuildPath = os.path.join('/data/builder_SAN/outputCyber',
                                          sBuildPath)

            if sBuildPath is None:
                # unknown corpus location -- skip rather than crash
                warning('func: writeBuildSummaries() sourcePath not in a known corpus:',
                        dProjectSummary['sourcePath'])
                continue

            # drop the final path component; summaries go in its parent dir
            (sBuildPath, _) = os.path.split(sBuildPath)

            # ensure build directory exists
            if dConfig['debug']:
                debug('func: writeBuildSummaries() mkdir:', sBuildPath)

            if not os.path.isdir(sBuildPath):
                os.makedirs(sBuildPath)

            sJsonPath = os.path.join(sBuildPath, 'build.json')
            if dConfig['debug']:
                debug('func: writeBuildSummaries() sJsonPath:', sJsonPath)

            with open(sJsonPath, 'w') as fJson:

                fJson.write(json.dumps(dProjectSummary, indent=4))

        else:

            break
def initProjects(dConfig):
    """Reset the redis project-path queue; leave the mysql projects table
    intact (it is being preserved while the second half of the corpus is
    added -- see the disabled flush below)."""

    # the traversal queue is rebuilt on every run, so always flush it
    RedisQueue(dConfig['redis-queue-project-paths'],
               namespace='queue',
               host=dConfig['redis-loc'],
               port=dConfig['redis-port']).flush()

    dMp = MuseProjectDB(db=dConfig['mysql-db'],
                        port=dConfig['mysql-port'],
                        user=dConfig['mysql-user'],
                        passwd=dConfig['mysql-passwd'],
                        loc=dConfig['mysql-loc'])
    dMp.open()

    # projects-table flush intentionally disabled:
    #   dMp.flush(sTable='projects', bDebug=dConfig['debug'])
    # preserving existing rows while the remaining corpus is crawled in.

    dMp.close()
def initBuildQueues(dConfig):
    """Purge both build-related redis queues: the in-progress ('building')
    queue and the pending ('to-build') queue."""

    for sQueueName in (dConfig['redis-queue-building'],
                       dConfig['redis-queue-to-build']):
        RedisQueue(sQueueName,
                   namespace='queue',
                   host=dConfig['redis-loc'],
                   port=dConfig['redis-port']).flush()
# Beispiel #5 (scrape artifact: example separator and vote count -- not code)
# 0
def processProjects(dConfig):
    """Worker: pull project paths off the redis queue and index each
    project's files into Elasticsearch until the queue stays empty for 30
    seconds."""

    qRedis = RedisQueue(dConfig['redis-queue-name'],
                        namespace='queue',
                        host=dConfig['redis-loc'],
                        port=dConfig['redis-port'])
    oES = Elasticsearch(dConfig['es-instance-locs'])

    while True:

        # block up to 30s waiting for the next project; falsy means drained
        sProjectPath = qRedis.get(block=True, timeout=30)

        if not sProjectPath:
            break

        findProjectFiles((sProjectPath, oES, dConfig))
def processProjects(dConfig):
    """Worker: pull project paths off the redis queue and fix their
    permissions until the queue stays empty for 30 seconds."""

    qRedis = RedisQueue(dConfig['redis-queue-name'],
                        namespace='queue',
                        host=dConfig['redis-loc'])

    while True:

        # block up to 30s for the next project path; falsy means drained
        sProjectPath = qRedis.get(block=True, timeout=30)

        if not sProjectPath:
            break

        changePerms((sProjectPath, dConfig))
def initTargets(dConfig):
    """Flush the redis source-targets queue; the mysql target tables are
    deliberately left untouched (their truncation is disabled below)."""

    # source targets queue is rebuilt each run
    RedisQueue(dConfig['redis-queue-source-targets'],
               namespace='queue',
               host=dConfig['redis-loc'],
               port=dConfig['redis-port']).flush()

    # build-targets queue purge disabled -- may split mysql ingestion from
    # elasticsearch queries later; mysql may benefit from a consumer pool
    # inserting statements concurrently:
    #   RedisQueue(dConfig['redis-queue-build-targets'], namespace='queue',
    #              host=dConfig['redis-loc'], port=dConfig['redis-port']).flush()

    dMp = MuseProjectDB(db=dConfig['mysql-db'],
                        port=dConfig['mysql-port'],
                        user=dConfig['mysql-user'],
                        passwd=dConfig['mysql-passwd'],
                        loc=dConfig['mysql-loc'])
    dMp.open()

    # sourceTargets / buildTargets truncation intentionally disabled:
    #   dMp.flush(sTable='sourceTargets', bDebug=dConfig['debug'])
    #   dMp.flush(sTable='buildTargets', bDebug=dConfig['debug'])

    dMp.close()
def findProjects(sCorpusPath, iForks, dConfig):
    """Collect project-root directories (exactly 11 separators deep) under
    sCorpusPath.

    In redis mode the roots are pushed onto the work queue (flushed first)
    and an empty list is returned; otherwise the roots are returned as a
    list.  In debug mode only the first root is collected.

    NOTE(review): iForks is accepted but never used here -- confirm whether
    callers rely on the 3-argument signature only.
    """

    lProjectPaths = []

    qRedis = None
    if dConfig['redis']:
        qRedis = RedisQueue(dConfig['redis-queue-name'],
                            namespace='queue',
                            host=dConfig['redis-loc'])
        # consumers must start from an empty queue
        qRedis.flush()

    iCount = 0

    for sRoot, lDirs, lFiles in os.walk(sCorpusPath):

        iLevel = sRoot.count(os.sep)

        if iLevel >= 11:
            # prune the walk below project-root depth
            lDirs[:] = []

        if iLevel != 11:
            continue

        if dConfig['debug']:
            debug('func: findProjects()', 'projects-root:', sRoot, iLevel)

        if qRedis is not None:
            qRedis.put(sRoot)
        else:
            lProjectPaths.append(sRoot)

        iCount += 1

        # debug runs stop after the first project
        if dConfig['debug'] and iCount >= 1:
            break

    printMsg('func: findProjects()', str(iCount),
             'projects loaded into queue for processing')

    return lProjectPaths
# Beispiel #9 (scrape artifact: example separator and vote count -- not code)
# 0
def main(argv):
    """Entry point: crawl a corpus for project roots and index their files
    into Elasticsearch, either serially or via a redis-fed producer plus a
    multiprocessing consumer pool.

    Command-line options (note: read from sys.argv directly; the argv
    parameter is unused):
        -c / --corpus-dir-path  root directory of the corpus to crawl
        -f / --forks            number of consumer processes (int)
        -r / --redis            enable redis producer/consumer mode
        -d / --debug            enable debug output
    """

    # defaults
    sCorpusPath = '/data/builder_SAN2/RAT'
    #    sCorpusPath = '/data/corpus_0to7'
    #    sCorpusPath = '/data/corpus_8tof'

    # runtime configuration shared by producer and consumers
    dConfig = {}
    dConfig['es-bulk-chunk-size'] = 500
    dConfig['debug'] = False
    # binding to muse2 doesn't work right now
    dConfig['es-instance-locs'] = ['muse1-int', 'muse2-int', 'muse3-int']
    #dConfig['es-instance-locs'] = ['muse2-int','muse3-int']
    #dConfig['es-instance-locs'] = ['muse3-int']
    dConfig['es-index-name'] = 'rat-corpus-source'
    dConfig['es-index-type'] = 'files'
    dConfig['redis-queue-name'] = 'rat-project-paths'
    dConfig['redis-loc'] = 'muse2-int'
    dConfig['redis-port'] = '12345'
    dConfig['redis'] = False

    # run timestamp recorded alongside indexed documents
    dConfig['time-stamp'] = datetime.datetime.now().strftime(
        '%Y-%m-%d %H:%M:%S')

    iForks = 5
    bError = False

    ### command line argument handling
    options, remainder = getopt.getopt(
        sys.argv[1:], 'c:f:rd',
        ['corpus-dir-path=', 'forks=', 'redis', 'debug'])

    # debug('func: main()', 'options:', options)
    # debug('func: main()', 'remainder:', remainder)

    for opt, arg in options:

        if opt in ('-c', '--corpus-dir-path'):

            sCorpusPath = arg

        elif opt in ('-d', '--debug'):

            dConfig['debug'] = True

        elif opt in ('-r', '--redis'):

            dConfig['redis'] = True

        elif opt in ('-f', '--forks'):

            try:

                iForks = int(arg)

            except ValueError as e:

                # non-integer fork count is a usage error
                bError = True

    # corpus path must be an existing directory
    if not os.path.isdir(sCorpusPath):

        bError = True

    if bError: usage()
    else:

        iStart = time.time()

        #oES = createESIndex(dConfig)
        oES = Elasticsearch(dConfig['es-instance-locs'])

        ### setup producer

        lProjectPaths = []

        if dConfig['redis']:

            qRedis = RedisQueue(dConfig['redis-queue-name'],
                                namespace='queue',
                                host=dConfig['redis-loc'],
                                port=dConfig['redis-port'])

            # ensure redis queue is empty prior to starting consumers
            # qRedis.flush()

            # call producer process that populates redis queue with project path roots
            # NOTE(review): the target is expected to accept
            # (qRedis, sCorpusPath, dConfig) -- confirm this matches the
            # findProjects definition actually in scope at runtime.
            pProducer = multiprocessing.Process(target=findProjects,
                                                args=(qRedis, sCorpusPath,
                                                      dConfig))
            pProducer.start()

        else:

            lProjectPaths = findProjects(None, sCorpusPath, dConfig)

        ### setup consumers
        lArgs = []

        # NOTE(review): iForks is hard-set to 1 here, overriding the -f
        # option parsed above -- confirm this is intentional.
        iForks = 1

        if dConfig['redis']:

            # create pool of workers
            oPool = multiprocessing.Pool(processes=iForks)

            for i in range(0, iForks):

                lArgs.append(dConfig)

            ### do work -- use pool of workers to descend into each project path recording/ingesting all file names
            oPool.map(processProjects, lArgs)
            pProducer.join()

            oPool.close()
            oPool.join()

        else:

            # serial fallback: process each discovered project in this process
            for sPath in lProjectPaths:

                findProjectFiles((sPath, oES, dConfig))

        if dConfig['debug']: debug('func: main()', "all processes completed")

        # es index was created with replication turned off for speed, turn on replicating shards
        turnReplicationOn(oES, dConfig)

        # refresh to make the documents available for search
        oES.indices.refresh(index=dConfig['es-index-name'])

        # and now we can count the documents
        printMsg('func: main()', 'number of documents in',
                 dConfig['es-index-name'], 'index: ',
                 oES.count(index=dConfig['es-index-name'])['count'])

        iEnd = time.time()

        printMsg('func: main()', 'execution time:', (iEnd - iStart), 'seconds')
def processBuildTargets(tTup):
    """Container build worker (intended to run inside a worker pool).

    tTup is (iContainerId, dArgs, dConfig).  Pops project build targets off
    the to-build redis queue (mirroring each onto the 'building' queue via
    getnpush), skips projects already present in the already-built redis
    set, then drives a full container build cycle per target: directory
    setup, source/script copy, build-plan creation, build start,
    poll-until-done, log and output collection, build-status indexing,
    artifact archiving, and container removal.

    Any exception is logged with a traceback and re-raised so the pool sees
    the worker failure.
    """

    try:

        (iContainerId, dArgs, dConfig) = tTup

        # dual queues -- primary for getting what project to build next, secondary to mark what is being built
        qRedis = RedisQueue(name=dConfig['redis-queue-to-build'],
                            name2=dConfig['redis-queue-building'],
                            namespace='queue',
                            host=dConfig['redis-loc'],
                            port=dConfig['redis-port'])

        # set of existing builds for this os container used to prune out projects already built with this container
        sExistingBuilds = RedisSet(name=dConfig['redis-already-built-nate'],
                                   namespace='set',
                                   host=dConfig['redis-loc'],
                                   port=dConfig['redis-port'])

        debug('func: processBuildTargets(), has ' + str(len(sExistingBuilds)) +
              ' built projects')
        iCtr = 0

        while 1:

            # pop from to-build and simultaneously record on the building queue
            sBuildTarget = qRedis.getnpush(block=True, timeout=30)
            #sBuildTarget = qRedis.peek()

            # debug(sBuildTarget)

            if sBuildTarget:

                if dConfig['debug']:
                    debug('func: processBuildTargets() sBuildTarget:',
                          sBuildTarget)

                # queue entries are JSON project descriptors
                dBuildTarget = json.loads(sBuildTarget)

                # initial setup
                #if 'projectName' not in dBuiltTarget: continue

                dArgs['projectName'] = dBuildTarget['projectName']

                # skip anything this container OS has already built
                if dArgs['projectName'] in sExistingBuilds:

                    warning('func: processBuildTargets() project:',
                            dArgs['projectName'],
                            ' already built... skipping...')
                    continue

                #sProjectPath = os.path.relpath(dBuildTarget['projectPath'], '/data/corpus')
                #sProjectPath = os.path.join('/nfsbuild/nfsbuild', sProjectPath)

                #dArgs['buildPath'] = sProjectPath
                dArgs['targets'] = dBuildTarget['targets']

                if dConfig['debug']:
                    debug('func: processBuildTargets() targets:',
                          json.dumps(dArgs['targets'], indent=4))

                # container name encodes image, OS, host and worker id
                dArgs['containerId'] = str(iContainerId)
                dArgs[
                    'containerName'] = dConfig['containerImage'] + '-' + dArgs[
                        'containerOS'] + '-' + dConfig['hostname'] + '_' + str(
                            iContainerId)

                # per-container working directory tree
                dArgs['dirs'] = {}
                dArgs['dirs']['root'] = os.path.join(dConfig['containerPath'],
                                                     dArgs['containerName'])

                for sDir in dArgs['containerDirs']:

                    dArgs['dirs'][sDir] = os.path.join(dArgs['dirs']['root'],
                                                       sDir)

                # /data/corpus on muse2 is mounted under /nfscorpus/nfscorpus on all 3 servers (via mount-bind on muse2 and NFS on muse1 and muse3)
                # NOTE(review): if projectPath contains neither '_8tof' nor
                # '_0to7', sProjectPath/sBuildPath are never assigned and the
                # code below raises NameError (or reuses a previous
                # iteration's values) -- confirm inputs always come from one
                # of the two corpus halves.
                debug('projectPath: ', dBuildTarget['projectPath'])
                if "_8tof" in dBuildTarget['projectPath']:
                    sProjectPath = os.path.relpath(dBuildTarget['projectPath'],
                                                   '/data/corpus_8tof')
                    sBuildPath = os.path.join('/data/builder_SAN/outputCyber',
                                              sProjectPath)
                    sProjectPath = os.path.join('/data/corpus_8tof',
                                                sProjectPath)
                if "_0to7" in dBuildTarget['projectPath']:
                    sProjectPath = os.path.relpath(dBuildTarget['projectPath'],
                                                   '/data/corpus_0to7')
                    sBuildPath = os.path.join('/data/builder_SAN/outputCyber',
                                              sProjectPath)
                    sProjectPath = os.path.join('/data/corpus_0to7',
                                                sProjectPath)
                debug('projectPathDone: ', sProjectPath)

                dArgs['buildPath'] = sBuildPath
                '''
                # determine code root in project directory
                sCodePath = dBuildTarget['buildTargetPath']
                if sCodePath.startswith('./'):
                    sCodePath = dBuildTarget['buildTargetPath'][2:]
                sCodeRoot = sCodePath[:sCodePath.index(os.sep)] if os.sep in sCodePath else sCodePath
                '''
                # project source is shipped as <uuid>_code.tgz, where <uuid>
                # is the last component of the project path
                plist = sProjectPath.split('/')
                uuid = plist[len(plist) - 1]
                tar = uuid + ("_code.tgz")
                debug('tarball: ', tar)

                dArgs['projectPath'] = os.path.join(sProjectPath, tar)

                # add code root to project path
                #                if dBuildTarget['codeDir']:
                #                    print('none')
                #dArgs['projectPath'] = os.path.join(sProjectPath, dBuildTarget['codeDir'])

                #                else:

                #                   warning('func: processBuildTargets() encountered project:', dBuildTarget['projectName'], ' with empty or NULL codeDir which is not supported. Project build skipped...')
                #                    continue

                # per-build output names carry a timestamp for uniqueness
                sTimeStamp = datetime.datetime.now().strftime('%Y%m%dT%H%M%S')
                dArgs['jsonName'] = 'build-' + sTimeStamp + '.json'
                dArgs['tarName'] = dArgs[
                    'projectName'] + '-' + sTimeStamp + '.tgz'
                dArgs['version'] = dBuildTarget['version']

                # setup container
                makeDirs(dArgs=dArgs, bDebug=dConfig['debug'])
                copySource(dArgs=dArgs, bDebug=dConfig['debug'])
                copyScripts(dArgs=dArgs, bDebug=dConfig['debug'])
                createBuildPlanScript(dArgs=dArgs, bDebug=dConfig['debug'])
                recordProjectName(dArgs=dArgs, bDebug=dConfig['debug'])
                startBuild(dArgs=dArgs, bDebug=dConfig['debug'])

                # sleep until build completes
                while pollBuild(dArgs=dArgs, bDebug=dConfig['debug']):

                    if dConfig['debug']:
                        debug(
                            'func: processBuildTargets() build not completed... sleeping'
                        )
                    time.sleep(10)

                # get container logs
                getBuildLogs(dArgs=dArgs, bDebug=dConfig['debug'])

                # get build output
                dBuffer = parseBuildOutput(dArgs=dArgs,
                                           bDebug=dConfig['debug'])

                # index build output
                postBuildStatusUpdates(dArgs=dArgs,
                                       dBuffer=dBuffer,
                                       dConfig=dConfig)

                # archive build artifacts
                tarUpContainerDirs(dArgs=dArgs, bDebug=dConfig['debug'])

                # remove container
                removeContainer(dArgs=dArgs, bDebug=dConfig['debug'])

                # remove project from "building" queue
                # qRedis.done(value=sBuildTarget)

                iCtr += 1

                # debug runs build a single project then exit
                if dConfig['debug'] and iCtr >= 1:

                    break

            else:

                break

        if dConfig['debug']:

            debug(
                'func: processBuildTargets() sBuildTarget is either empty or none, likely since the redis queue is empty'
            )
            debug('func: processBuildTargets() redis queue size:',
                  qRedis.size())
            debug('func: processBuildTargets() exiting...')

    except Exception as e:

        warning('Caught exception in worker thread:', iContainerId)
        traceback.print_exc()
        raise e
def processBuildTargets(tTup):
    """Container build worker (NFS-corpus variant).

    tTup is (iContainerId, dArgs, dConfig).  Pops build targets off the
    to-build redis queue (mirroring each onto the 'building' queue via
    getnpush) and drives a container build cycle per target: directory
    setup, source/script copy, build start, poll-until-done, output
    parsing, status indexing, artifact archiving, and container removal.
    The build type is derived from the target file's basename via
    dConfig['search-strings'].
    """

    (iContainerId, dArgs, dConfig) = tTup

    # dual queues -- primary for getting what project to build next, secondary to mark what is being built
    qRedis = RedisQueue(name=dConfig['redis-queue-to-build'],
                        name2=dConfig['redis-queue-building'],
                        namespace='queue',
                        host=dConfig['redis-loc'],
                        port=dConfig['redis-port'])

    iCtr = 0

    while 1:

        # pop from to-build and simultaneously record on the building queue
        sBuildTarget = qRedis.getnpush(block=True, timeout=30)
        #sBuildTarget = qRedis.peek()

        # debug(sBuildTarget)

        if sBuildTarget:

            if dConfig['debug']:
                debug('func: processBuildTargets() sBuildTarget:',
                      sBuildTarget)

            # queue entries are JSON project descriptors
            dBuildTarget = json.loads(sBuildTarget)

            # initial setup

            # build output goes under the /nfsbuild tree, mirroring the
            # project's position within /data/corpus
            sProjectPath = os.path.relpath(dBuildTarget['projectPath'],
                                           '/data/corpus')
            sProjectPath = os.path.join('/nfsbuild/nfsbuild', sProjectPath)

            dArgs['buildPath'] = sProjectPath
            dArgs['buildTargetPath'] = dBuildTarget['buildTargetPath']

            # build type (e.g. make/cmake/...) keyed by the target file name
            dArgs['buildType'] = dConfig['search-strings'][os.path.basename(
                dArgs['buildTargetPath'])]

            if dConfig['debug']:
                debug('func: processBuildTargets() dArgs[\'buildType\']:',
                      dArgs['buildType'])

            # container name encodes image, OS, build type, host and worker id
            dArgs['containerId'] = str(iContainerId)
            dArgs['containerName'] = dConfig['containerImage'] + '-' + dArgs[
                'containerOS'] + '-' + dArgs['buildType'] + '-' + dConfig[
                    'hostname'] + '_' + str(iContainerId)

            # per-container working directory tree
            dArgs['dirs'] = {}
            dArgs['dirs']['root'] = os.path.join(dConfig['containerPath'],
                                                 dArgs['containerName'])

            for sDir in dArgs['containerDirs']:

                dArgs['dirs'][sDir] = os.path.join(dArgs['dirs']['root'], sDir)

            dArgs['projectName'] = dBuildTarget['projectName']

            # /data/corpus on muse2 is mounted under /nfscorpus/nfscorpus on all 3 servers (via mount-bind on muse2 and NFS on muse1 and muse3)
            sProjectPath = os.path.relpath(dBuildTarget['projectPath'],
                                           '/data/corpus')
            sProjectPath = os.path.join('/nfscorpus/nfscorpus', sProjectPath)
            dArgs['projectPath'] = os.path.join(sProjectPath, 'latest')

            # archive name carries a timestamp for uniqueness
            sTimeStamp = datetime.datetime.now().strftime('%Y%m%dT%H%M%S')
            dArgs['tarName'] = dArgs['projectName'] + '-' + sTimeStamp + '.tgz'

            dArgs['version'] = dBuildTarget['version']

            # setup container
            makeDirs(dArgs=dArgs, bDebug=dConfig['debug'])
            copySource(dArgs=dArgs, bDebug=dConfig['debug'])
            copyScripts(dArgs=dArgs, bDebug=dConfig['debug'])
            recordProjectName(dArgs=dArgs, bDebug=dConfig['debug'])
            startBuild(dArgs=dArgs, bDebug=dConfig['debug'])

            # sleep until build completes
            while pollBuild(dArgs=dArgs, bDebug=dConfig['debug']):

                if dConfig['debug']:
                    debug(
                        'func: processBuildTargets() build not completed... sleeping'
                    )
                time.sleep(10)

            # get build output
            dBuffer = parseBuildOutput(dArgs=dArgs, bDebug=dConfig['debug'])

            # index build output
            postBuildStatusUpdates(dArgs=dArgs,
                                   dBuffer=dBuffer,
                                   dConfig=dConfig)

            # archive build artifacts
            tarUpContainerDirs(dArgs=dArgs, bDebug=dConfig['debug'])

            # remove container
            removeContainer(dArgs=dArgs, bDebug=dConfig['debug'])

            # remove project from "building" queue
            # qRedis.done(value=sBuildTarget)

            iCtr += 1

            # debug runs stop after 10 builds
            if dConfig['debug'] and iCtr >= 10:

                break

        else:

            break

    if dConfig['debug']:

        debug(
            'func: processBuildTargets() sBuildTarget is either empty or none, likely since the redis queue is empty'
        )
        debug('func: processBuildTargets() redis queue size:', qRedis.size())
        debug('func: processBuildTargets() exiting...')
def queueUpSourceTargets(dConfig):
    """Read source build targets from mysql, group consecutive rows by
    project (rows are ordered by projectName), and push one JSON project
    descriptor per project onto the redis to-build queue.

    Only runs when both dConfig['mysql'] and dConfig['redis'] are set.
    Targets sharing the same leading path within a project are de-duplicated.
    Table selection depends on dConfig['unBuiltProjectsOnly'] and
    dConfig['queueSite']; debug mode limits the query to 10 rows.
    """

    if dConfig['mysql'] and dConfig['redis']:

        dMp = MuseProjectDB(db=dConfig['mysql-db'],port=dConfig['mysql-port'],user=dConfig['mysql-user'],passwd=dConfig['mysql-passwd'],loc=dConfig['mysql-loc'])

        # setup to-build queue
        qRedis = RedisQueue(dConfig['redis-queue-to-build'], namespace='queue', host=dConfig['redis-loc'], port=dConfig['redis-port'])

        dMp.open()

        # get projects first to iterate through (makes it easier to build project specific dictionaries), limit if in debug mode
        iProjectCount = 0
        iTargetCount = 0
        iMultiTargets = 0

        sLimitClause = ''

        if dConfig['debug']: sLimitClause = '10'

        # leading paths already seen for the current project (de-dup)
        lLeadingPaths = []

        # descriptor for the project currently being accumulated
        dProject = {}

        # projectName -> codeDir, looked up once for all projects
        dCodeDirLookup = {}
        lProjectRows = dMp.select(sSelectClause='projectName,codeDir', sTable='availableProjects', bDebug=dConfig['debug'])
        for tProjectRow in lProjectRows:

            (sProjectName, sCodeDir) = tProjectRow
            dCodeDirLookup[sProjectName] = sCodeDir

        lTargetRows = []

        # pick the source table: un-built vs all, optionally filtered by site
        if dConfig['unBuiltProjectsOnly']:

            if dConfig['queueSite']:

                lTargetRows = dMp.select(sSelectClause='projectName,projectPath,buildTargetPath', sTable='unBuiltSourceTargetsWithSite', sWhereClause='site=\'' + dConfig['queueSite'] + '\'', sOrderByClause='projectName', sLimitClause=sLimitClause, bDebug=dConfig['debug'])

            else:

                lTargetRows = dMp.select(sSelectClause='projectName,projectPath,buildTargetPath', sTable='unBuiltSourceTargets', sOrderByClause='projectName', sLimitClause=sLimitClause, bDebug=dConfig['debug'])

        else:

            if dConfig['queueSite']:

                lTargetRows = dMp.select(sSelectClause='projectName,projectPath,buildTargetPath', sTable='availableSourceTargetsWithSite', sWhereClause='site=\'' + dConfig['queueSite'] + '\'', sOrderByClause='projectName', sLimitClause=sLimitClause, bDebug=dConfig['debug'])

            else:

                lTargetRows = dMp.select(sSelectClause='projectName,projectPath,buildTargetPath', sTable='availableSourceTargets', sOrderByClause='projectName', sLimitClause=sLimitClause, bDebug=dConfig['debug'])

        dMp.close()

        for tTargetRow in lTargetRows:

            dTarget = {}

            (sProjectName, sProjectPath, dTarget['buildTargetPath'], ) = tTargetRow

            (_, sFileExt) = os.path.splitext( os.path.basename(dTarget['buildTargetPath']) )

            if sFileExt:

                sFileExt = sFileExt.lower()

                # only extensions registered in dConfig['source-targets'] are buildable
                if sFileExt in dConfig['source-targets'].keys():

                    dTarget['buildType'] = dConfig['source-targets'][sFileExt]

                    (sLeadingPath, sTarget) = os.path.split(dTarget['buildTargetPath'])

                    # NATE remove leading tarball from path
                    sLeadingPath = re.sub(r'[a-zA-Z_0-9-_]*.tgz/', "", sLeadingPath)
                    dTarget['buildTargetPath'] = os.path.join(sLeadingPath, sTarget)

                    # NATE added to grab code directory from buildTargetPath
                    # NOTE(review): codedir2 keeps its previous value when the
                    # leading path has no '/' -- and is unbound if that happens
                    # on the first qualifying row; confirm inputs always have
                    # a multi-component path.
                    bPath=sLeadingPath.split('/')
                    if len(bPath) > 1 :
                        codedir2=bPath[0]

                    iTargetCount += 1

                    if 'projectName' in dProject :

                        if dProject['projectName'] != sProjectName:

                            # new project encountered, push old project onto queue
                            if dConfig['debug']: debug('func: queueUpSourceTargets() queuing project:', json.dumps(dProject, indent=4))
                            qRedis.put(json.dumps(dProject))
                            iProjectCount += 1
                            if len(lLeadingPaths) > 1:
                                iMultiTargets += 1

                            dProject = {
                                'projectName': sProjectName,
                                'projectPath': sProjectPath,
                                'version': dConfig['version'],
                                'targets': [ dTarget ],
                                'codeDir': codedir2
                                #'codeDir': dCodeDirLookup[sProjectName]
                            }

                            lLeadingPaths = [ sLeadingPath ]

                        else:

                            # same project: only add targets with a new leading path
                            if sLeadingPath not in lLeadingPaths:

                                dProject['targets'].append(dTarget)
                                lLeadingPaths.append(sLeadingPath)

                            else: 

                                iTargetCount += -1
                                if dConfig['debug']: debug('func: queueUpSourceTargets() already encountered path:',  sLeadingPath, 'not adding:', json.dumps(dTarget, indent=4))

                    else:

                        # very first project: codeDir comes from the lookup table
                        dProject = {
                            'projectName': sProjectName,
                            'projectPath': sProjectPath,
                            'version': dConfig['version'],
                            'targets': [ dTarget ],
                            'codeDir': dCodeDirLookup[sProjectName]
                        }

                        lLeadingPaths = [ sLeadingPath ]

                else:

                    warning('func: queueUpSourceTargets() unknown C/C++ file extension encountered:', sFileExt, 'file-path:',dTarget['buildTargetPath'],'for project:', sProjectName)

            else:

                warning('func: queueUpSourceTargets() missing file extension encountered file-path:') #,dTarget['buildTargetPath'],'for project:', sProjectName)


        # flush the final accumulated project
        # NOTE(review): if no rows qualified, dProject is still {} and an
        # empty descriptor is queued -- confirm consumers tolerate this.
        if dConfig['debug']: debug('func: queueUpSourceTargets() queuing project:', json.dumps(dProject, indent=4))

        qRedis.put(json.dumps(dProject))
        iProjectCount += 1        
        if len(lLeadingPaths) > 1:
            iMultiTargets += 1

        printMsg('func: queueUpSourceTargets()', str(iProjectCount), 'projects queued', str(iTargetCount), 'targets queued', str(iMultiTargets), 'multi-target projects queued')
        printMsg('func: queueUpSourceTargets()', qRedis.size(), 'projects reported by redis')
def processProjects(dConfig):
    """Consume project root paths from redis, build one ES-style project
    document per root (enriched from the project's filter.json and
    index.json metadata), and bulk-insert the documents into the mysql
    projects table every dConfig['mysql-bulk-statement-size'] projects.

    Returns any project documents not yet flushed to mysql (an empty list
    when dConfig['mysql'] is set, since a final flush is performed then).

    Fix: the original logged the 'projects-root' debug line twice -- once
    unconditionally and once guarded by dConfig['debug'].  Only the guarded
    call is kept.
    """

    qRedis = RedisQueue(dConfig['redis-queue-project-paths'], namespace='queue', host=dConfig['redis-loc'], port=dConfig['redis-port'])

    dMp = MuseProjectDB(db=dConfig['mysql-db'],port=dConfig['mysql-port'],user=dConfig['mysql-user'],passwd=dConfig['mysql-passwd'],loc=dConfig['mysql-loc'])
    dMp.open()

    lProjects = []

    iCount = 0

    while 1:

        sRoot = qRedis.get(block=True, timeout=30)

        if sRoot:

            # skeleton ES bulk-indexing document for this project
            dProject = {
                '_index': dConfig['es-project-index-name'],
                '_type': dConfig['es-project-index-type'],
                '_source': {}
            }

            # project id/name is the basename of its root directory
            dProject['_id'] = os.path.basename(sRoot)
            dProject['_source']['name'] = os.path.basename(sRoot)

            if dConfig['debug']: 

                debug('func: processProjects() projects-root:', sRoot) 
                debug('func: processProjects() projects _id and _source[name] :', dProject['_id']) 
                debug('func: processProjects() inserting project:', dProject['_source']['name'])

            # filter.json (optional) flags bytecode availability
            if os.path.isfile( os.path.join(sRoot, 'filter.json') ):

                with open( os.path.join(sRoot, 'filter.json') ) as fProjectFilter:

                    dProjectFilter = json.load(fProjectFilter)

                    if 'hasBytecode' in dProjectFilter and dProjectFilter['hasBytecode'].lower() != 'none':
                        dProject['_source']['bytecode_available'] = True

            # index.json (expected) supplies code dir, site and crawler metadata
            if os.path.isfile( os.path.join(sRoot, 'index.json') ):

                with open( os.path.join(sRoot, 'index.json') ) as fProjectIndex:

                    dProjectIndex = json.load(fProjectIndex)

                    if dConfig['debug']: debug('func: processProjects() dProjectIndex.keys():', json.dumps(dProjectIndex.keys(), indent=4) )

                    '''
                    if 'bytecode_available' in dProjectIndex and dProjectIndex['bytecode_available']:

                        dProject['_source']['bytecode_available'] = True
                    '''
                    if 'code' in dProjectIndex:

                        dProject['_source']['source'] = True
                        dProject['_source']['codeDir'] = dProjectIndex['code']

                        # normalize './foo' -> 'foo'
                        if dProject['_source']['codeDir'].startswith('./'):

                            dProject['_source']['codeDir'] = dProject['_source']['codeDir'][len('./'):]

                    if 'site' in dProjectIndex:

                        dProject['_source']['site'] = dProjectIndex['site']

                    if 'crawler_metadata' in dProjectIndex:

                        # pull per-language line counts from languages.json
                        for sMetaDataFile in dProjectIndex['crawler_metadata']:

                            if 'languages.json' in sMetaDataFile:

                                sLanguageFile = os.path.join(sRoot, sMetaDataFile)

                                if os.path.isfile(sLanguageFile):

                                    with open(sLanguageFile) as fLanguageFile:

                                        dLanguageFile = json.load(fLanguageFile)

                                        if 'C' in dLanguageFile: 

                                            dProject['_source']['c'] = dLanguageFile['C']

                                        if 'C++' in dLanguageFile: 

                                            dProject['_source']['cpp'] = dLanguageFile['C++']

                                        if 'C#' in dLanguageFile:

                                            dProject['_source']['csharp'] = dLanguageFile['C#']

                                        if 'Java' in dLanguageFile: 

                                            dProject['_source']['java'] = dLanguageFile['Java']

                                        if dConfig['debug']: debug('func: findProjects() dLanguageFile:', json.dumps(dLanguageFile, indent=4) )
                                else:

                                    warning('func: processProjects()', 'languages.json file listed in index.json but does not exist for project:', dProject['_source']['name'], 'at listed location:', sLanguageFile)

            else:

                warning('func: processProjects()', 'index.json not found for project:', dProject['_source']['name'])

            lProjects.append(dProject)

            iCount += 1

            # flush a full batch to mysql
            # NOTE(review): batched inserts happen regardless of
            # dConfig['mysql']; only the final flush is guarded -- confirm.
            if (iCount % dConfig['mysql-bulk-statement-size']) == 0: 

                dMp.insertIntoProjects(lProjects=lProjects, bDebug=dConfig['debug'])
                lProjects = []

            if dConfig['debug'] and iCount >= 100: break

        else:

            break

    if dConfig['mysql']:

        if len(lProjects) > 0:

            dMp.insertIntoProjects(lProjects=lProjects, bDebug=dConfig['debug'])
            lProjects = []

        dMp.close()

    return lProjects
def findSourceTargets(dConfig):
    """Queue one elasticsearch query per C project lacking build targets.

    Pulls the project list from the cProjectsWithNoBuildTargets mysql view,
    builds a bool query per project matching C/C++ source files (by
    extension) under the latest/ or content/ paths, and pushes each query,
    JSON-encoded, onto the source-targets redis queue for downstream
    workers (indexSourceTargets) to execute.
    """

    # mysql is only needed to fetch the project list -- close it right after
    dMp = MuseProjectDB(db=dConfig['mysql-db'],
                        port=dConfig['mysql-port'],
                        user=dConfig['mysql-user'],
                        passwd=dConfig['mysql-passwd'],
                        loc=dConfig['mysql-loc'])
    dMp.open()

    # queue consumed by the source-target indexer
    qRedis = RedisQueue(dConfig['redis-queue-source-targets'],
                        namespace='queue',
                        host=dConfig['redis-loc'],
                        port=dConfig['redis-port'])

    lProjectRows = dMp.select(sSelectClause='projectName',
                              sTable='cProjectsWithNoBuildTargets',
                              bDebug=dConfig['debug'])

    dMp.close()

    debug('func: findSourceTargets() # of c projects without build targets:', len(lProjectRows) )

    iQueued = 0

    for tProjectRow in lProjectRows:

        iQueued += 1

        # debug runs only queue the first ten projects
        if dConfig['debug'] and iQueued > 10: break

        (sProjectName, ) = tProjectRow

        # match any of the recognized C/C++ source extensions ...
        lExtensionTerms = [{"term": {"ext.raw": sExt}}
                           for sExt in ("c", "cpp", "cxx", "c++", "cc")]
        # ... restricted to files under the latest/ or content/ trees
        lPathClauses = [{"match": {"path": "latest/*"}},
                        {"match": {"path": "content/*"}}]

        dQuery = {
            "query": {
                "bool": {
                    "must": [
                        {"bool": {"should": lExtensionTerms}},
                        {"bool": {"should": lPathClauses}},
                        {"term": {"project-name.raw": sProjectName}}
                    ]
                }
            }
        }

        qRedis.put( json.dumps(dQuery) )
def main(argv):
    """Entry point: produce per-project build summaries and write them out.

    Parses -f/--forks and -d/--debug, flushes the json redis queue, then
    runs the producer (createBuildSummaries) followed by the consumer
    (writeBuildSummaries) in a single process.
    """

    bError = False

    # runtime configuration shared by producer and consumer
    dConfig = {
        'debug': False,

        'forks': 5,

        'mysql-db': 'muse',
        'mysql-user': '******',
        'mysql-passwd': 'muse',
        'mysql-loc': 'muse2-int',
        'mysql-port': 54321,
        'mysql': True,

        'redis-queue-json': 'muse-json',
        'redis-set': 'muse-projects',
        'redis-loc': 'muse2-int',
        # 'redis-port': '6379',
        'redis-port': '12345',
        'redis': True,
    }

    ### command line argument handling
    options, remainder = getopt.getopt(sys.argv[1:], 'f:d',
                                       ['forks=', 'debug'])

    for sOpt, sArg in options:

        if sOpt in ('-f', '--forks'):

            try:
                dConfig['forks'] = int(sArg)
            except ValueError:
                # non-numeric fork count -> usage error
                bError = True

        elif sOpt in ('-d', '--debug'):

            dConfig['debug'] = True

    debug('func: main()', 'dConfig:', json.dumps(dConfig, indent=4))

    if bError:
        usage()
    else:

        iStart = time.time()

        # flush the producer queue before starting so stale entries never leak in
        qRedis = RedisQueue(dConfig['redis-queue-json'],
                            namespace='queue',
                            host=dConfig['redis-loc'],
                            port=dConfig['redis-port'])
        qRedis.flush()

        # single-process pipeline: produce all summaries, then consume them
        createBuildSummaries(dConfig)
        writeBuildSummaries(dConfig)

        if dConfig['debug']: debug('func: main()', "all processes completed")

        iEnd = time.time()

        printMsg('func: main()', 'execution time:', (iEnd - iStart), 'seconds')
def main(argv):
    """Entry point: requeue stale building-queue entries, then run a pool of
    container build workers.

    Options: -f/--forks N, -o/--os <ubuntu12|ubuntu14>, -d/--debug.
    (-q/--queue-projects is accepted by getopt but currently unused.)
    """

    bError = False

    # runtime configuration for the build workers
    dConfig = {
        'containerImage': 'musebuilder',
        'containerPath': '/data/builder',

        'debug': False,

        'elasticsearch': True,
        'es-instance-locs': ['muse1-int', 'muse2-int', 'muse3-int'],
        #'es-instance-locs': ['muse2-int','muse3-int'],
        #'es-instance-locs': ['muse3-int'],

        #'es-file-index-name': 'muse-corpus-source',
        'es-file-index-name': 'muse-corpus-build',
        'es-file-index-type': 'muse-project-build',

        'forks': 5,

        'hostname': socket.gethostname().replace('.', ''),

        'mysql-db': 'muse',
        'mysql-user': '******',
        'mysql-passwd': 'muse',
        'mysql-loc': 'muse2-int',
        'mysql-port': 54321,
        'mysql': True,

        'os': 'ubuntu14',

        'redis-queue-to-build': 'muse-to-build',
        'redis-queue-building': 'muse-building',
        'redis-loc': 'muse2-int',
        # 'redis-port': '6379',
        'redis-port': '12345',
        'redis': True,

        # build file name -> build-type column it signals
        'search-strings': {
            'configure': 'configureBuildType',
            'configure.ac': 'configureacBuildType',
            'configure.in': 'configureinBuildType',
            'CMakeLists.txt': 'cmakeBuildType',
            'Makefile': 'makefileBuildType'
            #'build.xml' : 'antBuildType',
            #'pom.xml' : 'mavenBuildType'
        },
    }

    # per-worker arguments: build scripts and container layout
    sScriptRoot = '/managed/scripts'

    dArgs = {}
    dArgs['buildScripts'] = {
        'root': sScriptRoot,
        'loader': os.path.join(sScriptRoot, 'runBuild.sh'),
        'cmakeBuildType': os.path.join(sScriptRoot, 'cmake.sh'),
        'configureBuildType': os.path.join(sScriptRoot, 'configure.sh'),
        'configureacBuildType': os.path.join(sScriptRoot, 'configureac.sh'),
        'configureinBuildType': os.path.join(sScriptRoot, 'configurein.sh'),
        'makefileBuildType': os.path.join(sScriptRoot, 'make.sh'),
    }

    dArgs['containerDirs'] = ['buildArtifacts', 'output', 'scripts', 'source']
    dArgs['containerOS'] = 'ubuntu14'
    dArgs['containerPath'] = dConfig['containerPath']

    dArgs['imageName'] = dConfig['containerImage'] + '-' + dArgs['containerOS']

    dArgs['script-name'] = 'build.sh'

    lSupportedOSs = ['ubuntu12', 'ubuntu14']

    ### command line argument handling
    options, remainder = getopt.getopt(
        sys.argv[1:], 'f:q:o:d', ['forks=', 'queue-projects=', 'os=', 'debug'])

    for sOpt, sArg in options:

        if sOpt in ('-f', '--forks'):

            try:
                dConfig['forks'] = int(sArg)
            except ValueError:
                bError = True

        elif sOpt in ('-o', '--os'):

            if sArg in lSupportedOSs:
                dArgs['containerOS'] = sArg
                # image name tracks the selected container OS
                dArgs['imageName'] = dConfig['containerImage'] + '-' + dArgs['containerOS']
            else:
                bError = True

        elif sOpt in ('-d', '--debug'):

            dConfig['debug'] = True

    if bError:
        usage()
    else:

        # pre-initialization -- anything left in the building queue from a
        # prior (crashed) run goes back into the to-build queue
        qToBuildRedis = RedisQueue(name=dConfig['redis-queue-building'],
                                   name2=dConfig['redis-queue-to-build'],
                                   namespace='queue',
                                   host=dConfig['redis-loc'],
                                   port=dConfig['redis-port'])

        for _ in range(len(qToBuildRedis)):
            qToBuildRedis.getnpush()

        iStart = time.time()

        ### setup consumers

        # one shared mutex handed to every worker via the pool initializer
        lock = multiprocessing.Lock()

        # one argument tuple per worker fork
        lConsumerArgs = [(iWorker, dArgs, dConfig)
                         for iWorker in range(dConfig['forks'])]

        oConsumerPool = multiprocessing.Pool(processes=dConfig['forks'],
                                             initializer=initialize_lock,
                                             initargs=(lock, ))

        ### do work -- each worker pulls build targets and builds them
        oConsumerPool.map(processBuildTargets, lConsumerArgs)

        oConsumerPool.close()
        oConsumerPool.join()

        if dConfig['debug']: debug('func: main()', "all processes completed")

        iEnd = time.time()

        printMsg('func: main()', 'execution time:', (iEnd - iStart), 'seconds')
def _makeBuildEntry(dTarget):
    # One summary dict per build tarball; 'targets' collects per-target results.
    return {
        'buildTarPath': dTarget['buildTarPath'],
        'buildTime': dTarget['buildTime'],
        'version': dTarget['version'],
        'os': dTarget['os'],
        'numObjectsPreBuild': dTarget['numObjectsPreBuild'],
        'numObjectsPostBuild': dTarget['numObjectsPostBuild'],
        'numObjectsGenerated': dTarget['numObjectsGenerated'],
        'numSources': dTarget['numSources'],
        'targets': []
    }


def _makeTargetSummary(dTarget, lTargetTypes):
    # Per-target summary; tag the first build-target type flagged in the row.
    dTargetSummary = {
        'buildTargetPath': dTarget['buildTargetPath'],
        'returnCode': dTarget['returnCode']
    }

    for sTargetType in lTargetTypes:

        if dTarget[sTargetType] == 1:

            dTargetSummary['target-type'] = sTargetType
            break

    return dTargetSummary


def createBuildSummaries(dConfig):
    """Aggregate per-target build rows into per-project JSON summaries.

    Reads the buildStatusWithTargets mysql view (ordered by project, then
    build tar path), groups the rows into one summary per project -- each
    summary holding its builds and their targets -- and pushes every summary
    onto the json redis queue for writeBuildSummaries() to consume.  Also
    refreshes the success/partial/fail redis project sets used to label each
    summary's overall buildStatus.
    """

    qRedis = RedisQueue(dConfig['redis-queue-json'],
                        namespace='queue',
                        host=dConfig['redis-loc'],
                        port=dConfig['redis-port'])

    dMp = MuseProjectDB(db=dConfig['mysql-db'],
                        port=dConfig['mysql-port'],
                        user=dConfig['mysql-user'],
                        passwd=dConfig['mysql-passwd'],
                        loc=dConfig['mysql-loc'])

    # in debug mode only look at a handful of rows
    sLimitClause = ''
    if dConfig['debug']: sLimitClause = '10'

    # mysql status table -> project bin label
    dReturnCodeLookup = {
        'buildSuccess': 'success',
        'buildPartial': 'partial',
        'buildFail': 'fail'
    }

    sSelectClause = 'projectName,projectPath,buildTarPath,buildTime,version,os,numObjectsPreBuild,numObjectsPostBuild,numObjectsGenerated,numSources,buildTargetPath,configureBuildType,configureacBuildType,configureinBuildType,cmakeBuildType,makefileBuildType,antBuildType,mavenBuildType,returnCode'

    lTargetTypes = [
        'configureBuildType', 'configureacBuildType', 'configureinBuildType',
        'cmakeBuildType', 'makefileBuildType', 'antBuildType', 'mavenBuildType'
    ]

    dMp.open()

    iProjectCount = 0

    dProjects = {
        'success':
        RedisSet(dConfig['redis-set'] + '-success',
                 namespace='set',
                 host=dConfig['redis-loc'],
                 port=dConfig['redis-port']),
        'partial':
        RedisSet(dConfig['redis-set'] + '-partial',
                 namespace='set',
                 host=dConfig['redis-loc'],
                 port=dConfig['redis-port']),
        'fail':
        RedisSet(dConfig['redis-set'] + '-fail',
                 namespace='set',
                 host=dConfig['redis-loc'],
                 port=dConfig['redis-port'])
    }

    for sTable, sProjectBin in dReturnCodeLookup.iteritems():

        # rebuild each redis set from its mysql status table
        dProjects[sProjectBin].flush()

        lProjects = dMp.select(sSelectClause='projectName',
                               sTable=sTable,
                               sOrderByClause='projectName',
                               sLimitClause=sLimitClause,
                               bDebug=dConfig['debug'])

        # populate redis set with projects of each bin type
        for tProject in lProjects:

            (sProjectName, ) = tProject

            dProjects[sProjectBin].put(sProjectName)

    dProjectSummary = {}

    lTargetRows = dMp.select(sSelectClause=sSelectClause,
                             sTable='buildStatusWithTargets',
                             sOrderByClause='projectName,buildTarPath',
                             sLimitClause=sLimitClause,
                             bDebug=dConfig['debug'])

    for tTargetRow in lTargetRows:

        dTarget = {}

        (dTarget['projectName'], dTarget['projectPath'],
         dTarget['buildTarPath'], dTarget['buildTime'], dTarget['version'],
         dTarget['os'], dTarget['numObjectsPreBuild'],
         dTarget['numObjectsPostBuild'], dTarget['numObjectsGenerated'],
         dTarget['numSources'], dTarget['buildTargetPath'],
         dTarget['configureBuildType'], dTarget['configureacBuildType'],
         dTarget['configureinBuildType'], dTarget['cmakeBuildType'],
         dTarget['makefileBuildType'], dTarget['antBuildType'],
         dTarget['mavenBuildType'], dTarget['returnCode']) = tTargetRow

        if dProjectSummary:

            if dProjectSummary['projectName'] == dTarget['projectName']:

                # same project: find (or create) the build entry for this tar path
                try:

                    # bug fix: the original evaluated the generator but never
                    # assigned its result -- generator expressions do not leak
                    # their loop variable, so dBuild silently kept a stale
                    # binding and only worked because rows arrive sorted by
                    # buildTarPath.  Bind the found entry explicitly.
                    dBuild = next(dBuild for dBuild in dProjectSummary['builds']
                                  if dBuild['buildTarPath'] == dTarget['buildTarPath'])

                except StopIteration:

                    dBuild = _makeBuildEntry(dTarget)
                    dProjectSummary['builds'].append(dBuild)

                dBuild['targets'].append(_makeTargetSummary(dTarget, lTargetTypes))

            else:

                # row belongs to a new project: ship the finished summary
                if dConfig['debug']:
                    debug('func: createBuildSummaries() dProjectSummary:',
                          json.dumps(dProjectSummary, indent=4))
                qRedis.put(json.dumps(dProjectSummary))
                iProjectCount += 1
                dProjectSummary = {}

        if not dProjectSummary:

            # start a fresh project summary from this row
            dBuild = _makeBuildEntry(dTarget)

            dProjectSummary = {
                'projectName': dTarget['projectName'],
                'sourcePath': dTarget['projectPath'],
                'builds': [dBuild]
            }

            # overall build status comes from the redis bin sets
            if dTarget['projectName'] in dProjects['success']:
                dProjectSummary['buildStatus'] = 'success'
            elif dTarget['projectName'] in dProjects['partial']:
                dProjectSummary['buildStatus'] = 'partial'
            elif dTarget['projectName'] in dProjects['fail']:
                dProjectSummary['buildStatus'] = 'fail'

            dBuild['targets'].append(_makeTargetSummary(dTarget, lTargetTypes))

    # ship the final in-flight project summary, if any
    if dProjectSummary:

        if dConfig['debug']:
            debug('func: createBuildSummaries() dProjectSummary:',
                  json.dumps(dProjectSummary, indent=4))
        qRedis.put(json.dumps(dProjectSummary))
        iProjectCount += 1

        dProjectSummary = {}

    dMp.close()

    printMsg('func: createBuildSummaries()', str(iProjectCount),
             'projects queued')
def indexSourceTargets(dConfig):
    """Drain elasticsearch queries from the source-targets queue and record hits.

    Each queue entry is a JSON-encoded elasticsearch query (produced by
    findSourceTargets()).  Matching documents are scrolled in batches and
    translated into source-target rows inserted into mysql.  The worker
    exits after the redis queue has been empty for 30 seconds.
    """

    # setup mysql client
    dMp = MuseProjectDB(db=dConfig['mysql-db'],port=dConfig['mysql-port'],user=dConfig['mysql-user'],passwd=dConfig['mysql-passwd'],loc=dConfig['mysql-loc'])
    dMp.open()

    # setup elasticsearch client
    oES = Elasticsearch(dConfig['es-instance-locs'])

    # setup source targets queue
    qRedis = RedisQueue(dConfig['redis-queue-source-targets'], namespace='queue', host=dConfig['redis-loc'], port=dConfig['redis-port'])

    # the set of known source types is loop-invariant -- fetch it once
    lSourceTypes = dMp.getSourceTypes()

    while 1:

        # block up to 30s for the next query; an empty result ends the worker
        sQuery = qRedis.get(block=True, timeout=30)

        if sQuery:

            dQuery = json.loads(sQuery)

            if dConfig['debug']: debug( 'func: indexSourceTargets() dQuery:', json.dumps(dQuery) ) 

            lSourceFiles = []

            # scroll time set to 20 minutes, change as needed -- required for consistent results, the scroll token expires at the end of scroll time
            dResponse = oES.search(index=dConfig['es-file-index-name'], doc_type=dConfig['es-file-index-type'], body=json.dumps(dQuery), search_type='scan', scroll='20m', timeout='20m', lowercase_expanded_terms=False)
            sScrollId = dResponse['_scroll_id']

            if dConfig['debug']: debug('func: indexSourceTargets() (after initial search) dResponse: ', dResponse)

            if dConfig['debug']: debug('func: indexSourceTargets() search hits: ', dResponse['hits']['total'])

            # keep scrolling while the last response was healthy and reported matches
            while 'timed_out' in dResponse and not dResponse['timed_out'] and 'hits' in dResponse and 'total' in dResponse['hits'] and dResponse['hits']['total'] > 0:

                dResponse = oES.scroll(scroll_id=sScrollId, scroll='20m')

                sScrollId = dResponse['_scroll_id']

                if ('hits' in dResponse['hits']) and (len(dResponse['hits']['hits']) > 0):

                    if dConfig['debug']: debug('func: indexSourceTargets() scroll_id:', sScrollId, 'number of hits:', len(dResponse['hits']['hits']) )

                    for dHit in dResponse['hits']['hits']:

                        # found matches

                        try:

                            if '_source' in dHit:

                                # strip the leading path from the found build
                                # target, keeping only the file name
                                mBuildTarget = dHit['_source']['file']
                                dHit['_source']['file'] = mBuildTarget.split('/')[-1]

                                # one boolean flag per known source type, all initially off
                                dProjectFound = {}

                                for sSourceType in lSourceTypes:

                                    dProjectFound[sSourceType] = False

                                if 'file' in dHit['_source'] and dHit['_source']['file']:

                                    (sFileName, sFileExt) = os.path.splitext(dHit['_source']['file']) 

                                    # flag the source type mapped to this extension
                                    if sFileExt.lower() in dConfig['source-targets'].keys():

                                        dProjectFound[ dConfig['source-targets'][ sFileExt.lower() ] ] = True

                                else: 

                                    warning( 'func indexSourceTargets() es returned an improper source target:', json.dumps(dHit['_source']) )
                                    continue

                                if 'project-name' in dHit['_source'] and dHit['_source']['project-name']: dProjectFound['projectName'] = dHit['_source']['project-name']
                                if 'project-path' in dHit['_source'] and dHit['_source']['project-path']: dProjectFound['projectPath'] = dHit['_source']['project-path']
                                if 'path' in dHit['_source'] and dHit['_source']['path']: 

                                    dProjectFound['buildTargetPath'] = verifyEncoding( dHit['_source']['path'] )

                                lSourceFiles.append(dProjectFound)

                                # flush in bulk-sized batches to keep statements bounded
                                if (len(lSourceFiles) > dConfig['mysql-bulk-statement-size']) and dConfig['mysql']:

                                    dMp.insertIntoSourceTargets(lTargets=lSourceFiles, bDebug=dConfig['debug'])
                                    # bug fix: the original printed 'iCtr', which is
                                    # undefined in this function (NameError on the
                                    # first bulk flush); report the batch size instead
                                    printMsg('func indexSourceTargets() loaded', len(lSourceFiles), 'source targets')

                                    lSourceFiles = []

                        except (UnicodeDecodeError, UnicodeEncodeError) as e:
                            
                            warning('func indexSourceTargets() encountered exception:', e)
                            warning('func indexSourceTargets() full _source payload: ', json.dumps( dHit['_source'], indent=4 ) )

                else:

                    break

                # flush any partial batch after each scroll page
                if (len(lSourceFiles) > 0) and dConfig['mysql']:

                    dMp.insertIntoSourceTargets(lTargets=lSourceFiles, bDebug=dConfig['debug'])
                        
                    lSourceFiles = []

        else:

            break

    dMp.close()