Beispiel #1
0
def main(argv):
    """Entry point: build the run configuration, apply CLI flags, and time
    one findProjectsWrapper() pass.

    Flags: -d / --debug enables debug mode.
    """

    # base configuration; ES hosts and index identifiers are fixed here
    dConfig = {
        'debug': False,
        'es-instance-locs': ['muse1-int', 'muse2-int', 'muse3-int'],
        'es-project-index-name': 'corpuslite',
        'es-project-index-type': 'projects',
    }

    ### command line argument handling
    options, remainder = getopt.getopt(sys.argv[1:], 'd', ['debug'])

    for sOpt, _sArg in options:
        if sOpt in ('-d', '--debug'):
            dConfig['debug'] = True

    iBegin = time.time()

    findProjectsWrapper(dConfig)

    iFinish = time.time()

    printMsg('func: main()', 'execution time:', (iFinish - iBegin), 'seconds')
Beispiel #2
0
    def getObjects(self):
        """Consume items from this consumer's queue, logging each one.

        NOTE(review): the loop condition tests the truthiness of the queue
        object itself, not its emptiness; unless the queue type defines
        __bool__/__len__ accordingly, this loop never exits on its own --
        confirm against the queue implementation (a later variant of this
        method terminates on a 'done' sentinel instead).
        """

        while self.rq:

            item = self.rq.get()

            printMsg(self.sConsumerName, ':item:', item)
Beispiel #3
0
def findProjects(qRedis, sCorpusPath, dConfig):
    """Walk sCorpusPath for project roots at directory depth 11, skipping
    roots containing "github", and either queue them on qRedis (when
    dConfig['redis'] is set) or collect them in a list.

    Returns the collected list (empty in redis mode).
    """

    lProjectPaths = []
    iCount = 0

    for sRoot, lDirs, lFiles in os.walk(sCorpusPath):

        iLevel = sRoot.count(os.sep)

        # do not descend below the project-root depth
        if iLevel >= 11:
            del lDirs[:]

        # only depth-11, non-"github" roots qualify
        if iLevel != 11 or "github" in sRoot:
            continue

        if dConfig['debug']:
            debug('func: findProjects()', 'projects-root:', sRoot, iLevel)

        if dConfig['redis']:
            qRedis.put(sRoot)
        else:
            lProjectPaths.append(sRoot)

        iCount += 1

        # debug runs only sample the first handful of projects
        if dConfig['debug'] and iCount >= 10:
            break

    printMsg('func: findProjects()', str(iCount),
             'projects loaded into queue for processing')

    return lProjectPaths
Beispiel #4
0
def main(argv):
    """Spawn a pool of workers sharing one mutex and run test() in each,
    reporting total execution time."""

    iForks = 10
    iBegin = time.time()

    ### setup consumers

    # one mutex handed to every worker via the pool initializer
    lock = multiprocessing.Lock()

    lConsumerArgs = [("lock testing procId", iWorker)
                     for iWorker in range(iForks)]

    # create pool of workers -- number of workers equals the number of search strings to be processed
    oConsumerPool = multiprocessing.Pool(processes=iForks,
                                         initializer=initialize_lock,
                                         initargs=(lock, ))

    ### do work -- use pool of workers to search for each search string in muse-corpus-source es index
    oConsumerPool.map(test, lConsumerArgs)

    oConsumerPool.close()
    oConsumerPool.join()

    debug('func: main()', "all processes completed")

    iFinish = time.time()

    printMsg('func: main()', 'execution time:', (iFinish - iBegin), 'seconds')
def findProjects(sCorpusPath, dConfig):
    """Walk sCorpusPath and queue every directory at depth 11 onto the
    project-paths redis queue.

    Fix: debug() was previously called a second time unconditionally right
    after the guarded call, emitting debug output even when
    dConfig['debug'] was off; logging now happens only in debug mode.
    """

    qRedis = RedisQueue(dConfig['redis-queue-project-paths'], namespace='queue', host=dConfig['redis-loc'], port=dConfig['redis-port'])

    iCount = 0

    for sRoot, lDirs, lFiles in os.walk(sCorpusPath):

        iLevel = sRoot.count(os.sep)

        # prune the walk below the project-root depth
        if iLevel >= 11:

            del lDirs[:]

        if iLevel == 11:

            if dConfig['debug']: debug('func: findProjects()', 'projects-root:', sRoot, iLevel)

            qRedis.put(sRoot)

            iCount += 1

            # debug runs only sample the first few projects
            if dConfig['debug'] and iCount >= 10: break

    printMsg('func: findProjects()', str(iCount), 'projects loaded into queue for processing')
    def getObjects(self):
        """Consume and log queue items until the 'done' sentinel arrives."""

        while True:

            item = self.rq.get()

            printMsg(self.sConsumerName, ':item:', item)

            # producer signals end-of-stream with a literal 'done'
            if item == 'done':
                break
def main(argv):
    """Report and export projects that have multiple build types."""

    oDb = MuseProjectDB()
    oDb.open()

    (lSameType, lMultiType) = oDb.findMultipleBuildTypeProjects()

    printMsg ( '# of same-type projects: ', len(lSameType), '# of multiple-build-type projects:', len(lMultiType) )

    # dump each result set to its own JSON file
    with open('multipleSameTypeProjects.json', 'w') as fOut:
        json.dump(lSameType, fOut, indent=4)

    with open('multipleBuildTypeProjects.json', 'w') as fOut:
        json.dump(lMultiType, fOut, indent=4)

    oDb.close()
def findProjects(sCorpusPath, iForks, dConfig):
    """Locate project roots at directory depth 11 under sCorpusPath.

    In redis mode the roots go onto a freshly flushed redis queue and an
    empty list is returned; otherwise the roots are returned as a list.
    iForks is accepted for signature compatibility with callers.
    """

    lProjectPaths = []

    if dConfig['redis']:

        qRedis = RedisQueue(dConfig['redis-queue-name'],
                            namespace='queue',
                            host=dConfig['redis-loc'])

        # ensure redis queue is empty prior to starting consumers
        qRedis.flush()

    iCount = 0

    for sRoot, lDirs, lFiles in os.walk(sCorpusPath):

        iLevel = sRoot.count(os.sep)

        # no need to descend past the project-root depth
        if iLevel >= 11:
            del lDirs[:]

        if iLevel != 11:
            continue

        if dConfig['debug']:
            debug('func: findProjects()', 'projects-root:', sRoot, iLevel)

        if dConfig['redis']:
            qRedis.put(sRoot)
        else:
            lProjectPaths.append(sRoot)

        iCount += 1

        # debug runs stop after the first project
        if dConfig['debug'] and iCount >= 1:
            break

    printMsg('func: findProjects()', str(iCount),
             'projects loaded into queue for processing')

    return lProjectPaths
def createBuildSummaries(dConfig):
    """Assemble per-project build summaries from mysql and queue them as
    JSON documents on redis for downstream writers.

    Flow (Python 2 code: relies on dict.iteritems() and generator .next()):
      1. Bin project names into redis sets by build outcome
         (success / partial / fail), one mysql table per outcome.
      2. Stream buildStatusWithTargets rows ordered by
         projectName,buildTarPath and fold consecutive rows into one
         summary dict per project ('builds' list, each with 'targets').
      3. Push each completed project summary onto the redis JSON queue.
    """

    # destination queue for the per-project JSON summaries
    qRedis = RedisQueue(dConfig['redis-queue-json'],
                        namespace='queue',
                        host=dConfig['redis-loc'],
                        port=dConfig['redis-port'])

    dMp = MuseProjectDB(db=dConfig['mysql-db'],
                        port=dConfig['mysql-port'],
                        user=dConfig['mysql-user'],
                        passwd=dConfig['mysql-passwd'],
                        loc=dConfig['mysql-loc'])

    # debug runs only look at a handful of rows
    sLimitClause = ''
    if dConfig['debug']: sLimitClause = '10'

    # mysql outcome table -> redis bin name
    dReturnCodeLookup = {
        'buildSuccess': 'success',
        'buildPartial': 'partial',
        'buildFail': 'fail'
    }

    # column order here must match the tuple unpack of tTargetRow below
    sSelectClause = 'projectName,projectPath,buildTarPath,buildTime,version,os,numObjectsPreBuild,numObjectsPostBuild,numObjectsGenerated,numSources,buildTargetPath,configureBuildType,configureacBuildType,configureinBuildType,cmakeBuildType,makefileBuildType,antBuildType,mavenBuildType,returnCode'

    # one-hot build-type flag columns; first flag == 1 wins (see loops below)
    lTargetTypes = [
        'configureBuildType', 'configureacBuildType', 'configureinBuildType',
        'cmakeBuildType', 'makefileBuildType', 'antBuildType', 'mavenBuildType'
    ]

    dMp.open()

    iProjectCount = 0

    # per-outcome redis sets used later for the buildStatus lookup
    dProjects = {
        'success':
        RedisSet(dConfig['redis-set'] + '-success',
                 namespace='set',
                 host=dConfig['redis-loc'],
                 port=dConfig['redis-port']),
        'partial':
        RedisSet(dConfig['redis-set'] + '-partial',
                 namespace='set',
                 host=dConfig['redis-loc'],
                 port=dConfig['redis-port']),
        'fail':
        RedisSet(dConfig['redis-set'] + '-fail',
                 namespace='set',
                 host=dConfig['redis-loc'],
                 port=dConfig['redis-port'])
    }

    # Python 2 only: iteritems()
    for sTable, sProjectBin in dReturnCodeLookup.iteritems():

        # empty redis set
        dProjects[sProjectBin].flush()

        lProjects = dMp.select(sSelectClause='projectName',
                               sTable=sTable,
                               sOrderByClause='projectName',
                               sLimitClause=sLimitClause,
                               bDebug=dConfig['debug'])

        # populate redis set with projects of each bin type
        for tProject in lProjects:

            (sProjectName, ) = tProject

            dProjects[sProjectBin].put(sProjectName)

    # accumulator for the project currently being folded; rows are ordered
    # by projectName so a name change marks a project boundary
    dProjectSummary = {}

    lTargetRows = dMp.select(sSelectClause=sSelectClause,
                             sTable='buildStatusWithTargets',
                             sOrderByClause='projectName,buildTarPath',
                             sLimitClause=sLimitClause,
                             bDebug=dConfig['debug'])

    for tTargetRow in lTargetRows:

        dTarget = {}

        # unpack in the exact order of sSelectClause above
        (dTarget['projectName'], dTarget['projectPath'],
         dTarget['buildTarPath'], dTarget['buildTime'], dTarget['version'],
         dTarget['os'], dTarget['numObjectsPreBuild'],
         dTarget['numObjectsPostBuild'], dTarget['numObjectsGenerated'],
         dTarget['numSources'], dTarget['buildTargetPath'],
         dTarget['configureBuildType'], dTarget['configureacBuildType'],
         dTarget['configureinBuildType'], dTarget['cmakeBuildType'],
         dTarget['makefileBuildType'], dTarget['antBuildType'],
         dTarget['mavenBuildType'], dTarget['returnCode']) = tTargetRow

        if dProjectSummary:

            if dProjectSummary['projectName'] == dTarget['projectName']:

                # probe for an existing build entry with this tar path;
                # .next() is Python 2 only (py3: next(...)).
                # NOTE(review): the generator's value is discarded and, since
                # genexp variables don't leak, a successful probe leaves
                # dBuild bound to a *previous* build dict -- the target
                # appended below may land on the wrong build; verify intent.
                try:

                    (dBuild for dBuild in dProjectSummary['builds']
                     if dBuild['buildTarPath'] == dTarget['buildTarPath']
                     ).next()

                except (StopIteration) as e:

                    # unseen tar path: start a new build entry
                    dBuild = {
                        'buildTarPath': dTarget['buildTarPath'],
                        'buildTime': dTarget['buildTime'],
                        'version': dTarget['version'],
                        'os': dTarget['os'],
                        'numObjectsPreBuild': dTarget['numObjectsPreBuild'],
                        'numObjectsPostBuild': dTarget['numObjectsPostBuild'],
                        'numObjectsGenerated': dTarget['numObjectsGenerated'],
                        'numSources': dTarget['numSources'],
                        'targets': []
                    }

                    dProjectSummary['builds'].append(dBuild)

                dTargetSummary = {
                    'buildTargetPath': dTarget['buildTargetPath'],
                    'returnCode': dTarget['returnCode']
                }

                # first build-type flag set to 1 determines the target type
                for sTargetType in lTargetTypes:

                    if dTarget[sTargetType] == 1:

                        dTargetSummary['target-type'] = sTargetType
                        break

                dBuild['targets'].append(dTargetSummary)

            else:

                # project boundary: flush the finished summary to redis
                if dConfig['debug']:
                    debug('func: createBuildSummaries() dProjectSummary:',
                          json.dumps(dProjectSummary, indent=4))
                qRedis.put(json.dumps(dProjectSummary))
                iProjectCount += 1
                dProjectSummary = {}

        if not dProjectSummary:

            # project specific build summary info

            dBuild = {
                'buildTarPath': dTarget['buildTarPath'],
                'buildTime': dTarget['buildTime'],
                'version': dTarget['version'],
                'os': dTarget['os'],
                'numObjectsPreBuild': dTarget['numObjectsPreBuild'],
                'numObjectsPostBuild': dTarget['numObjectsPostBuild'],
                'numObjectsGenerated': dTarget['numObjectsGenerated'],
                'numSources': dTarget['numSources'],
                'targets': []
            }

            dProjectSummary = {
                'projectName': dTarget['projectName'],
                'sourcePath': dTarget['projectPath'],
                'builds': [dBuild]
            }

            # outcome bin membership (redis sets filled above) -> buildStatus
            if dTarget['projectName'] in dProjects['success']:
                dProjectSummary['buildStatus'] = 'success'
            elif dTarget['projectName'] in dProjects['partial']:
                dProjectSummary['buildStatus'] = 'partial'
            elif dTarget['projectName'] in dProjects['fail']:
                dProjectSummary['buildStatus'] = 'fail'

            # target specific build summary info

            dTargetSummary = {
                'buildTargetPath': dTarget['buildTargetPath'],
                'returnCode': dTarget['returnCode']
            }

            for sTargetType in lTargetTypes:

                if dTarget[sTargetType] == 1:

                    dTargetSummary['target-type'] = sTargetType
                    break

            dBuild['targets'].append(dTargetSummary)

    # flush the last project's summary (no trailing row triggers it)
    if dProjectSummary:

        if dConfig['debug']:
            debug('func: createBuildSummaries() dProjectSummary:',
                  json.dumps(dProjectSummary, indent=4))
        qRedis.put(json.dumps(dProjectSummary))
        iProjectCount += 1

        dProjectSummary = {}

    dMp.close()

    printMsg('func: createBuildSummaries()', str(iProjectCount),
             'projects queued')
def main(argv):
    """Create per-project build summaries and write them out, single
    process.

    Flags: -f/--forks N sets the worker count, -d/--debug enables debug.
    """

    # defaults
    bError = False

    dConfig = {
        'debug': False,
        'forks': 5,
        'mysql-db': 'muse',
        'mysql-user': '******',
        'mysql-passwd': 'muse',
        'mysql-loc': 'muse2-int',
        'mysql-port': 54321,
        'mysql': True,
        'redis-queue-json': 'muse-json',
        'redis-set': 'muse-projects',
        'redis-loc': 'muse2-int',
        # 'redis-port': '6379',
        'redis-port': '12345',
        'redis': True,
    }

    ### command line argument handling
    options, remainder = getopt.getopt(sys.argv[1:], 'f:d',
                                       ['forks=', 'debug'])

    for sOpt, sArg in options:

        if sOpt in ('-f', '--forks'):
            try:
                dConfig['forks'] = int(sArg)
            except ValueError as e:
                # non-numeric fork count -> usage error
                bError = True

        elif sOpt in ('-d', '--debug'):
            dConfig['debug'] = True

    debug('func: main()', 'dConfig:', json.dumps(dConfig, indent=4))

    if bError:
        usage()
    else:

        iBegin = time.time()

        # prepare redis queue for producer, flush queue before starting the producer
        qRedis = RedisQueue(dConfig['redis-queue-json'],
                            namespace='queue',
                            host=dConfig['redis-loc'],
                            port=dConfig['redis-port'])
        qRedis.flush()

        # NOTE: a multi-process producer/consumer variant (Process +
        # Pool.map over writeBuildSummaries) used to live here; the
        # single-process path below is the one in use.
        createBuildSummaries(dConfig)
        writeBuildSummaries(dConfig)

        if dConfig['debug']: debug('func: main()', "all processes completed")

        iFinish = time.time()

        printMsg('func: main()', 'execution time:', (iFinish - iBegin), 'seconds')
def main(argv):
    """Scan the corpus for project roots and fan per-project work out to a
    multiprocessing pool, via redis (producer/consumer) or an in-memory
    list.

    Flags: -c/--corpus-dir-path PATH, -f/--forks N, -r/--redis, -d/--debug.
    """

    # defaults
    sCorpusPath = '/data/corpus'

    dConfig = {
        'debug': False,
        'redis-queue-name': 'muse-project-paths-perms',
        'redis-loc': '38.100.20.212',
        'redis': False,
        'time-stamp': datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S'),
    }

    iForks = 10
    bError = False

    ### command line argument handling
    options, remainder = getopt.getopt(
        sys.argv[1:], 'c:f:rd',
        ['corpus-dir-path=', 'forks=', 'redis', 'debug'])

    for sOpt, sArg in options:

        if sOpt in ('-c', '--corpus-dir-path'):
            sCorpusPath = sArg

        elif sOpt in ('-d', '--debug'):
            dConfig['debug'] = True

        elif sOpt in ('-r', '--redis'):
            dConfig['redis'] = True

        elif sOpt in ('-f', '--forks'):
            try:
                iForks = int(sArg)
            except ValueError as e:
                bError = True

    # the corpus path must be an existing directory
    if not os.path.isdir(sCorpusPath):
        bError = True

    if bError:
        usage()
    else:

        iBegin = time.time()

        ### setup producer

        lProjectPaths = []

        if dConfig['redis']:
            # producer process fills the redis queue with project path roots
            pProducer = multiprocessing.Process(target=findProjects,
                                                args=(sCorpusPath, iForks,
                                                      dConfig))
            pProducer.start()
        else:
            lProjectPaths = findProjects(sCorpusPath, iForks, dConfig)

        ### setup consumers
        # create pool of workers
        oPool = multiprocessing.Pool(processes=iForks)

        if dConfig['redis']:
            # each worker drains the shared redis queue
            lArgs = [dConfig] * iForks
            oPool.map(processProjects, lArgs)
            pProducer.join()
        else:
            # each worker walks one project path
            lArgs = [(sPath, dConfig) for sPath in lProjectPaths]
            oPool.map(findProjectFiles, lArgs)

        oPool.close()
        oPool.join()

        if dConfig['debug']: debug('func: main()', "all processes completed")

        iFinish = time.time()

        printMsg('func: main()', 'execution time:', (iFinish - iBegin), 'seconds')
Beispiel #12
0
def main(argv):
    """Index the RAT corpus source tree into elasticsearch.

    Finds project roots under the corpus path, then either feeds them to a
    redis-backed producer/consumer pool (-r) or processes them serially,
    ingesting file records into the rat-corpus-source index.

    Flags: -c/--corpus-dir-path PATH, -f/--forks N, -r/--redis, -d/--debug.
    """

    # defaults
    sCorpusPath = '/data/builder_SAN2/RAT'
    #    sCorpusPath = '/data/corpus_0to7'
    #    sCorpusPath = '/data/corpus_8tof'

    dConfig = {}
    dConfig['es-bulk-chunk-size'] = 500
    dConfig['debug'] = False
    # binding to muse2 doesn't work right now
    dConfig['es-instance-locs'] = ['muse1-int', 'muse2-int', 'muse3-int']
    #dConfig['es-instance-locs'] = ['muse2-int','muse3-int']
    #dConfig['es-instance-locs'] = ['muse3-int']
    dConfig['es-index-name'] = 'rat-corpus-source'
    dConfig['es-index-type'] = 'files'
    dConfig['redis-queue-name'] = 'rat-project-paths'
    dConfig['redis-loc'] = 'muse2-int'
    dConfig['redis-port'] = '12345'
    dConfig['redis'] = False

    # run timestamp stamped onto ingested documents
    dConfig['time-stamp'] = datetime.datetime.now().strftime(
        '%Y-%m-%d %H:%M:%S')

    iForks = 5
    bError = False

    ### command line argument handling
    options, remainder = getopt.getopt(
        sys.argv[1:], 'c:f:rd',
        ['corpus-dir-path=', 'forks=', 'redis', 'debug'])

    # debug('func: main()', 'options:', options)
    # debug('func: main()', 'remainder:', remainder)

    for opt, arg in options:

        if opt in ('-c', '--corpus-dir-path'):

            sCorpusPath = arg

        elif opt in ('-d', '--debug'):

            dConfig['debug'] = True

        elif opt in ('-r', '--redis'):

            dConfig['redis'] = True

        elif opt in ('-f', '--forks'):

            try:

                iForks = int(arg)

            except ValueError as e:

                bError = True

    # corpus path must exist
    if not os.path.isdir(sCorpusPath):

        bError = True

    if bError: usage()
    else:

        iStart = time.time()

        #oES = createESIndex(dConfig)
        oES = Elasticsearch(dConfig['es-instance-locs'])

        ### setup producer

        lProjectPaths = []

        if dConfig['redis']:

            qRedis = RedisQueue(dConfig['redis-queue-name'],
                                namespace='queue',
                                host=dConfig['redis-loc'],
                                port=dConfig['redis-port'])

            # ensure redis queue is empty prior to starting consumers
            # qRedis.flush()

            # call producer process that populates redis queue with project path roots

            pProducer = multiprocessing.Process(target=findProjects,
                                                args=(qRedis, sCorpusPath,
                                                      dConfig))
            pProducer.start()

        else:

            # serial path: qRedis argument is unused, hence None
            lProjectPaths = findProjects(None, sCorpusPath, dConfig)

        ### setup consumers
        lArgs = []

        # NOTE(review): hard-coded override -- this discards the -f/--forks
        # value parsed above; confirm whether intentional (e.g. debugging).
        iForks = 1

        if dConfig['redis']:

            # create pool of workers
            oPool = multiprocessing.Pool(processes=iForks)

            for i in range(0, iForks):

                lArgs.append(dConfig)

            ### do work -- use pool of workers to descend into each project path recording/ingesting all file names
            oPool.map(processProjects, lArgs)
            pProducer.join()

            oPool.close()
            oPool.join()

        else:

            # serial ingest of each project path
            for sPath in lProjectPaths:

                findProjectFiles((sPath, oES, dConfig))

        if dConfig['debug']: debug('func: main()', "all processes completed")

        # es index was created with replication turned off for speed, turn on replicating shards
        turnReplicationOn(oES, dConfig)

        # refresh to make the documents available for search
        oES.indices.refresh(index=dConfig['es-index-name'])

        # and now we can count the documents
        printMsg('func: main()', 'number of documents in',
                 dConfig['es-index-name'], 'index: ',
                 oES.count(index=dConfig['es-index-name'])['count'])

        iEnd = time.time()

        printMsg('func: main()', 'execution time:', (iEnd - iStart), 'seconds')
def indexSourceTargets(dConfig):
    """Drain source-target queries from redis, run each against the
    elasticsearch file index via scan/scroll, and bulk-insert the matching
    source files into mysql.

    Exits when the redis queue stays empty past the 30s get() timeout.

    Fix: the bulk-insert progress message referenced an undefined name
    (iCtr), raising NameError the first time the bulk path fired; a
    running counter (iLoadedCount) is now maintained and reported.
    """

    # setup mysql client
    dMp = MuseProjectDB(db=dConfig['mysql-db'],port=dConfig['mysql-port'],user=dConfig['mysql-user'],passwd=dConfig['mysql-passwd'],loc=dConfig['mysql-loc'])
    dMp.open()

    # setup elasticsearch client
    oES = Elasticsearch(dConfig['es-instance-locs'])

    # setup source targets queue
    qRedis = RedisQueue(dConfig['redis-queue-source-targets'], namespace='queue', host=dConfig['redis-loc'], port=dConfig['redis-port'])

    # running total of source targets inserted into mysql
    iLoadedCount = 0

    while 1:

        sQuery = qRedis.get(block=True, timeout=30)

        if sQuery:

            dQuery = json.loads(sQuery)

            if dConfig['debug']: debug( 'func: indexSourceTargets() dQuery:', json.dumps(dQuery) )

            lSourceFiles = []

            # scroll time set to 10 minutes, change as needed -- required for consistent results, the scroll token expires at the end of scroll time

            dResponse = oES.search(index=dConfig['es-file-index-name'], doc_type=dConfig['es-file-index-type'], body=json.dumps(dQuery), search_type='scan', scroll='20m', timeout='20m', lowercase_expanded_terms=False)
            sScrollId = dResponse['_scroll_id']

            if dConfig['debug']: debug('func: indexSourceTargets() (after initial search) dResponse: ', dResponse)

            if dConfig['debug']: debug('func: indexSourceTargets() search hits: ', dResponse['hits']['total'])

            # defensive key checks: older es client versions vary the shape
            while 'timed_out' in dResponse and not dResponse['timed_out'] and 'hits' in dResponse and 'total' in dResponse['hits'] and dResponse['hits']['total'] > 0:

                dResponse = oES.scroll(scroll_id=sScrollId, scroll='20m')

                sScrollId = dResponse['_scroll_id']

                if ('hits' in dResponse['hits']) and (len(dResponse['hits']['hits']) > 0):

                    if dConfig['debug']: debug('func: indexSourceTargets() scroll_id:', sScrollId, 'number of hits:', len(dResponse['hits']['hits']) )

                    for dHit in dResponse['hits']['hits']:

                        # found matches

                        try:

                            if '_source' in dHit:

                                # strip any leading path from the found build
                                # target, keeping only the basename
                                mBuildTarget = dHit['_source']['file']
                                mBuildTarget = mBuildTarget.split('/')
                                dHit['_source']['file'] = mBuildTarget[-1]

                                dProjectFound = {}

                                # start with every known source type unset
                                lSourceTypes = dMp.getSourceTypes()
                                for sSourceType in lSourceTypes:

                                    dProjectFound[sSourceType] = False

                                if 'file' in dHit['_source'] and dHit['_source']['file']:

                                    (sFileName, sFileExt) = os.path.splitext(dHit['_source']['file'])

                                    # flag the source type matching the extension
                                    if sFileExt.lower() in dConfig['source-targets'].keys():

                                        dProjectFound[ dConfig['source-targets'][ sFileExt.lower() ] ] = True

                                else:

                                    warning( 'func indexSourceTargets() es returned an improper source target:', json.dumps(dHit['_source']) )
                                    continue

                                if 'project-name' in dHit['_source'] and dHit['_source']['project-name']: dProjectFound['projectName'] = dHit['_source']['project-name']
                                if 'project-path' in dHit['_source'] and dHit['_source']['project-path']: dProjectFound['projectPath'] = dHit['_source']['project-path']
                                if 'path' in dHit['_source'] and dHit['_source']['path']:

                                    dProjectFound['buildTargetPath'] = verifyEncoding( dHit['_source']['path'] )

                                lSourceFiles.append(dProjectFound)

                                # bulk-insert once enough rows have accumulated
                                if (len(lSourceFiles) > dConfig['mysql-bulk-statement-size']) and dConfig['mysql']:

                                    iLoadedCount += len(lSourceFiles)
                                    dMp.insertIntoSourceTargets(lTargets=lSourceFiles, bDebug=dConfig['debug'])
                                    printMsg('func indexSourceTargets() loaded', iLoadedCount, 'source targets')

                                    lSourceFiles = []

                        except (UnicodeDecodeError, UnicodeEncodeError) as e:

                            # malformed text in the es payload: log and keep going
                            warning('func indexSourceTargets() encountered exception:', e)
                            warning('func indexSourceTargets() full _source payload: ', json.dumps( dHit['_source'], indent=4 ) )

                else:

                    break

                # flush the partial batch after each scroll page
                if (len(lSourceFiles) > 0) and dConfig['mysql']:

                    iLoadedCount += len(lSourceFiles)
                    dMp.insertIntoSourceTargets(lTargets=lSourceFiles, bDebug=dConfig['debug'])

                    lSourceFiles = []

        else:

            break

    dMp.close()
def main(argv):
    """Drive containerized builds of queued projects.

    Builds the run configuration (mysql, redis, elasticsearch, container
    paths and per-build-type scripts), then fans build work out to a
    multiprocessing pool running processBuildTargets with a shared mutex.

    Flags: -f/--forks N, -o/--os NAME (one of lSupportedOSs), -r/--rebuild,
    -d/--debug, -y/--debug-flags (compile with -g3 -O0 -DDEBUG).
    """

    # defaults
    bError = False

    dConfig = {}

    dConfig['containerImage'] = 'musebuilder'
    #dConfig['containerPath'] = '/data/builder'
    dConfig['containerPath'] = '/data/builder_SAN/containers'

    dConfig['debug'] = False

    dConfig['elasticsearch'] = True
    dConfig['es-instance-locs'] = ['muse1-int', 'muse2-int', 'muse3-int']
    #dConfig['es-instance-locs'] = ['muse2-int','muse3-int']
    #dConfig['es-instance-locs'] = ['muse3-int']

    #dConfig['es-file-index-name'] = 'muse-corpus-source'
    dConfig['es-file-index-name'] = 'muse-corpus-build'
    dConfig['es-file-index-type'] = 'muse-project-build'

    dConfig['forks'] = 5

    # dots stripped so the hostname is usable in key names
    dConfig['hostname'] = socket.gethostname().replace('.', '')

    # NOTE(review): 'mysql-user' looks like a redaction artifact ('******');
    # confirm the real credential source before running.
    dConfig['mysql-db'] = 'muse'
    dConfig['mysql-user'] = '******'
    dConfig['mysql-passwd'] = 'muse'
    dConfig['mysql-loc'] = 'muse2-int'
    dConfig['mysql-port'] = 54321
    dConfig['mysql'] = True

    dConfig['rebuild'] = False

    # '-<os>' suffix is appended to redis-already-built below
    dConfig['redis-already-built'] = 'muse-already-built-'
    dConfig['redis-already-built-nate'] = 'NEWbuiltProjects'
    dConfig['redis-queue-to-build'] = 'muse-to-build'
    dConfig['redis-queue-building'] = 'muse-building'
    dConfig['redis-loc'] = 'muse2-int'
    # dConfig['redis-port'] = '6379'
    dConfig['redis-port'] = '12345'
    dConfig['redis'] = True

    dArgs = {}

    # number of attempts with each to build targets to resolve dependencies
    dArgs['buildCycles'] = 2
    dArgs['containerMem'] = '2g'

    # host-side build scripts, one per build type
    dArgs['buildScripts'] = {}
    dArgs['buildScripts']['root'] = '/managed/scripts'
    dArgs['buildScripts']['loader'] = os.path.join(
        dArgs['buildScripts']['root'], 'runBuild.sh')
    dArgs['buildScripts']['cmakeBuildType'] = os.path.join(
        dArgs['buildScripts']['root'], 'cmake.sh')
    dArgs['buildScripts']['configureBuildType'] = os.path.join(
        dArgs['buildScripts']['root'], 'configure.sh')
    dArgs['buildScripts']['configureacBuildType'] = os.path.join(
        dArgs['buildScripts']['root'], 'configureac.sh')
    dArgs['buildScripts']['configureinBuildType'] = os.path.join(
        dArgs['buildScripts']['root'], 'configurein.sh')
    dArgs['buildScripts']['makefileBuildType'] = os.path.join(
        dArgs['buildScripts']['root'], 'make.sh')

    # the same scripts as seen from inside the container
    dArgs['containerScripts'] = {}
    dArgs['containerScripts']['root'] = '/scripts'
    dArgs['containerScripts']['cmakeBuildType'] = os.path.join(
        dArgs['containerScripts']['root'], 'cmake.sh')
    dArgs['containerScripts']['configureBuildType'] = os.path.join(
        dArgs['containerScripts']['root'], 'configure.sh')
    dArgs['containerScripts']['configureacBuildType'] = os.path.join(
        dArgs['containerScripts']['root'], 'configureac.sh')
    dArgs['containerScripts']['configureinBuildType'] = os.path.join(
        dArgs['containerScripts']['root'], 'configurein.sh')
    dArgs['containerScripts']['makefileBuildType'] = os.path.join(
        dArgs['containerScripts']['root'], 'make.sh')

    dArgs['containerDirs'] = ['buildArtifacts', 'output', 'scripts', 'source']
    dArgs['containerOS'] = 'ubuntu14'
    dArgs['containerPath'] = dConfig['containerPath']

    # image name is '<containerImage>-<containerOS>'
    dArgs['imageName'] = dConfig['containerImage'] + '-' + dArgs['containerOS']

    dArgs['script-name'] = 'build.sh'
    '''
    dArgs['build-targets'] = {
        'configure' : 'configureBuildType',
        'configure.ac' : 'configureacBuildType',
        'configure.in' : 'configureinBuildType',
        'CMakeLists.txt' : 'cmakeBuildType',
        'Makefile' : 'makefileBuildType'
        #'build.xml' : 'antBuildType', 
        #'pom.xml' : 'mavenBuildType'
    }
    '''

    # default compilers; replaced by -y/--debug-flags below
    dArgs['source-compilers'] = {'cBuildType': 'gcc', 'cppBuildType': 'g++'}
    '''
    dArgs['source-targets'] = {
        '.c' : 'cBuildType',
        '.cc' : 'cppBuildType',
        '.cpp' : 'cppBuildType',
        '.cxx' : 'cppBuildType',
        '.c++' : 'cppBuildType'
    }
    '''

    lSupportedOSs = ['fedora20', 'fedora21', 'ubuntu12', 'ubuntu14']

    ### command line argument handling
    options, remainder = getopt.getopt(
        sys.argv[1:], 'f:o:rdy',
        ['forks=', 'os=', 'rebuild', 'debug', 'debug-flags'])

    # debug('func: main()', 'options:', options)
    # debug('func: main()', 'remainder:', remainder)

    for opt, arg in options:

        if opt in ('-f', '--forks'):

            try:

                dConfig['forks'] = int(arg)

            except ValueError as e:

                bError = True

        elif opt in ('-o', '--os'):

            # only whitelisted OS names are accepted
            if arg in lSupportedOSs:

                dArgs['containerOS'] = arg
                dArgs['imageName'] = dConfig['containerImage'] + '-' + dArgs[
                    'containerOS']

            else:

                bError = True

        elif opt in ('-r', '--rebuild'):

            dConfig['rebuild'] = True

        elif opt in ('-d', '--debug'):

            dConfig['debug'] = True

        elif opt in ('-y', '--debug-flags'):

            # compile with debug symbols and no optimization
            dArgs['source-compilers'] = {
                'cBuildType': 'gcc -g3 -O0 -DDEBUG',
                'cppBuildType': 'g++ -g3 -O0 -DDEBUG'
            }

    debug('func: main()', 'dConfig:', json.dumps(dConfig, indent=4))

    if bError: usage()
    else:
        '''
        # pre-initialization -- if projects remained in building queue, put them back in queue-to-build
        qToBuildRedis = RedisQueue(name=dConfig['redis-queue-building'], name2=dConfig['redis-queue-to-build'], namespace='queue', host=dConfig['redis-loc'], port=dConfig['redis-port'])

        for iCtr in range(0, len(qToBuildRedis)):

            qToBuildRedis.getnpush()
        '''

        # scope the already-built set to the chosen container OS
        dConfig['redis-already-built'] = dConfig[
            'redis-already-built'] + dArgs['containerOS']

        sExistingBuilds = RedisSet(name=dConfig['redis-already-built'],
                                   namespace='set',
                                   host=dConfig['redis-loc'],
                                   port=dConfig['redis-port'])
        sExistingBuilds.flush()

        # unless rebuilding, pre-load the set of builds already done
        if not dConfig['rebuild']:

            loadExistingBuilds(dConfig, dArgs['containerOS'])

        iStart = time.time()

        ### setup consumers

        lConsumerArgs = []

        # create a locking semaphore for mutex
        lock = multiprocessing.Lock()

        for iCtr in range(0, dConfig['forks']):

            lConsumerArgs.append((iCtr, dArgs, dConfig))

        # create pool of workers -- number of workers equals the number of search strings to be processed
        oConsumerPool = multiprocessing.Pool(processes=dConfig['forks'],
                                             initializer=initialize_lock,
                                             initargs=(lock, ))

        ### do work -- use pool of workers to search for each search string in muse-corpus-source es index
        print(lConsumerArgs)

        oConsumerPool.map(processBuildTargets, lConsumerArgs)

        oConsumerPool.close()
        oConsumerPool.join()

        # processBuildTargets( (0, dArgs, dConfig) )

        if dConfig['debug']: debug('func: main()', "all processes completed")

        iEnd = time.time()

        printMsg('func: main()', 'execution time:', (iEnd - iStart), 'seconds')
Beispiel #15
0
def main(argv):
    """Locate build files in the muse-corpus-source es index.

    Spawns one producer process per build-file name; each producer runs
    findBuildFiles() against elasticsearch. The consumer pool that would
    descend into project paths is currently disabled (kept commented out).
    """

    dConfig = {}
    dConfig['es-bulk-chunk-size'] = 500
    dConfig['debug'] = False
    dConfig['forks'] = 5
    # binding to muse2 doesn't work right now
    dConfig['es-instance-locs'] = ['38.100.20.211', '38.100.20.212']
    #dConfig['es-instance-locs'] = ['38.100.20.212']

    dConfig['es-file-index-name'] = 'muse-corpus-source'
    dConfig['es-file-index-type'] = 'muse-project-files'

    dConfig['es-project-index-name'] = 'muse-corpus-projects'
    dConfig['es-project-index-type'] = 'muse-project-buildtype'

    dConfig['redis-queue-name'] = 'muse-%s-projects'
    dConfig['redis-loc'] = '38.100.20.212'

    dConfig['time-stamp'] = datetime.datetime.now().strftime(
        '%Y-%m-%dT%H:%M:%S')
    dConfig['version'] = '1.0'

    # sSearchStrings = ['configure','configure.ac','configure.in','Makefile','build.xml','pom.xml']
    # sSearchStrings = ['configure']
    lSearchStrings = ['configure', 'configure.ac', 'configure.in']

    bBadArgs = False

    ### command line argument handling
    lOptions, lRemainder = getopt.getopt(sys.argv[1:], 'c:f:d',
                                         ['corpus-dir-path=', 'forks=', 'debug'])

    for sOpt, sArg in lOptions:

        if sOpt in ('-d', '--debug'):
            dConfig['debug'] = True

        elif sOpt in ('-f', '--forks'):
            # forks must be an integer; reject anything else
            try:
                dConfig['forks'] = int(sArg)
            except ValueError:
                bBadArgs = True

    if bBadArgs:
        usage()
        return

    iStart = time.time()

    ### setup producers -- one argument tuple per search string
    lProducerArgs = [(sSearchString, dConfig) for sSearchString in lSearchStrings]

    # create pool of workers -- number of workers equals the number of search strings to be processed
    oProducerPool = multiprocessing.Pool(processes=len(lProducerArgs))

    ### do work -- use pool of workers to search for each search string in muse-corpus-source es index
    oProducerPool.map(findBuildFiles, lProducerArgs)

    oProducerPool.close()
    oProducerPool.join()

    ### setup consumers (consumer pool is currently disabled below)
    lConsumerArgs = [dConfig for _ in range(dConfig['forks'])]

    # create pool of workers
    ##oConsumerPool = multiprocessing.Pool(processes=iForks)

    ### do work -- use pool of workers to descend into each project path recording/ingesting all file names
    ##oConsumerPool.map(findProjectFiles, lConsumerArgs)

    ##oConsumerPool.close()
    ##oConsumerPool.join()

    if dConfig['debug']: debug('func: main()', "all processes completed")

    iEnd = time.time()

    printMsg('func: main()', 'execution time:', (iEnd - iStart), 'seconds')
def queueUpSourceTargets(dConfig):
    """Queue buildable C/C++ source targets onto the redis to-build queue.

    Reads (projectName, projectPath, buildTargetPath) rows from mysql --
    ordered by projectName so each project's rows arrive contiguously --
    groups them into one project document per project, de-duplicates targets
    that share a leading path, and pushes each document as JSON onto
    dConfig['redis-queue-to-build'].

    No-op unless both dConfig['mysql'] and dConfig['redis'] are enabled.
    """

    if not (dConfig['mysql'] and dConfig['redis']):
        return

    dMp = MuseProjectDB(db=dConfig['mysql-db'],port=dConfig['mysql-port'],user=dConfig['mysql-user'],passwd=dConfig['mysql-passwd'],loc=dConfig['mysql-loc'])

    # setup to-build queue
    qRedis = RedisQueue(dConfig['redis-queue-to-build'], namespace='queue', host=dConfig['redis-loc'], port=dConfig['redis-port'])

    dMp.open()

    # get projects first to iterate through (makes it easier to build project specific dictionaries), limit if in debug mode
    iProjectCount = 0
    iTargetCount = 0
    iMultiTargets = 0   # projects that contributed more than one target

    sLimitClause = ''

    if dConfig['debug']: sLimitClause = '10'

    lLeadingPaths = []  # leading paths already queued for the current project
    dProject = {}       # project document currently being assembled

    # projectName -> codeDir fallback, used when no code dir can be derived
    # from the target path itself
    dCodeDirLookup = {}
    lProjectRows = dMp.select(sSelectClause='projectName,codeDir', sTable='availableProjects', bDebug=dConfig['debug'])
    for tProjectRow in lProjectRows:

        (sProjectName, sCodeDir) = tProjectRow
        dCodeDirLookup[sProjectName] = sCodeDir

    lTargetRows = []

    # pick the table/view matching the requested scope (unbuilt-only and/or
    # site-restricted); all variants return the same three columns
    if dConfig['unBuiltProjectsOnly']:

        if dConfig['queueSite']:

            lTargetRows = dMp.select(sSelectClause='projectName,projectPath,buildTargetPath', sTable='unBuiltSourceTargetsWithSite', sWhereClause='site=\'' + dConfig['queueSite'] + '\'', sOrderByClause='projectName', sLimitClause=sLimitClause, bDebug=dConfig['debug'])

        else:

            lTargetRows = dMp.select(sSelectClause='projectName,projectPath,buildTargetPath', sTable='unBuiltSourceTargets', sOrderByClause='projectName', sLimitClause=sLimitClause, bDebug=dConfig['debug'])

    else:

        if dConfig['queueSite']:

            lTargetRows = dMp.select(sSelectClause='projectName,projectPath,buildTargetPath', sTable='availableSourceTargetsWithSite', sWhereClause='site=\'' + dConfig['queueSite'] + '\'', sOrderByClause='projectName', sLimitClause=sLimitClause, bDebug=dConfig['debug'])

        else:

            lTargetRows = dMp.select(sSelectClause='projectName,projectPath,buildTargetPath', sTable='availableSourceTargets', sOrderByClause='projectName', sLimitClause=sLimitClause, bDebug=dConfig['debug'])

    dMp.close()

    # strip a leading "<tarball>.tgz/" path component; compiled once, and the
    # '.' is escaped (previously "*.tgz" matched any char before "tgz")
    oTarballPrefix = re.compile(r'[a-zA-Z0-9_-]*\.tgz/')

    for tTargetRow in lTargetRows:

        dTarget = {}

        (sProjectName, sProjectPath, dTarget['buildTargetPath'], ) = tTargetRow

        (_, sFileExt) = os.path.splitext( os.path.basename(dTarget['buildTargetPath']) )

        if not sFileExt:

            warning('func: queueUpSourceTargets() missing file extension encountered file-path:') #,dTarget['buildTargetPath'],'for project:', sProjectName)
            continue

        sFileExt = sFileExt.lower()

        if sFileExt not in dConfig['source-targets']:

            warning('func: queueUpSourceTargets() unknown C/C++ file extension encountered:', sFileExt, 'file-path:',dTarget['buildTargetPath'],'for project:', sProjectName)
            continue

        dTarget['buildType'] = dConfig['source-targets'][sFileExt]

        (sLeadingPath, sTarget) = os.path.split(dTarget['buildTargetPath'])

        # remove leading tarball component from path
        sLeadingPath = oTarballPrefix.sub('', sLeadingPath)
        dTarget['buildTargetPath'] = os.path.join(sLeadingPath, sTarget)

        # derive the code directory from the current target's leading path;
        # empty when the path has no directory component (previously a stale
        # value from an earlier target could leak into a new project here)
        sDerivedCodeDir = ''
        lPathParts = sLeadingPath.split('/')
        if len(lPathParts) > 1:
            sDerivedCodeDir = lPathParts[0]

        iTargetCount += 1

        if 'projectName' in dProject and dProject['projectName'] == sProjectName:

            # same project: only queue targets introducing a new leading path
            if sLeadingPath not in lLeadingPaths:

                dProject['targets'].append(dTarget)
                lLeadingPaths.append(sLeadingPath)

            else:

                iTargetCount -= 1
                if dConfig['debug']: debug('func: queueUpSourceTargets() already encountered path:',  sLeadingPath, 'not adding:', json.dumps(dTarget, indent=4))

        else:

            if 'projectName' in dProject:

                # new project encountered, push old project onto queue
                if dConfig['debug']: debug('func: queueUpSourceTargets() queuing project:', json.dumps(dProject, indent=4))
                qRedis.put(json.dumps(dProject))
                iProjectCount += 1
                if len(lLeadingPaths) > 1:
                    iMultiTargets += 1

            # prefer the path-derived code dir, fall back to the
            # availableProjects lookup (the two sources were previously used
            # inconsistently between the first and subsequent projects)
            dProject = {
                'projectName': sProjectName,
                'projectPath': sProjectPath,
                'version': dConfig['version'],
                'targets': [ dTarget ],
                'codeDir': sDerivedCodeDir or dCodeDirLookup.get(sProjectName, '')
            }

            lLeadingPaths = [ sLeadingPath ]

    # flush the final in-progress project; guarded so an empty dict is no
    # longer queued (and counted) when no target rows were selected
    if 'projectName' in dProject:

        if dConfig['debug']: debug('func: queueUpSourceTargets() queuing project:', json.dumps(dProject, indent=4))

        qRedis.put(json.dumps(dProject))
        iProjectCount += 1
        if len(lLeadingPaths) > 1:
            iMultiTargets += 1

    printMsg('func: queueUpSourceTargets()', str(iProjectCount), 'projects queued', str(iTargetCount), 'targets queued', str(iMultiTargets), 'multi-target projects queued')
    printMsg('func: queueUpSourceTargets()', qRedis.size(), 'projects reported by redis')
Beispiel #17
0
def findProjects(sLanguage, dConfig):
    """Collect the ids of all projects whose documents carry the field
    *sLanguage*, write the sorted ids to a local text file, and return them.

    sLanguage is a dotted field name; the text after the first '.' names the
    output file (assumes the name contains a '.' -- TODO confirm with callers).
    Uses the legacy scan/scroll API, so results stream in pages below.
    """

    # setup elasticsearch client
    oES = Elasticsearch(dConfig['es-instance-locs'])

    # matching project ids accumulated across all scroll pages
    lProjects = []

    # total hits processed; also drives the debug-mode early exits below
    iCtr = 0

    # match every document but return only the sLanguage field per hit
    dQuery = {"query": {"match_all": {}}, "fields": [sLanguage]}

    if dConfig['debug']:
        debug('func: findProjects() dQuery:', json.dumps(dQuery))

    # scroll time set to 10 minutes, change as needed -- required for consistent results, the scroll token expires at the end of scroll time

    # with search_type='scan' this first response carries no hits, only the
    # scroll id; actual hits arrive from the scroll() calls in the loop
    dResponse = oES.search(index=dConfig['es-project-index-name'],
                           doc_type=dConfig['es-project-index-type'],
                           body=json.dumps(dQuery),
                           search_type='scan',
                           scroll='20m',
                           timeout='20m',
                           lowercase_expanded_terms=False)
    sScrollId = dResponse['_scroll_id']

    if dConfig['debug']:
        debug('func: findProjects() (after initial search) dResponse: ',
              dResponse)

    if dConfig['debug']:
        debug('func: findProjects() search hits: ', dResponse['hits']['total'])

    #while not dResponse['timed_out'] and dResponse['hits']['hits']['total'] > 0:
    # defensive re-check of the response shape on every page; loop ends via
    # the break below once a page comes back empty
    while 'timed_out' in dResponse and not dResponse[
            'timed_out'] and 'hits' in dResponse and 'total' in dResponse[
                'hits'] and dResponse['hits']['total'] > 0:

        dResponse = oES.scroll(scroll_id=sScrollId, scroll='20m')

        # each scroll response issues a fresh scroll id for the next call
        sScrollId = dResponse['_scroll_id']

        if ('hits'
                in dResponse['hits']) and (len(dResponse['hits']['hits']) > 0):

            if dConfig['debug']:
                debug('func: findProjects() scroll_id:', sScrollId,
                      'number of hits:', len(dResponse['hits']['hits']))

            # NOTE(review): in debug mode these caps abandon the scroll early
            # (~10 pages / ~100 hits) so only a partial result is written
            if dConfig['debug'] and iCtr > 10: break

            for dHit in dResponse['hits']['hits']:

                iCtr += 1

                if dConfig['debug']:

                    debug('func: findProjects()', json.dumps(dHit, indent=4))

                    if iCtr > 100: break

                # found matches

                # only keep hits that actually returned the requested field
                if 'fields' in dHit and sLanguage in dHit[
                        'fields'] and '_id' in dHit:

                    lProjects.append(dHit['_id'])

        else:

            # empty page: scroll is exhausted
            break

    printMsg('func: findProjects() found ', str(iCtr),
             ' buildTargets, spawned process exiting...')

    # e.g. sLanguage 'language.C' -> './C.txt'
    sLanguageFileName = './' + sLanguage.split('.')[1] + '.txt'

    printMsg('func: findProjects() file created: ', sLanguageFileName)

    with open(sLanguageFileName, 'w') as fLanguage:
        for sProject in sorted(lProjects):
            fLanguage.write(sProject + '\n')

    return lProjects
def main(argv):
    """Drive corpus processing: crawl projects, analyze build/source targets,
    or queue targets for building, depending on command-line flags.

    Modes (mutually exclusive, first match wins):
      -p/--crawl-projects        walk the corpus and index project files
      -a/--analyze-projects      find build/source targets and index them
      -q/--queue-projects        populate the redis build queues
    """

    # defaults
    bError = False
    sCorpusPath = '/data/corpus_0to7'

    dConfig = {}

    dConfig['analyze-projects'] = False
    dConfig['crawl-projects'] = False

    dConfig['debug'] = False

    dConfig['es-bulk-chunk-size'] = 500
    dConfig['es-instance-locs'] = ['muse1-int','muse2-int','muse3-int']
    # dConfig['es-instance-locs'] = ['muse1-int','muse2-int']
    #dConfig['es-instance-locs'] = ['muse3-int']

    dConfig['es-file-index-name'] = 'muse-corpus-source-new'
    dConfig['es-file-index-type'] = 'files'

    dConfig['es-project-index-name'] = 'muse-projects'
    dConfig['es-project-index-type'] = 'projects'

    dConfig['forks'] = 5

    dConfig['mysql-db'] = 'muse'
    dConfig['mysql-user'] = '******'
    dConfig['mysql-passwd'] = 'muse'
    dConfig['mysql-loc'] = 'muse2-int'
    dConfig['mysql-port'] = 54321
    dConfig['mysql'] = True
    dConfig['mysql-bulk-statement-size'] = 100

    dConfig['queueUpFilesForBuilding'] = False
    dConfig['queueSite'] = ''

    dConfig['redis-queue-to-build'] = 'muse-to-build'
    dConfig['redis-queue-building'] = 'muse-building'
    # dConfig['redis-queue-build-targets'] = 'muse-build-targets'
    dConfig['redis-queue-project-paths'] = 'muse-project-paths'
    dConfig['redis-queue-source-targets'] = 'muse-source-targets'
    dConfig['redis-loc'] = 'muse2-int'
    dConfig['redis-port'] = '12345'
    dConfig['redis'] = True

    dConfig['time-stamp'] = datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S')
    dConfig['unBuiltProjectsOnly'] = False
    dConfig['version'] = '1.0'

    # build-file name -> type label and preference ranking (lower wins)
    dConfig['build-targets'] = {
        'configure' : { 'type' : 'configureBuildType', 'ranking': 4 },
        'configure.ac' : { 'type' : 'configureacBuildType', 'ranking': 2 },
        'configure.in' : { 'type' : 'configureinBuildType', 'ranking': 3 },
        'CMakeLists.txt' : { 'type' : 'cmakeBuildType', 'ranking': 1 },
        'Makefile' : { 'type' : 'makefileBuildType', 'ranking': 5 }
        #'build.xml' : { 'type' : 'antBuildType',  'ranking': 7 },
        #'pom.xml' : { 'type' : 'mavenBuildType', 'ranking': 6 }
    }

    # C/C++ source extension -> build type label
    dConfig['source-targets'] = {
        '.c' : 'cBuildType',
        '.cc' : 'cppBuildType',
        '.cpp' : 'cppBuildType',
        '.cxx' : 'cppBuildType',
        '.c++' : 'cppBuildType'
    }

    ### command line argument handling
    options, remainder = getopt.getopt(sys.argv[1:], 'c:f:apuqs:d', ['corpus-dir-path=','forks=','analyze-projects','crawl-projects','unbuilt-projects-only','queue-projects','queue-site=','debug'])

    debug('func: main()', 'options:', options)
    debug('func: main()', 'remainder:', remainder)

    for opt, arg in options:

        if opt in ('-c', '--corpus-dir-path'):

            sCorpusPath = arg

        elif opt in ('-f', '--forks'):

            # parse to int -- multiprocessing.Pool(processes=...) and
            # range(dConfig['forks']) below require an integer, and the raw
            # getopt value is a string (matches the handling in the other
            # main() in this file)
            try:

                dConfig['forks'] = int(arg)

            except ValueError:

                bError = True

        elif opt in ('-a', '--analyze-projects'):

            dConfig['analyze-projects'] = True

        elif opt in ('-p', '--crawl-projects'):

            dConfig['crawl-projects'] = True

        elif opt in ('-q', '--queue-projects'):

            dConfig['queueUpFilesForBuilding'] = True

        elif opt in ('-s', '--queue-site'):

            dConfig['queueSite'] = arg

        elif opt in ('-u', '--unbuilt-projects-only'):

            dConfig['unBuiltProjectsOnly'] = True

        elif opt in ('-d', '--debug'):

            dConfig['debug'] = True

    # debug(json.dumps(dConfig, indent=4))

    # crawling requires a readable corpus directory
    if dConfig['crawl-projects'] and not os.path.isdir(sCorpusPath): bError = True

    if bError: usage()
    else:

        iStart = time.time()

        ### setup producers

        if dConfig['crawl-projects']:

            # initialize projects table/queue
            initProjects(dConfig)

            # call producer process that populates mysql with project names from sCorpusPath
            pfindProjects = multiprocessing.Process( target=findProjects, args=(sCorpusPath, dConfig) )
            pfindProjects.start()

            # create pool of workers
            oProcessProjectsPool = multiprocessing.Pool(processes=dConfig['forks'])

            lArgs = []

            for i in range(0, dConfig['forks']):

                lArgs.append(dConfig)

            ### do work -- use pool of workers to index source targets
            oProcessProjectsPool.map(processProjects, lArgs)

            pfindProjects.join()

            oProcessProjectsPool.close()
            oProcessProjectsPool.join()

        elif dConfig['analyze-projects']:

            # initialize targets table/queue
            initTargets(dConfig)

            # build targets are found serially before source targets are indexed
            pBuildTargets = multiprocessing.Process( target=findBuildTargets, args=(dConfig, ) )
            pBuildTargets.start()

            pBuildTargets.join()

            pSourceTargets = multiprocessing.Process( target=findSourceTargets, args=(dConfig, ) )
            pSourceTargets.start()

            # create pool of workers
            oSourceTargetIndexerPool = multiprocessing.Pool(processes=dConfig['forks'])

            lArgs = []

            for i in range(0, dConfig['forks']):

                lArgs.append(dConfig)

            ### do work -- use pool of workers to index source targets
            oSourceTargetIndexerPool.map(indexSourceTargets, lArgs)

            pSourceTargets.join()

            oSourceTargetIndexerPool.close()
            oSourceTargetIndexerPool.join()

        elif dConfig['queueUpFilesForBuilding']:

            initBuildQueues(dConfig=dConfig)
            queueUpBuildTargets(dConfig=dConfig)
            queueUpSourceTargets(dConfig=dConfig)

        if dConfig['debug']: debug('func: main()', "all processes completed")

        iEnd = time.time()

        printMsg('func: main()', 'execution time:', (iEnd - iStart), 'seconds')
def findBuildTargets(dConfig):
    """Scan the es file index for known build files (configure, configure.ac,
    configure.in, CMakeLists.txt, Makefile) and bulk-insert one row per hit
    into the mysql build-targets table.

    Hits stream in via the legacy scan/scroll API in 20-minute windows; rows
    are flushed to mysql in batches of dConfig['mysql-bulk-statement-size'],
    with a per-page flush of any remainder at the end of each scroll page.
    """

    # setup mysql client
    dMp = MuseProjectDB(db=dConfig['mysql-db'],port=dConfig['mysql-port'],user=dConfig['mysql-user'],passwd=dConfig['mysql-passwd'],loc=dConfig['mysql-loc'])
    dMp.open()

    # setup elasticsearch client -- long timeouts/retries for large scrolls
    oES = Elasticsearch(dConfig['es-instance-locs'],timeout=180, max_retries=3, retry_on_timeout=True )

    # purge build targets queue -- considering if we need to split mysql ingestion from elasticsearch queries... mysql may benefit from consumer pool inserting statements concurrently
    # qRedis = RedisQueue(dConfig['redis-queue-build-targets'], namespace='queue', host=dConfig['redis-loc'], port=dConfig['redis-port'])

    # pending rows awaiting a bulk insert into mysql
    lBuildFiles = []

    # total hits processed; also drives the debug-mode early exits below
    iCtr = 0

    # must match: (a known build-file name) AND (a latest/ or content/ path)
    # AND (a file under /data/corpus_8tof)
    dQuery = {
        "query": {
            "bool": {
                "must": [
                    { "bool": {
                        "should": [
                            { "wildcard": { "file.raw": "*/configure.ac" } },
                            { "wildcard": { "file.raw": "*/configure.in" } },
                            { "wildcard": { "file.raw": "*/configure" } },
                            { "wildcard": { "file.raw": "*/CMakeLists.txt" } },
                            { "wildcard": { "file.raw": "*/Makefile" } }
                        ]
                      }
                    },
                    {
                      "bool": {
                        "should": [
                          { "match": { "path": "latest/*" } },
                          { "match": { "path": "content/*"} }
                        ]
                      }
                    },
                    {"wildcard":{"file.raw": "/data/corpus_8tof/*"}}
                ]
            }
        }
    }

    if dConfig['debug']: debug( 'func: findBuildFiles() dQuery:', json.dumps(dQuery) )

    # scroll time set to 10 minutes, change as needed -- required for consistent results, the scroll token expires at the end of scroll time

    # with search_type='scan' this first response carries no hits, only the
    # scroll id; actual hits arrive from the scroll() calls in the loop
    dResponse = oES.search(index=dConfig['es-file-index-name'], doc_type=dConfig['es-file-index-type'], body=json.dumps(dQuery), search_type='scan', scroll='20m', timeout='20m', lowercase_expanded_terms=False, request_timeout=180,)
    sScrollId = dResponse['_scroll_id']

    if dConfig['debug']: debug('func: findBuildFiles() (after initial search) dResponse: ', dResponse)

    if dConfig['debug']: debug('func: findBuildFiles() search hits: ', dResponse['hits']['total'])
    debug('func: findBuildFiles() search hits: ', dResponse['hits']['total'])

    # defensive re-check of the response shape each page; loop ends via the
    # break below once a scroll page comes back empty
    while 'timed_out' in dResponse and not dResponse['timed_out'] and 'hits' in dResponse and 'total' in dResponse['hits'] and dResponse['hits']['total'] > 0:

        dResponse = oES.scroll(scroll_id=sScrollId, scroll='20m')

        # each scroll response issues a fresh scroll id for the next call
        sScrollId = dResponse['_scroll_id']

        if ('hits' in dResponse['hits']) and (len(dResponse['hits']['hits']) > 0):

            if dConfig['debug']: debug('func: findBuildFiles() scroll_id:', sScrollId, 'number of hits:', len(dResponse['hits']['hits']) )

            # NOTE(review): in debug mode these caps abandon the scroll early,
            # so only a partial set of build targets is ingested
            if dConfig['debug'] and iCtr > 10: break

            for dHit in dResponse['hits']['hits']:

                iCtr += 1

                if dConfig['debug'] and iCtr > 10: break

                # found matches

                try:

                    if '_source' in dHit:

                        # reduce the 'file' field to its basename so it can be
                        # matched against the dConfig['build-targets'] keys
                        mBuildTarget=dHit['_source']['file'];
                        mBuildTarget=mBuildTarget.split('/')
                        dHit['_source']['file'] = mBuildTarget[len(mBuildTarget)-1]

                        dProjectFound = {}

                        # initialize all build target types to false
                        lBuildTypes = dMp.getBuildTypes()
                        for sBuildType in lBuildTypes:

                            dProjectFound[sBuildType] = False

                        # mark relevant build target type true
                        if 'file' in dHit['_source'] and dHit['_source']['file'] and dHit['_source']['file'] in dConfig['build-targets'].keys():

                            if dConfig['debug']: debug('func findBuildFiles() returned build target:', dHit['_source']['file'])

                            # flag the matching type and record its preference ranking
                            dProjectFound[ dConfig['build-targets'][ dHit['_source']['file'] ]['type'] ] = True
                            dProjectFound['ranking'] = dConfig['build-targets'][ dHit['_source']['file'] ]['ranking']

                        else:

                            warning( 'func findBuildFiles() es returned an improper build target:', json.dumps(dHit['_source']) )
                            continue

                        # copy identifying fields when present and non-empty
                        if 'project-name' in dHit['_source'] and dHit['_source']['project-name']: dProjectFound['projectName'] = dHit['_source']['project-name']
                        if 'project-path' in dHit['_source'] and dHit['_source']['project-path']: dProjectFound['projectPath'] = dHit['_source']['project-path']
                        if 'path' in dHit['_source'] and dHit['_source']['path']:

                            dProjectFound['buildTargetPath'] = verifyEncoding( dHit['_source']['path'] )
                            dProjectFound['depth'] = depth( dProjectFound['buildTargetPath'] )

                        # debug('func findBuildFiles()', json.dumps(dProjectFound))

                        lBuildFiles.append(dProjectFound)

                        # causing es reads to time out

                        # flush a full batch to mysql mid-page to bound memory
                        if (len(lBuildFiles) > dConfig['mysql-bulk-statement-size']) and dConfig['mysql']:

                            dMp.insertIntoBuildTargets(lTargets=lBuildFiles, bDebug=dConfig['debug'])
                            printMsg('func findBuildFiles() loaded', iCtr, 'build targets')

                            lBuildFiles = []

                except (UnicodeDecodeError, UnicodeEncodeError) as e:

                    # skip hits with undecodable paths; log the full payload for triage
                    warning('func findBuildFiles() encountered exception:', e)
                    #warning('func findBuildFiles() with string: ', dHit['_source']['path'])
                    warning('func findBuildFiles() full _source payload: ', json.dumps( dHit['_source'], indent=4 ) )

        else:

            # empty page: scroll is exhausted
            break

        # flush the partial batch accumulated during this scroll page
        if (len(lBuildFiles) > 0) and dConfig['mysql']:

            dMp.insertIntoBuildTargets(lTargets=lBuildFiles, bDebug=dConfig['debug'])

            lBuildFiles = []

    dMp.close()