コード例 #1
0
def verifyEncoding(sOriginal):
    """Encode sOriginal, trying utf-8, then latin-1, then utf-16.

    Returns the first successful encoding; if every codec fails, logs a
    warning and returns an empty string.
    """

    lCodecs = ('utf-8', 'latin-1', 'utf-16')

    for iAttempt, sCodec in enumerate(lCodecs):

        try:

            return sOriginal.encode(sCodec)

        except (ValueError, UnicodeDecodeError, UnicodeEncodeError) as e:

            # only report once the final codec has also failed
            if iAttempt == len(lCodecs) - 1:

                warning('func verifyEncoding(): failed to transform sOriginal:', sOriginal, 'with utf-8, latin-1 and utf-16', e)

    return ''
コード例 #2
0
def verifyEncoding(sOriginal, bDecode=True, bEncode=False):
    """Encode or decode sOriginal, trying utf-8, then latin-1, then utf-16.

    bEncode takes precedence over bDecode when both are set; when neither
    flag is set the input is left untouched and '' is returned.  If all
    three codecs fail, a warning plus traceback is emitted and '' returned.
    """

    lCodecs = ('utf-8', 'latin-1', 'utf-16')

    for iAttempt, sCodec in enumerate(lCodecs):

        try:

            if bEncode:
                return sOriginal.encode(sCodec)
            elif bDecode:
                return sOriginal.decode(sCodec)

            # neither transform requested -- mirror the original fall-through
            return ''

        except (ValueError, UnicodeDecodeError, UnicodeEncodeError) as e:

            # only report once the final codec has also failed
            if iAttempt == len(lCodecs) - 1:

                warning(
                    'func verifyEncoding(): failed to transform sOriginal:',
                    sOriginal, 'with utf-8, latin-1 and utf-16', e)
                traceback.print_exc()

    return ''
コード例 #3
0
def changePerms(tTup):
    """Lock down a project's 'latest' tree: directories -> 555, files -> 444.

    tTup is (sProjectPath, dConfig); dConfig['debug'] toggles debug logging.
    Logs a warning and does nothing when <project>/latest is missing or is
    not a directory.
    """

    (sProjectPath, dConfig) = tTup

    sLatestDir = os.path.join(sProjectPath, 'latest')

    if not os.path.exists(sLatestDir):

        warning('func changePerms() latest does not exist under path:',
                sProjectPath, 'at', sLatestDir)
        return

    if not os.path.isdir(sLatestDir):

        warning(
            'func changePerms() latest exists but is not a directory under path:',
            sLatestDir)
        return

    # project-path/latest exists as a directory

    if dConfig['debug']:
        debug('func: changePerms()',
              'changing directory permissions on', sLatestDir)

    # previously shelled out to `find -exec chmod` via os.system, which was
    # vulnerable to shell injection through sLatestDir and relied on an
    # invalid '\;' string escape; walk the tree in-process instead
    for sRoot, lDirs, lFiles in os.walk(sLatestDir):

        # change directory permissions to 555 (covers sLatestDir itself);
        # r-x is retained so the walk can still descend
        os.chmod(sRoot, 0o555)

        # change file permissions to 444
        # NOTE(review): os.chmod follows symlinks, whereas `find -type f`
        # skipped them -- acceptable for a read-only lockdown
        for sFile in lFiles:
            os.chmod(os.path.join(sRoot, sFile), 0o444)

    if dConfig['debug']:
        debug('func: changePerms()', 'changing file permissions on',
              sLatestDir)
コード例 #4
0
def usage():
    """Emit command-line usage guidance for changeCorpusLatestPerms.py."""
    lUsageLines = (
        'Usage: changeCorpusLatestPerms.py --corpus-dir-path=/data/corpus --forks=5 --redis --debug',
        'Usage: Please note that above directory arguments are defaults if not supplied and both directories must exist on the filesystem.',
        'Usage: if mode is supplied, it must be either set to thread or process. thread is the default.',
    )
    for sLine in lUsageLines:
        warning(sLine)
コード例 #5
0
def findProjectFiles(dConfig):
    """Walk project trees popped from a redis queue and bulk-index their files.

    Pops project paths from the redis queue until the '**done**' sentinel is
    seen.  For every file it records index/type/crawl metadata plus the file
    name and project-relative path, flushing to elasticsearch whenever more
    than dConfig['es-bulk-chunk-size'] documents have accumulated, with a
    final flush at the end.

    Bug fixed: the original reused one shared dProject/dSource dict for every
    file, so each bulk chunk indexed N references to the same (last-written)
    document.  A fresh document dict is now built per file, matching the
    per-iteration dict construction used elsewhere in this module.
    """

    qRedis = RedisQueue(dConfig['redis-queue-name'],
                        namespace='queue',
                        host=dConfig['redis-loc'])
    oES = Elasticsearch(dConfig['es-instance-locs'])

    lIgnoreDirs = ['.git', '.svn']

    lProjectFiles = []

    sProjectPath = qRedis.get(block=True)

    # '**done**' is the producer's end-of-work sentinel
    while sProjectPath != '**done**':

        sProjectName = os.path.basename(sProjectPath)

        if dConfig['debug']:
            debug('func: findProjectFiles()', 'project-path:',
                  sProjectPath, sProjectName)

        for sRoot, lDirs, lFiles in os.walk(sProjectPath):

            if len(lProjectFiles) > dConfig['es-bulk-chunk-size']:

                # ingest chunk into elasticsearch
                helpers.bulk(oES, lProjectFiles)

                if dConfig['debug']:
                    debug('func: findProjectFiles()', str(len(lProjectFiles)),
                          'files loaded into elasticsearch')

                lProjectFiles = []

            for sFile in lFiles:

                sFilePath = os.path.join(sRoot, sFile)

                sRelPath = os.path.relpath(sFilePath, sProjectPath)

                # find the first codec that can decode the relative path;
                # utf-8 keeps the raw byte strings, the fallback codecs store
                # the decoded text (python 2 str.decode semantics)
                sFileValue = None
                sPathValue = None
                eLastError = None

                for sCodec in ('utf-8', 'latin-1', 'utf-16'):

                    try:
                        sRelPath.decode(sCodec)
                    except (ValueError, UnicodeDecodeError) as e:
                        eLastError = e
                        continue

                    if sCodec == 'utf-8':
                        # path is clean utf-8: keep the original strings
                        sFileValue = sFile
                        sPathValue = sRelPath
                    else:
                        sFileValue = sFile.decode(sCodec)
                        sPathValue = sRelPath.decode(sCodec)
                    break

                else:

                    # all three codecs failed -- skip this file
                    warning(
                        'func findProjectFiles():', 'sProjectPath:',
                        sProjectPath, 'sProjectName:',
                        sProjectName, 'sFile:', sFile,
                        'sRelPath:', sRelPath,
                        'utf-8, latin-1, and utf-16 decoding failed',
                        'exception:', eLastError)
                    continue

                # fresh document per file so queued chunks are independent
                lProjectFiles.append({
                    '_index': dConfig['es-index-name'],
                    '_type': dConfig['es-index-type'],
                    '_source': {
                        'crawl-time': dConfig['time-stamp'],
                        'project-path': sProjectPath,
                        'project-name': sProjectName,
                        'file': sFileValue,
                        'path': sPathValue,
                    },
                })

            # prune VCS metadata dirs from the walk in place
            lDirs[:] = [sDir for sDir in lDirs if sDir not in lIgnoreDirs]

        # get next project to process
        sProjectPath = qRedis.get(block=True)

    # index any remaining documents
    if len(lProjectFiles) > 0:

        # ingest chunk into elasticsearch
        helpers.bulk(oES, lProjectFiles)

        if dConfig['debug']:
            debug('func: findProjectFiles()', str(len(lProjectFiles)),
                  'files loaded into elasticsearch')
コード例 #6
0
def findProjectFiles(tTup):
    """Scan a project tree for *_code.tgz tarballs and bulk-index their members.

    tTup is (sProjectPath, oES, dConfig).  Each tar member that is a regular
    file (and not under .svn/.git) becomes one elasticsearch document with
    project path/name, crawl-time, file, path and lowercased extension.
    Documents are flushed in chunks of dConfig['es-bulk-chunk-size'] and once
    more at the end.

    NOTE(review): the oES received in tTup is immediately replaced by a fresh
    client below, so the tuple element is effectively unused.
    """

    (sProjectPath, oES, dConfig) = tTup
    sProjectName = os.path.basename(sProjectPath)

    # fresh client per call; the one passed in is discarded
    oES = Elasticsearch(dConfig['es-instance-locs'])

    lIgnoreDirs = ['.git', '.svn']

    lProjectFiles = []

    if dConfig['debug']:
        debug('func: findProjectFiles()', 'project-path:', sProjectPath,
              'project-name:', sProjectName)

    for sRoot, lDirs, lFiles in os.walk(sProjectPath):

        if len(lProjectFiles) > dConfig['es-bulk-chunk-size']:

            # ingest chunk into elasticsearch
            (iSuccess, lResponse) = helpers.bulk(client=oES,
                                                 actions=lProjectFiles,
                                                 timeout="20m",
                                                 request_timeout=120.)

            # report any documents the bulk api failed to index
            if iSuccess < dConfig['es-bulk-chunk-size']:
                warning('func: findProjectFiles() iSuccess:', iSuccess,
                        ' expected:', dConfig['es-bulk-chunk-size'])
                warning('func: findProjectFiles()', type(lResponse),
                        'returned by bulk api')
                warning('func: findProjectFiles()',
                        json.dumps(lResponse, indent=4),
                        'returned by bulk api')

            #del lProjectFiles[0 : len(lProjectFiles)]
            lProjectFiles = []

            if dConfig['debug']:
                # NOTE(review): this always logs 0 -- the list was cleared
                # on the line above; presumably meant to log the flushed count
                debug('func: findProjectFiles()', str(len(lProjectFiles)),
                      'files loaded into elasticsearch')

        for sFile in lFiles:

            # make sure dProject is emptied each loop iteration
            # NOTE(review): this outer dProject is never appended -- only the
            # per-tar-member dProject built inside the tarball branch is used
            dProject = {
                '_index': dConfig['es-index-name'],
                '_type': dConfig['es-index-type'],
                '_source': {
                    'project-path': sProjectPath,
                    'project-name': sProjectName,
                    'crawl-time': dConfig['time-stamp']
                }
            }

            sFilePath = os.path.join(sRoot, sFile)
            sRelPath = os.path.relpath(sFilePath, sProjectPath)

            sDecodedFile = ''
            sDecodedRelPath = ''
            sEncodedWith = ''

            # Look for the tar file with the src code
            if "_code.tgz" in sFilePath:
                # module-level progress counter shared across calls
                global counter
                counter = counter + 1
                print(str(counter) + ': working on: ' + sFilePath)

                # 'r:*' lets tarfile auto-detect the compression
                # NOTE(review): t is never closed -- file handle leaks
                t = tarfile.open(sFilePath, 'r:*')

                # Iterate over the files in the tar file gz
                for tarinfo in t:
                    if tarinfo.isfile():
                        filename = tarinfo.name
                        if (".svn" not in filename and ".git" not in filename):

                            # make sure dProject is emptied each loop iteration
                            dProject = {
                                '_index': dConfig['es-index-name'],
                                '_type': dConfig['es-index-type'],
                                '_source': {
                                    'project-path': sProjectPath,
                                    'project-name': sProjectName,
                                    'crawl-time': dConfig['time-stamp']
                                }
                            }

                            # append file in tar to tar path
                            sFile = os.path.join(sFilePath, filename)
                            sRelPath = os.path.relpath(sFile, sProjectPath)

                            sDecodedFile = ''
                            sDecodedRelPath = ''
                            sEncodedWith = ''
                            # decode cascade: utf-8, then latin-1, then utf-16
                            # (python 2 str.decode semantics)
                            try:

                                sDecodedFile = sFile.decode('utf-8')
                                sDecodedRelPath = sRelPath.decode('utf-8')
                                sEncodedWith = 'utf-8'

                            except (ValueError, UnicodeDecodeError) as e:

                                try:

                                    sDecodedFile = sFile.decode('latin-1')
                                    sDecodedRelPath = sRelPath.decode(
                                        'latin-1')
                                    sEncodedWith = 'latin-1'

                                except (ValueError, UnicodeDecodeError) as e:

                                    try:

                                        sDecodedFile = sFile.decode('utf-16')
                                        sDecodedRelPath = sRelPath.decode(
                                            'utf-16')
                                        sEncodedWith = 'utf-16'

                                    except (ValueError,
                                            UnicodeDecodeError) as e:

                                        warning(
                                            'func findProjectFiles():',
                                            'sProjectPath:',
                                            dProject['_source']
                                            ['project-path'], 'sProjectName:',
                                            dProject['_source']
                                            ['project-name'], 'sFile:', sFile,
                                            'sRelPath:', sRelPath,
                                            'utf-8, latin-1, and utf-16 decoding failed',
                                            'exception:', e)
                                        print("decode failed")
                                        sDecodedFile = ''
                                        sDecodedRelPath = ''
                                        sEncodedWith = ''

                            # only index members whose paths decoded cleanly
                            if sDecodedFile and sDecodedRelPath:
                                dProject['_source']['file'] = sDecodedFile
                                (_, sFileExt) = os.path.splitext(sDecodedFile)
                                if sFileExt:
                                    # store extension without the dot, lowercased
                                    dProject['_source']['ext'] = sFileExt[
                                        1:].lower()
                                dProject['_source']['path'] = sDecodedRelPath

                                if dConfig['debug']:
                                    debug('func: findProjectFiles() dProject:',
                                          dProject, 'encoded with',
                                          sEncodedWith)

                                lProjectFiles.append(dProject)

        # prune VCS metadata dirs from the walk in place
        lDirs[:] = [sDir for sDir in lDirs if sDir not in lIgnoreDirs]

    # ingest any stragglers remaining into elasticsearch
    (iSuccess, lResponse) = helpers.bulk(client=oES,
                                         actions=lProjectFiles,
                                         timeout="20m",
                                         request_timeout=120.)

    if iSuccess < len(lProjectFiles):
        warning('func: findProjectFiles() iSuccess:', iSuccess, ' expected:',
                len(lProjectFiles))
        warning('func: findProjectFiles()', type(lResponse),
                'returned by bulk api')
        warning('func: findProjectFiles()', json.dumps(lResponse, indent=4),
                'returned by bulk api')

    # del lProjectFiles[0 : len(lProjectFiles)]
    lProjectFiles = []
コード例 #7
0
def usage():
    """Emit the command-line usage hint for buildProjectsByType.py."""
    sUsage = 'Usage: buildProjectsByType.py --queue-projects="configure.ac" --os="ubuntu14" --rebuild --debug'
    warning(sUsage)
コード例 #8
0
def usage():
    """Emit the command-line usage hint for natesProjects.py."""
    sUsage = 'Usage: natesProjects.py --debug'
    warning(sUsage)
コード例 #9
0
def parseBuildOutput(dArgs, bDebug=False):
    """Collect build log files from dArgs['dirs']['output'] into a dict.

    Reads the global build logs plus one retcode.log.<i> per build target,
    attaches each target (with its returnCode) to the result, and sets
    dBuffer['returnCode'] to the worst (max) target return code as a string;
    '666' marks a missing/invalid return code.
    """

    dBuffer = {}

    dFiles = {
        'buildTime': 'runtime.log',
        'numObjectsPreBuild': 'numObjectsPreBuild.log',
        'numObjectsPostBuild': 'numObjectsPostBuild.log',
        'numObjectsGenerated': 'numObjectsGenerated.log',
        'numSources': 'numSources.log'
    }

    for sFileType, sFileName in dFiles.iteritems():

        sFileName = os.path.join(dArgs['dirs']['output'], sFileName)

        if not os.path.isfile(sFileName):
            # missing log -> empty entry, but note it for this project/container
            dBuffer[sFileType] = ''
            warning('func: parseBuildOutput() sFileType: ', sFileType,
                    ' missing for project:', dArgs['projectName'],
                    'container:', dArgs['containerName'])
            continue

        # get file input and trim unnecessary whitespace before/after
        with open(sFileName, 'r') as fBuilderFile:
            dBuffer[sFileType] = fBuilderFile.read().strip()

    dBuffer['targets'] = []
    lRetCodes = []

    dTargetSpecificFiles = {
        'returnCode': 'retcode.log'
    }

    for iCtr, dTarget in enumerate(dArgs['targets']):

        # per-target logs carry a numeric suffix: retcode.log.<iCtr>
        for sFileType, sFileName in dTargetSpecificFiles.iteritems():

            sFileName = os.path.join(dArgs['dirs']['output'],
                                     sFileName + '.' + str(iCtr))

            if os.path.isfile(sFileName):

                dTarget[sFileType] = ''

                # get file input and trim unnecessary whitespace before/after
                with open(sFileName, 'r') as fBuilderFile:
                    dTarget[sFileType] = fBuilderFile.read().strip()

        if 'returnCode' in dTarget and isInt(dTarget['returnCode']):

            lRetCodes.append(int(dTarget['returnCode'].strip()))

        else:

            # 666 is the sentinel for a missing/unparseable return code
            warning(
                'func: parseBuildOutput() invalid return code encountered:',
                json.dumps(dTarget, indent=4), 'project:',
                dArgs['projectName'], 'container:', dArgs['containerName'])
            dTarget['returnCode'] = 666
            lRetCodes.append(666)

        dBuffer['targets'].append(dTarget)

    # worst (highest) return code wins; '666' when no targets produced one
    dBuffer['returnCode'] = str(max(lRetCodes)) if lRetCodes else '666'

    if bDebug:
        debug('func: parseBuildOutput() dBuffer:', json.dumps(dBuffer,
                                                              indent=4))

    return dBuffer
コード例 #10
0
def processBuildTargets(tTup):
    """Worker loop: pop build targets from redis and drive container builds.

    tTup is (iContainerId, dArgs, dConfig).  Each queue entry is a JSON build
    target; for each one the worker prepares container directories, copies
    sources/scripts, starts the build, polls until completion, then parses,
    indexes and archives the results before removing the container.  Exits
    when the queue yields nothing within the timeout.  Any exception is
    logged with a traceback and re-raised.
    """

    try:

        (iContainerId, dArgs, dConfig) = tTup

        # dual queues -- primary for getting what project to build next, secondary to mark what is being built
        qRedis = RedisQueue(name=dConfig['redis-queue-to-build'],
                            name2=dConfig['redis-queue-building'],
                            namespace='queue',
                            host=dConfig['redis-loc'],
                            port=dConfig['redis-port'])

        # set of existing builds for this os container used to prune out projects already built with this container
        sExistingBuilds = RedisSet(name=dConfig['redis-already-built-nate'],
                                   namespace='set',
                                   host=dConfig['redis-loc'],
                                   port=dConfig['redis-port'])

        debug('func: processBuildTargets(), has ' + str(len(sExistingBuilds)) +
              ' built projects')
        iCtr = 0  # number of builds completed by this worker

        while 1:

            # atomically move the next target onto the 'building' queue
            sBuildTarget = qRedis.getnpush(block=True, timeout=30)
            #sBuildTarget = qRedis.peek()

            # debug(sBuildTarget)

            if sBuildTarget:

                if dConfig['debug']:
                    debug('func: processBuildTargets() sBuildTarget:',
                          sBuildTarget)

                dBuildTarget = json.loads(sBuildTarget)

                # initial setup
                #if 'projectName' not in dBuiltTarget: continue

                dArgs['projectName'] = dBuildTarget['projectName']

                # skip anything this container OS has already built
                if dArgs['projectName'] in sExistingBuilds:

                    warning('func: processBuildTargets() project:',
                            dArgs['projectName'],
                            ' already built... skipping...')
                    continue

                #sProjectPath = os.path.relpath(dBuildTarget['projectPath'], '/data/corpus')
                #sProjectPath = os.path.join('/nfsbuild/nfsbuild', sProjectPath)

                #dArgs['buildPath'] = sProjectPath
                dArgs['targets'] = dBuildTarget['targets']

                if dConfig['debug']:
                    debug('func: processBuildTargets() targets:',
                          json.dumps(dArgs['targets'], indent=4))

                # container name: <image>-<os>-<host>_<id>
                dArgs['containerId'] = str(iContainerId)
                dArgs[
                    'containerName'] = dConfig['containerImage'] + '-' + dArgs[
                        'containerOS'] + '-' + dConfig['hostname'] + '_' + str(
                            iContainerId)

                dArgs['dirs'] = {}
                dArgs['dirs']['root'] = os.path.join(dConfig['containerPath'],
                                                     dArgs['containerName'])

                for sDir in dArgs['containerDirs']:

                    dArgs['dirs'][sDir] = os.path.join(dArgs['dirs']['root'],
                                                       sDir)

                # /data/corpus on muse2 is mounted under /nfscorpus/nfscorpus on all 3 servers (via mount-bind on muse2 and NFS on muse1 and muse3)
                # NOTE(review): if projectPath contains neither '_8tof' nor
                # '_0to7', sProjectPath/sBuildPath are never assigned and the
                # lines below raise NameError -- confirm upstream guarantees
                # one of the two substrings is always present
                debug('projectPath: ', dBuildTarget['projectPath'])
                if "_8tof" in dBuildTarget['projectPath']:
                    sProjectPath = os.path.relpath(dBuildTarget['projectPath'],
                                                   '/data/corpus_8tof')
                    sBuildPath = os.path.join('/data/builder_SAN/outputCyber',
                                              sProjectPath)
                    sProjectPath = os.path.join('/data/corpus_8tof',
                                                sProjectPath)
                if "_0to7" in dBuildTarget['projectPath']:
                    sProjectPath = os.path.relpath(dBuildTarget['projectPath'],
                                                   '/data/corpus_0to7')
                    sBuildPath = os.path.join('/data/builder_SAN/outputCyber',
                                              sProjectPath)
                    sProjectPath = os.path.join('/data/corpus_0to7',
                                                sProjectPath)
                debug('projectPathDone: ', sProjectPath)

                dArgs['buildPath'] = sBuildPath
                '''
                # determine code root in project directory
                sCodePath = dBuildTarget['buildTargetPath']
                if sCodePath.startswith('./'):
                    sCodePath = dBuildTarget['buildTargetPath'][2:]
                sCodeRoot = sCodePath[:sCodePath.index(os.sep)] if os.sep in sCodePath else sCodePath
                '''
                # the project tarball is <uuid>_code.tgz where <uuid> is the
                # last path component of the project directory
                plist = sProjectPath.split('/')
                uuid = plist[len(plist) - 1]
                tar = uuid + ("_code.tgz")
                debug('tarball: ', tar)

                dArgs['projectPath'] = os.path.join(sProjectPath, tar)

                # add code root to project path
                #                if dBuildTarget['codeDir']:
                #                    print('none')
                #dArgs['projectPath'] = os.path.join(sProjectPath, dBuildTarget['codeDir'])

                #                else:

                #                   warning('func: processBuildTargets() encountered project:', dBuildTarget['projectName'], ' with empty or NULL codeDir which is not supported. Project build skipped...')
                #                    continue

                # timestamped artifact names for this build run
                sTimeStamp = datetime.datetime.now().strftime('%Y%m%dT%H%M%S')
                dArgs['jsonName'] = 'build-' + sTimeStamp + '.json'
                dArgs['tarName'] = dArgs[
                    'projectName'] + '-' + sTimeStamp + '.tgz'
                dArgs['version'] = dBuildTarget['version']

                # setup container
                makeDirs(dArgs=dArgs, bDebug=dConfig['debug'])
                copySource(dArgs=dArgs, bDebug=dConfig['debug'])
                copyScripts(dArgs=dArgs, bDebug=dConfig['debug'])
                createBuildPlanScript(dArgs=dArgs, bDebug=dConfig['debug'])
                recordProjectName(dArgs=dArgs, bDebug=dConfig['debug'])
                startBuild(dArgs=dArgs, bDebug=dConfig['debug'])

                # sleep until build completes
                while pollBuild(dArgs=dArgs, bDebug=dConfig['debug']):

                    if dConfig['debug']:
                        debug(
                            'func: processBuildTargets() build not completed... sleeping'
                        )
                    time.sleep(10)

                # get container logs
                getBuildLogs(dArgs=dArgs, bDebug=dConfig['debug'])

                # get build output
                dBuffer = parseBuildOutput(dArgs=dArgs,
                                           bDebug=dConfig['debug'])

                # index build output
                postBuildStatusUpdates(dArgs=dArgs,
                                       dBuffer=dBuffer,
                                       dConfig=dConfig)

                # archive build artifacts
                tarUpContainerDirs(dArgs=dArgs, bDebug=dConfig['debug'])

                # remove container
                removeContainer(dArgs=dArgs, bDebug=dConfig['debug'])

                # remove project from "building" queue
                # qRedis.done(value=sBuildTarget)

                iCtr += 1

                # in debug mode, stop after a single build for inspection
                if dConfig['debug'] and iCtr >= 1:

                    break

            else:

                break

        if dConfig['debug']:

            debug(
                'func: processBuildTargets() sBuildTarget is either empty or none, likely since the redis queue is empty'
            )
            debug('func: processBuildTargets() redis queue size:',
                  qRedis.size())
            debug('func: processBuildTargets() exiting...')

    except Exception as e:

        warning('Caught exception in worker thread:', iContainerId)
        traceback.print_exc()
        raise e
コード例 #11
0
def usage():
    """Emit the command-line usage hint for createJsonReports.py."""
    sUsage = 'Usage: createJsonReports.py --forks=5 --debug'
    warning(sUsage)
コード例 #12
0
def indexSourceTargets(dConfig):
    """Scroll source-target hits out of elasticsearch and load them into mysql.

    Pops es query payloads (JSON strings) off the redis source-targets queue
    until the queue times out.  Each query is run as a scan/scroll search
    against the file index; every hit becomes a row of per-source-type flags
    (plus project name/path and the encoding-verified build target path) and
    rows are bulk-inserted into mysql in chunks of
    dConfig['mysql-bulk-statement-size'], with a flush after each scroll batch.

    Bug fixed: the progress message referenced iCtr, which was never defined
    (NameError on the first bulk flush); iCtr is now a real cumulative counter
    of rows handed to mysql.
    """

    # setup mysql client
    dMp = MuseProjectDB(db=dConfig['mysql-db'],port=dConfig['mysql-port'],user=dConfig['mysql-user'],passwd=dConfig['mysql-passwd'],loc=dConfig['mysql-loc'])
    dMp.open()

    # setup elasticsearch client
    oES = Elasticsearch(dConfig['es-instance-locs'])

    # setup source targets queue
    qRedis = RedisQueue(dConfig['redis-queue-source-targets'], namespace='queue', host=dConfig['redis-loc'], port=dConfig['redis-port'])

    # cumulative count of rows handed to mysql (for progress reporting)
    iCtr = 0

    while 1:

        sQuery = qRedis.get(block=True, timeout=30)

        if sQuery:

            dQuery = json.loads(sQuery)

            if dConfig['debug']: debug( 'func: indexSourceTargets() dQuery:', json.dumps(dQuery) ) 

            lSourceFiles = []

            # scroll time set to 10 minutes, change as needed -- required for consistent results, the scroll token expires at the end of scroll time

            dResponse = oES.search(index=dConfig['es-file-index-name'], doc_type=dConfig['es-file-index-type'], body=json.dumps(dQuery), search_type='scan', scroll='20m', timeout='20m', lowercase_expanded_terms=False)
            sScrollId = dResponse['_scroll_id']

            if dConfig['debug']: debug('func: indexSourceTargets() (after initial search) dResponse: ', dResponse)

            if dConfig['debug']: debug('func: indexSourceTargets() search hits: ', dResponse['hits']['total'])

            # defensive key checks -- scroll responses can omit fields on error
            while 'timed_out' in dResponse and not dResponse['timed_out'] and 'hits' in dResponse and 'total' in dResponse['hits'] and dResponse['hits']['total'] > 0:

                dResponse = oES.scroll(scroll_id=sScrollId, scroll='20m')

                sScrollId = dResponse['_scroll_id']

                if ('hits' in dResponse['hits']) and (len(dResponse['hits']['hits']) > 0):

                    if dConfig['debug']: debug('func: indexSourceTargets() scroll_id:', sScrollId, 'number of hits:', len(dResponse['hits']['hits']) )

                    for dHit in dResponse['hits']['hits']:

                        # found matches

                        try:

                            if '_source' in dHit:

                                # debug('func: indexSourceTargets() dHit:', json.dumps(dHit['_source']) )
                                # strip the leading path from found build targets,
                                # keeping only the basename
                                mBuildTarget = dHit['_source']['file']
                                mBuildTarget = mBuildTarget.split('/')
                                dHit['_source']['file'] = mBuildTarget[len(mBuildTarget)-1]

                                dProjectFound = {}

                                # NOTE(review): getSourceTypes() is called per hit;
                                # if its result is invariant it could be hoisted
                                lSourceTypes = dMp.getSourceTypes()
                                for sSourceType in lSourceTypes:

                                    dProjectFound[sSourceType] = False

                                if 'file' in dHit['_source'] and dHit['_source']['file']:

                                    (sFileName, sFileExt) = os.path.splitext(dHit['_source']['file']) 

                                    # flag the source type matching this extension
                                    if sFileExt.lower() in dConfig['source-targets'].keys():

                                        dProjectFound[ dConfig['source-targets'][ sFileExt.lower() ] ] = True

                                else: 

                                    warning( 'func indexSourceTargets() es returned an improper source target:', json.dumps(dHit['_source']) )
                                    continue

                                if 'project-name' in dHit['_source'] and dHit['_source']['project-name']: dProjectFound['projectName'] = dHit['_source']['project-name']
                                if 'project-path' in dHit['_source'] and dHit['_source']['project-path']: dProjectFound['projectPath'] = dHit['_source']['project-path']
                                if 'path' in dHit['_source'] and dHit['_source']['path']: 

                                    dProjectFound['buildTargetPath'] = verifyEncoding( dHit['_source']['path'] )

                                # debug('func findSourceFileHelper()', json.dumps(dProjectFound))

                                lSourceFiles.append(dProjectFound)

                                # flush in chunks to keep statements bounded
                                # (large single inserts were causing es reads to time out)
                                if (len(lSourceFiles) > dConfig['mysql-bulk-statement-size']) and dConfig['mysql']:

                                    dMp.insertIntoSourceTargets(lTargets=lSourceFiles, bDebug=dConfig['debug'])
                                    iCtr += len(lSourceFiles)
                                    printMsg('func indexSourceTargets() loaded', iCtr, 'source targets')

                                    lSourceFiles = []

                        except (UnicodeDecodeError, UnicodeEncodeError) as e:
                            
                            warning('func indexSourceTargets() encountered exception:', e)
                            #warning('func indexSourceTargets() with string: ', dHit['_source']['path'])
                            warning('func indexSourceTargets() full _source payload: ', json.dumps( dHit['_source'], indent=4 ) )

                else:

                    break

                # flush any remainder from this scroll batch
                if (len(lSourceFiles) > 0) and dConfig['mysql']:

                    dMp.insertIntoSourceTargets(lTargets=lSourceFiles, bDebug=dConfig['debug'])
                    iCtr += len(lSourceFiles)

                    lSourceFiles = []

        else:

            break

    dMp.close()
コード例 #13
0
def usage():
    """Emit the command-line usage hint for labelProjectsByBuildType.py."""
    sUsage = 'Usage: labelProjectsByBuildType.py --forks=5 --debug'
    warning(sUsage)
コード例 #14
0
def usage():
    """Emit the command-line usage hint for testLocks.py."""
    sUsage = 'Usage: testLocks.py'
    warning(sUsage)
コード例 #15
0
def findBuildTargets(dConfig):
    """Scan the elasticsearch file index for build-system files (configure.ac,
    configure.in, configure, CMakeLists.txt, Makefile) and bulk-insert a row
    per hit into the MySQL build-targets table.

    dConfig -- configuration dict; reads the es-*, mysql-*, redis-*, debug,
               build-targets and mysql-bulk-statement-size keys.

    NOTE(review): all log messages here say "findBuildFiles()" -- presumably a
    leftover from a rename; confirm before grepping logs by function name.
    """

    # setup mysql client
    dMp = MuseProjectDB(db=dConfig['mysql-db'],port=dConfig['mysql-port'],user=dConfig['mysql-user'],passwd=dConfig['mysql-passwd'],loc=dConfig['mysql-loc'])
    dMp.open()

    # setup elasticsearch client
    oES = Elasticsearch(dConfig['es-instance-locs'],timeout=180, max_retries=3, retry_on_timeout=True )

    # purge build targets queue -- considering if we need to split mysql ingestion from elasticsearch queries... mysql may benefit from consumer pool inserting statements concurrently
    # qRedis = RedisQueue(dConfig['redis-queue-build-targets'], namespace='queue', host=dConfig['redis-loc'], port=dConfig['redis-port'])

    # accumulates per-hit dicts between bulk inserts
    lBuildFiles = []

    # total hits processed; also drives the debug-mode early exit below
    iCtr = 0

    # ES bool query: (file name matches a known build file) AND (path under
    # latest/ or content/) AND (file under /data/corpus_8tof/)
    dQuery = {
        "query": {
            "bool": {
                "must": [
                    { "bool": {
                        "should": [
                            { "wildcard": { "file.raw": "*/configure.ac" } },
                            { "wildcard": { "file.raw": "*/configure.in" } },
                            { "wildcard": { "file.raw": "*/configure" } },
                            { "wildcard": { "file.raw": "*/CMakeLists.txt" } },
                            { "wildcard": { "file.raw": "*/Makefile" } }
                        ]
                      }
                    },
                    {
                      "bool": {
                        "should": [
                          { "match": { "path": "latest/*" } },
                          { "match": { "path": "content/*"} }
                        ]
                      }
                    },
                    {"wildcard":{"file.raw": "/data/corpus_8tof/*"}}
                ]
            }
        }
    }

    if dConfig['debug']: debug( 'func: findBuildFiles() dQuery:', json.dumps(dQuery) )

    # scroll time set to 10 minutes, change as needed -- required for consistent results, the scroll token expires at the end of scroll time

    # initial scan-type search only establishes the scroll context; hits are
    # fetched by the scroll() calls in the loop below
    dResponse = oES.search(index=dConfig['es-file-index-name'], doc_type=dConfig['es-file-index-type'], body=json.dumps(dQuery), search_type='scan', scroll='20m', timeout='20m', lowercase_expanded_terms=False, request_timeout=180,)
    sScrollId = dResponse['_scroll_id']

    if dConfig['debug']: debug('func: findBuildFiles() (after initial search) dResponse: ', dResponse)

    if dConfig['debug']: debug('func: findBuildFiles() search hits: ', dResponse['hits']['total'])
    debug('func: findBuildFiles() search hits: ', dResponse['hits']['total'])

    # keep scrolling while the previous batch neither timed out nor came back
    # empty; the loop also ends via the break when a scroll returns no hits
    while 'timed_out' in dResponse and not dResponse['timed_out'] and 'hits' in dResponse and 'total' in dResponse['hits'] and dResponse['hits']['total'] > 0:

        dResponse = oES.scroll(scroll_id=sScrollId, scroll='20m')

        sScrollId = dResponse['_scroll_id']

        if ('hits' in dResponse['hits']) and (len(dResponse['hits']['hits']) > 0):

            if dConfig['debug']: debug('func: findBuildFiles() scroll_id:', sScrollId, 'number of hits:', len(dResponse['hits']['hits']) )

            # debug mode: stop fetching after ~10 hits (this break only exits
            # the batch; the while-condition keeps scrolling -- intentional?)
            if dConfig['debug'] and iCtr > 10: break

            for dHit in dResponse['hits']['hits']:

                iCtr += 1

                if dConfig['debug'] and iCtr > 10: break

                # found matches

                try:

                    if '_source' in dHit:

                        # NATE added, remove leading path from found built targets:
                        # keep only the basename of the matched file
                        mBuildTarget=dHit['_source']['file'];
                        mBuildTarget=mBuildTarget.split('/')
                        dHit['_source']['file'] = mBuildTarget[len(mBuildTarget)-1]

                        dProjectFound = {}

                        # initialize all build target types to false
                        lBuildTypes = dMp.getBuildTypes()
                        for sBuildType in lBuildTypes:

                            dProjectFound[sBuildType] = False

                        # mark relevant build target type true
                        if 'file' in dHit['_source'] and dHit['_source']['file'] and dHit['_source']['file'] in dConfig['build-targets'].keys():

                            if dConfig['debug']: debug('func findBuildFiles() returned build target:', dHit['_source']['file'])

                            dProjectFound[ dConfig['build-targets'][ dHit['_source']['file'] ]['type'] ] = True
                            dProjectFound['ranking'] = dConfig['build-targets'][ dHit['_source']['file'] ]['ranking']

                        else:

                            # hit's file is not a recognized build target; skip it
                            warning( 'func findBuildFiles() es returned an improper build target:', json.dumps(dHit['_source']) )
                            continue

                        if 'project-name' in dHit['_source'] and dHit['_source']['project-name']: dProjectFound['projectName'] = dHit['_source']['project-name']
                        if 'project-path' in dHit['_source'] and dHit['_source']['project-path']: dProjectFound['projectPath'] = dHit['_source']['project-path']
                        if 'path' in dHit['_source'] and dHit['_source']['path']:

                            # verifyEncoding / depth are file-level helpers (see HEAD)
                            dProjectFound['buildTargetPath'] = verifyEncoding( dHit['_source']['path'] )
                            dProjectFound['depth'] = depth( dProjectFound['buildTargetPath'] )

                        # debug('func findBuildFiles()', json.dumps(dProjectFound))

                        lBuildFiles.append(dProjectFound)

                        # flush periodically: buffering too many rows was causing es reads to time out

                        if (len(lBuildFiles) > dConfig['mysql-bulk-statement-size']) and dConfig['mysql']:

                            dMp.insertIntoBuildTargets(lTargets=lBuildFiles, bDebug=dConfig['debug'])
                            printMsg('func findBuildFiles() loaded', iCtr, 'build targets')

                            lBuildFiles = []

                except (UnicodeDecodeError, UnicodeEncodeError) as e:

                    # log the offending document and continue with the next hit
                    warning('func findBuildFiles() encountered exception:', e)
                    #warning('func findBuildFiles() with string: ', dHit['_source']['path'])
                    warning('func findBuildFiles() full _source payload: ', json.dumps( dHit['_source'], indent=4 ) )

        else:

            # empty scroll batch: all hits consumed
            break

        # flush any remainder accumulated during this scroll batch
        if (len(lBuildFiles) > 0) and dConfig['mysql']:

            dMp.insertIntoBuildTargets(lTargets=lBuildFiles, bDebug=dConfig['debug'])

            lBuildFiles = []

    dMp.close()
コード例 #16
0
def queueUpSourceTargets(dConfig):
    """Fold per-target rows from MySQL into per-project work items and queue
    them as JSON onto the redis 'to-build' queue.

    Rows are selected ordered by projectName, so all targets of a project
    arrive consecutively; one target is kept per unique leading path.  The
    source table depends on dConfig['unBuiltProjectsOnly'] and
    dConfig['queueSite'].  No-op unless both dConfig['mysql'] and
    dConfig['redis'] are truthy.
    """

    if dConfig['mysql'] and dConfig['redis']:

        dMp = MuseProjectDB(db=dConfig['mysql-db'],port=dConfig['mysql-port'],user=dConfig['mysql-user'],passwd=dConfig['mysql-passwd'],loc=dConfig['mysql-loc'])

        # setup to-build queue
        qRedis = RedisQueue(dConfig['redis-queue-to-build'], namespace='queue', host=dConfig['redis-loc'], port=dConfig['redis-port'])

        dMp.open()

        # get projects first to iterate through (makes it easier to build project specific dictionaries), limit if in debug mode
        iProjectCount = 0
        iTargetCount = 0
        iMultiTargets = 0

        sLimitClause = ''

        if dConfig['debug']: sLimitClause = '10'

        lLeadingPaths = []

        dProject = {}

        # projectName -> codeDir lookup, used when the first project is started
        dCodeDirLookup = {}
        lProjectRows = dMp.select(sSelectClause='projectName,codeDir', sTable='availableProjects', bDebug=dConfig['debug'])
        for tProjectRow in lProjectRows:

            (sProjectName, sCodeDir) = tProjectRow
            dCodeDirLookup[sProjectName] = sCodeDir

        lTargetRows = []

        if dConfig['unBuiltProjectsOnly']:

            if dConfig['queueSite']:

                lTargetRows = dMp.select(sSelectClause='projectName,projectPath,buildTargetPath', sTable='unBuiltSourceTargetsWithSite', sWhereClause='site=\'' + dConfig['queueSite'] + '\'', sOrderByClause='projectName', sLimitClause=sLimitClause, bDebug=dConfig['debug'])

            else:

                lTargetRows = dMp.select(sSelectClause='projectName,projectPath,buildTargetPath', sTable='unBuiltSourceTargets', sOrderByClause='projectName', sLimitClause=sLimitClause, bDebug=dConfig['debug'])

        else:

            if dConfig['queueSite']:

                lTargetRows = dMp.select(sSelectClause='projectName,projectPath,buildTargetPath', sTable='availableSourceTargetsWithSite', sWhereClause='site=\'' + dConfig['queueSite'] + '\'', sOrderByClause='projectName', sLimitClause=sLimitClause, bDebug=dConfig['debug'])

            else:

                lTargetRows = dMp.select(sSelectClause='projectName,projectPath,buildTargetPath', sTable='availableSourceTargets', sOrderByClause='projectName', sLimitClause=sLimitClause, bDebug=dConfig['debug'])

        dMp.close()

        # BUGFIX: codedir2 was referenced when switching projects but only
        # assigned when a previous target path had a directory component, so
        # it could be unbound (NameError) -- initialize it up front.  NOTE
        # (review): it also carries over between projects when a path has no
        # directory component; confirm that is intentional.
        codedir2 = ''

        for tTargetRow in lTargetRows:

            dTarget = {}

            (sProjectName, sProjectPath, dTarget['buildTargetPath'], ) = tTargetRow

            (_, sFileExt) = os.path.splitext( os.path.basename(dTarget['buildTargetPath']) )

            if sFileExt:

                sFileExt = sFileExt.lower()

                if sFileExt in dConfig['source-targets'].keys():

                    dTarget['buildType'] = dConfig['source-targets'][sFileExt]

                    (sLeadingPath, sTarget) = os.path.split(dTarget['buildTargetPath'])

                    # NATE remove leading tarball from path
                    sLeadingPath = re.sub(r'[a-zA-Z_0-9-_]*.tgz/', "", sLeadingPath)
                    dTarget['buildTargetPath'] = os.path.join(sLeadingPath, sTarget)

                    # NATE added to grab code directory from buildTargetPath
                    bPath=sLeadingPath.split('/')
                    if len(bPath) > 1 :
                        codedir2=bPath[0]

                    iTargetCount += 1

                    if 'projectName' in dProject :

                        if dProject['projectName'] != sProjectName:

                            # new project encountered, push old project onto queue
                            if dConfig['debug']: debug('func: queueUpSourceTargets() queuing project:', json.dumps(dProject, indent=4))
                            qRedis.put(json.dumps(dProject))
                            iProjectCount += 1
                            if len(lLeadingPaths) > 1:
                                iMultiTargets += 1

                            dProject = {
                                'projectName': sProjectName,
                                'projectPath': sProjectPath,
                                'version': dConfig['version'],
                                'targets': [ dTarget ],
                                'codeDir': codedir2
                                #'codeDir': dCodeDirLookup[sProjectName]
                            }

                            lLeadingPaths = [ sLeadingPath ]

                        else:

                            # same project: keep only one target per unique leading path
                            if sLeadingPath not in lLeadingPaths:

                                dProject['targets'].append(dTarget)
                                lLeadingPaths.append(sLeadingPath)

                            else:

                                iTargetCount += -1
                                if dConfig['debug']: debug('func: queueUpSourceTargets() already encountered path:',  sLeadingPath, 'not adding:', json.dumps(dTarget, indent=4))

                    else:

                        # first project seen -- codeDir comes from the availableProjects
                        # lookup here but from the derived codedir2 for later projects;
                        # NOTE(review): confirm this asymmetry is intentional
                        dProject = {
                            'projectName': sProjectName,
                            'projectPath': sProjectPath,
                            'version': dConfig['version'],
                            'targets': [ dTarget ],
                            'codeDir': dCodeDirLookup[sProjectName]
                        }

                        lLeadingPaths = [ sLeadingPath ]

                else:

                    warning('func: queueUpSourceTargets() unknown C/C++ file extension encountered:', sFileExt, 'file-path:',dTarget['buildTargetPath'],'for project:', sProjectName)

            else:

                warning('func: queueUpSourceTargets() missing file extension encountered file-path:') #,dTarget['buildTargetPath'],'for project:', sProjectName)

        # BUGFIX: only queue the trailing project when one was actually built;
        # previously an empty '{}' work item was queued when no rows matched
        if dProject:

            if dConfig['debug']: debug('func: queueUpSourceTargets() queuing project:', json.dumps(dProject, indent=4))

            qRedis.put(json.dumps(dProject))
            iProjectCount += 1
            if len(lLeadingPaths) > 1:
                iMultiTargets += 1

        printMsg('func: queueUpSourceTargets()', str(iProjectCount), 'projects queued', str(iTargetCount), 'targets queued', str(iMultiTargets), 'multi-target projects queued')
        printMsg('func: queueUpSourceTargets()', qRedis.size(), 'projects reported by redis')
コード例 #17
0
def processProjects(dConfig):
    """Drain project root paths from the redis project-paths queue, build one
    project record per path from its filter.json / index.json metadata and
    bulk-insert the records into MySQL.

    Returns the list of project records not flushed to MySQL (always [] when
    dConfig['mysql'] is truthy).
    """

    qRedis = RedisQueue(dConfig['redis-queue-project-paths'], namespace='queue', host=dConfig['redis-loc'], port=dConfig['redis-port'])

    dMp = MuseProjectDB(db=dConfig['mysql-db'],port=dConfig['mysql-port'],user=dConfig['mysql-user'],passwd=dConfig['mysql-passwd'],loc=dConfig['mysql-loc'])
    dMp.open()

    lProjects = []

    iCount = 0

    while 1:

        # blocking pop with timeout; a falsy result means the queue is drained
        sRoot = qRedis.get(block=True, timeout=30)

        if sRoot:

            dProject = {
                '_index': dConfig['es-project-index-name'],
                '_type': dConfig['es-project-index-type'],
                '_source': {}
            }

            # project id / name are the basename of the queued root path
            dProject['_id'] = os.path.basename(sRoot)
            dProject['_source']['name'] = os.path.basename(sRoot)

            # BUGFIX: one debug() call was emitted unconditionally here; keep
            # all debug output behind the debug flag like the rest of the file
            if dConfig['debug']:

                debug('func: processProjects() projects-root:', sRoot)
                debug('func: processProjects() projects _id and _source[name] :', dProject['_id'])
                debug('func: processProjects() inserting project:', dProject['_source']['name'])

            if os.path.isfile( os.path.join(sRoot, 'filter.json') ):

                with open( os.path.join(sRoot, 'filter.json') ) as fProjectFilter:

                    dProjectFilter = json.load(fProjectFilter)

                    # any value other than the literal string 'none' means bytecode exists
                    if 'hasBytecode' in dProjectFilter and dProjectFilter['hasBytecode'].lower() != 'none':
                        dProject['_source']['bytecode_available'] = True

            if os.path.isfile( os.path.join(sRoot, 'index.json') ):

                with open( os.path.join(sRoot, 'index.json') ) as fProjectIndex:

                    dProjectIndex = json.load(fProjectIndex)

                    # BUGFIX: dict keys view is not JSON-serializable on Python 3;
                    # list(...) works on both Python 2 and 3
                    if dConfig['debug']: debug('func: processProjects() dProjectIndex.keys():', json.dumps(list(dProjectIndex.keys()), indent=4) )

                    if 'code' in dProjectIndex:

                        dProject['_source']['source'] = True
                        dProject['_source']['codeDir'] = dProjectIndex['code']

                        # normalize a leading './' off the code directory
                        if dProject['_source']['codeDir'].startswith('./'):

                            dProject['_source']['codeDir'] = dProject['_source']['codeDir'][len('./'):]

                    if 'site' in dProjectIndex:

                        dProject['_source']['site'] = dProjectIndex['site']

                    if 'crawler_metadata' in dProjectIndex:

                        for sMetaDataFile in dProjectIndex['crawler_metadata']:

                            if 'languages.json' in sMetaDataFile:

                                sLanguageFile = os.path.join(sRoot, sMetaDataFile)

                                if os.path.isfile(sLanguageFile):

                                    with open(sLanguageFile) as fLanguageFile:

                                        dLanguageFile = json.load(fLanguageFile)

                                        # copy per-language stats when present
                                        if 'C' in dLanguageFile:

                                            dProject['_source']['c'] = dLanguageFile['C']

                                        if 'C++' in dLanguageFile:

                                            dProject['_source']['cpp'] = dLanguageFile['C++']

                                        if 'C#' in dLanguageFile:

                                            dProject['_source']['csharp'] = dLanguageFile['C#']

                                        if 'Java' in dLanguageFile:

                                            dProject['_source']['java'] = dLanguageFile['Java']

                                        if dConfig['debug']: debug('func: findProjects() dLanguageFile:', json.dumps(dLanguageFile, indent=4) )
                                else:

                                    warning('func: processProjects()', 'languages.json file listed in index.json but does not exist for project:', dProject['_source']['name'], 'at listed location:', sLanguageFile)

            else:

                warning('func: processProjects()', 'index.json not found for project:', dProject['_source']['name'])

            lProjects.append(dProject)

            iCount += 1

            # BUGFIX: the periodic bulk flush now honors dConfig['mysql'],
            # matching the final flush below (it previously inserted regardless)
            if dConfig['mysql'] and (iCount % dConfig['mysql-bulk-statement-size']) == 0:

                dMp.insertIntoProjects(lProjects=lProjects, bDebug=dConfig['debug'])
                lProjects = []

            # debug mode: cap the run at 100 projects
            if dConfig['debug'] and iCount >= 100: break

        else:

            break

    if dConfig['mysql'] and len(lProjects) > 0:

        dMp.insertIntoProjects(lProjects=lProjects, bDebug=dConfig['debug'])
        lProjects = []

    # BUGFIX: always close the connection opened above (it was leaked when
    # dConfig['mysql'] was falsy)
    dMp.close()

    return lProjects
コード例 #18
0
def usage():
    """Emit the command-line usage string for identifyMultipleBuildTypeProjects.py."""
    sUsageMsg = 'Usage: identifyMultipleBuildTypeProjects.py'
    warning(sUsageMsg)
コード例 #19
0
def usage():
    warning('Usage: queueProjectsToBuildByType.py --corpus-dir-path=/data/corpus_0to7 --forks=5 --analyze-projects --crawl-projects --unbuilt-projects-only --queue-projects --debug')