def verifyEncoding(sOriginal):
    """Encode sOriginal, trying utf-8, then latin-1, then utf-16.

    Returns the encoded result, or '' when every codec fails (after
    logging a warning with the last exception).
    """
    sLastError = None
    # Try each codec in order of preference; the first success wins.
    for sCodec in ('utf-8', 'latin-1', 'utf-16'):
        try:
            return sOriginal.encode(sCodec)
        except (ValueError, UnicodeDecodeError, UnicodeEncodeError) as e:
            sLastError = e
    # All three codecs failed -- report the last error and fall back to ''.
    warning('func verifyEncoding(): failed to transform sOriginal:', sOriginal,
            'with utf-8, latin-1 and utf-16', sLastError)
    return ''
def verifyEncoding(sOriginal, bDecode=True, bEncode=False):
    """Decode (default) or encode sOriginal via utf-8, latin-1, then utf-16.

    bEncode takes precedence over bDecode; when both are False the input is
    left untouched and '' is returned.  On total failure a warning is logged,
    a traceback printed, and '' is returned.
    """
    sResult = ''
    lCodecs = ['utf-8', 'latin-1', 'utf-16']
    for iIndex, sCodec in enumerate(lCodecs):
        try:
            if bEncode:
                sResult = sOriginal.encode(sCodec)
            elif bDecode:
                sResult = sOriginal.decode(sCodec)
            break
        except (ValueError, UnicodeDecodeError, UnicodeEncodeError) as e:
            # only report once the final fallback codec has also failed
            if iIndex == len(lCodecs) - 1:
                warning(
                    'func verifyEncoding(): failed to transform sOriginal:',
                    sOriginal, 'with utf-8, latin-1 and utf-16', e)
                traceback.print_exc()
                sResult = ''
    return sResult
def changePerms(tTup):
    """Make <project-path>/latest read-only.

    tTup is (sProjectPath, dConfig).  Every directory under latest/ (and
    latest/ itself) is set to mode 555 (r-xr-xr-x) and every regular file to
    444 (r--r--r--).  Logs a warning when latest/ is missing or is not a
    directory.

    BUG FIX: the original shelled out via os.system('find ' + sLatestDir +
    " -type d -exec chmod 555 '{}' \\;"), which is shell-injectable through
    sLatestDir, depends on an external `find` binary, and used the invalid
    string escape '\\;'.  os.walk + os.chmod does the same job safely.
    """
    (sProjectPath, dConfig) = tTup
    sLatestDir = os.path.join(sProjectPath, 'latest')
    if not os.path.exists(sLatestDir):
        warning('func changePerms() latest does not exist under path:',
                sProjectPath, 'at', sLatestDir)
        return
    if not os.path.isdir(sLatestDir):
        warning('func changePerms() latest exists but is not a directory under path:',
                sLatestDir)
        return
    if dConfig['debug']:
        debug('func: changePerms()', 'changing directory permissions on', sLatestDir)
    # change directory permissions to 555 (read + traverse only); 555 still
    # allows os.walk to descend, so ordering is not a concern
    os.chmod(sLatestDir, 0o555)
    for sRoot, lDirs, _ in os.walk(sLatestDir):
        for sDir in lDirs:
            os.chmod(os.path.join(sRoot, sDir), 0o555)
    if dConfig['debug']:
        debug('func: changePerms()', 'changing file permissions on', sLatestDir)
    # change file permissions to 444 (read only)
    for sRoot, _, lFiles in os.walk(sLatestDir):
        for sFile in lFiles:
            os.chmod(os.path.join(sRoot, sFile), 0o444)
def usage():
    """Print command-line usage for changeCorpusLatestPerms.py."""
    lMessages = [
        'Usage: changeCorpusLatestPerms.py --corpus-dir-path=/data/corpus --forks=5 --redis --debug',
        'Usage: Please note that above directory arguments are defaults if not supplied and both directories must exist on the filesystem.',
        'Usage: if mode is supplied, it must be either set to thread or process. thread is the default.',
    ]
    # one warning() call per line, exactly as before
    for sMessage in lMessages:
        warning(sMessage)
def findProjectFiles(dConfig):
    """Walk each project pulled from redis and bulk-index its files into elasticsearch.

    Project paths are read from the redis queue dConfig['redis-queue-name']
    until the '**done**' sentinel is seen.  For every file under a project a
    document carrying project name/path, crawl time, file name and relative
    path is accumulated, and documents are flushed to elasticsearch in chunks
    of dConfig['es-bulk-chunk-size'].

    NOTE(review): the .decode() fallback chain relies on Python 2 byte-string
    semantics -- confirm before running under Python 3.
    """
    qRedis = RedisQueue(dConfig['redis-queue-name'], namespace='queue',
                        host=dConfig['redis-loc'])
    oES = Elasticsearch(dConfig['es-instance-locs'])
    lIgnoreDirs = ['.git', '.svn']
    lProjectFiles = []
    sProjectPath = qRedis.get(block=True)
    # if project path is '**done**', then break
    while sProjectPath != '**done**':
        sProjectName = os.path.basename(sProjectPath)
        if dConfig['debug']:
            debug('func: findProjectFiles()', 'project-path:', sProjectPath,
                  sProjectName)
        for sRoot, lDirs, lFiles in os.walk(sProjectPath):
            if len(lProjectFiles) > dConfig['es-bulk-chunk-size']:
                # ingest chunk into elasticsearch
                helpers.bulk(oES, lProjectFiles)
                if dConfig['debug']:
                    debug('func: findProjectFiles()', str(len(lProjectFiles)),
                          'files loaded into elasticsearch')
                lProjectFiles = []
            for sFile in lFiles:
                sFilePath = os.path.join(sRoot, sFile)
                sRelPath = os.path.relpath(sFilePath, sProjectPath)
                # find a codec that can decode the relative path; skip the
                # file entirely when utf-8, latin-1 and utf-16 all fail
                sFileOut = None
                sPathOut = None
                try:
                    sRelPath.decode('utf-8')
                    # valid utf-8: keep the raw strings, as the original did
                    sFileOut = sFile
                    sPathOut = sRelPath
                except (ValueError, UnicodeDecodeError):
                    try:
                        sFileOut = sFile.decode('latin-1')
                        sPathOut = sRelPath.decode('latin-1')
                    except (ValueError, UnicodeDecodeError):
                        try:
                            sFileOut = sFile.decode('utf-16')
                            sPathOut = sRelPath.decode('utf-16')
                        except (ValueError, UnicodeDecodeError) as e:
                            warning('func findProjectFiles():', 'sProjectPath:',
                                    sProjectPath, 'sProjectName:', sProjectName,
                                    'sFile:', sFile, 'sRelPath:', sRelPath,
                                    'utf-8, latin-1, and utf-16 decoding failed',
                                    'exception:', e)
                if sPathOut is not None:
                    # BUG FIX: the original reused one shared dProject/dSource
                    # dict for every file, so each append aliased the same
                    # object and the bulk payload ended up as N copies of the
                    # last file.  Build a fresh document per file instead.
                    lProjectFiles.append({
                        '_index': dConfig['es-index-name'],
                        '_type': dConfig['es-index-type'],
                        '_source': {
                            'crawl-time': dConfig['time-stamp'],
                            'project-path': sProjectPath,
                            'project-name': sProjectName,
                            'file': sFileOut,
                            'path': sPathOut,
                        },
                    })
            # prune VCS bookkeeping dirs from the walk in place
            lDirs[:] = [sDir for sDir in lDirs if sDir not in lIgnoreDirs]
        # get next project to process
        sProjectPath = qRedis.get(block=True)
    # index any remaining files
    if len(lProjectFiles) > 0:
        # ingest chunk into elasticsearch
        helpers.bulk(oES, lProjectFiles)
        if dConfig['debug']:
            debug('func: findProjectFiles()', str(len(lProjectFiles)),
                  'files loaded into elasticsearch')
def findProjectFiles(tTup):
    """Index every source file found inside a project's *_code.tgz tarballs.

    tTup is (sProjectPath, oES, dConfig).  Walks the project directory, opens
    each tarball whose path contains '_code.tgz', and bulk-loads one
    elasticsearch document per (non-.git/.svn) member file, recording file
    name, extension, relative path and crawl time.  Documents are flushed in
    chunks of dConfig['es-bulk-chunk-size'].
    """
    (sProjectPath, oES, dConfig) = tTup
    sProjectName = os.path.basename(sProjectPath)
    # NOTE(review): the oES handed in via tTup is discarded and replaced with
    # a fresh per-call client here -- presumably so each worker owns its own
    # connection; confirm the tuple member is still needed.
    oES = Elasticsearch(dConfig['es-instance-locs'])
    lIgnoreDirs = ['.git', '.svn']
    lProjectFiles = []
    if dConfig['debug']:
        debug('func: findProjectFiles()', 'project-path:', sProjectPath,
              'project-name:', sProjectName)
    for sRoot, lDirs, lFiles in os.walk(sProjectPath):
        if len(lProjectFiles) > dConfig['es-bulk-chunk-size']:
            # ingest chunk into elasticsearch
            (iSuccess, lResponse) = helpers.bulk(client=oES,
                                                 actions=lProjectFiles,
                                                 timeout="20m",
                                                 request_timeout=120.)
            if iSuccess < dConfig['es-bulk-chunk-size']:
                warning('func: findProjectFiles() iSuccess:', iSuccess,
                        ' expected:', dConfig['es-bulk-chunk-size'])
                warning('func: findProjectFiles()', type(lResponse),
                        'returned by bulk api')
                warning('func: findProjectFiles()',
                        json.dumps(lResponse, indent=4),
                        'returned by bulk api')
            #del lProjectFiles[0 : len(lProjectFiles)]
            lProjectFiles = []
            # NOTE(review): this logs after the reset above, so the reported
            # count is always 0 -- confirm whether it should precede the reset
            if dConfig['debug']:
                debug('func: findProjectFiles()', str(len(lProjectFiles)),
                      'files loaded into elasticsearch')
        for sFile in lFiles:
            # make sure dProject is emptied each loop iteration
            dProject = {
                '_index': dConfig['es-index-name'],
                '_type': dConfig['es-index-type'],
                '_source': {
                    'project-path': sProjectPath,
                    'project-name': sProjectName,
                    'crawl-time': dConfig['time-stamp']
                }
            }
            sFilePath = os.path.join(sRoot, sFile)
            sRelPath = os.path.relpath(sFilePath, sProjectPath)
            sDecodedFile = ''
            sDecodedRelPath = ''
            sEncodedWith = ''
            # Look for the tar file with the src code
            if "_code.tgz" in sFilePath:
                # NOTE(review): `counter` is assumed to be initialized at
                # module level -- confirm, otherwise this raises NameError
                global counter
                counter = counter + 1
                print(str(counter) + ': working on: ' + sFilePath)
                t = tarfile.open(sFilePath, 'r:*')
                # Iterate over the files in the tar file gz
                for tarinfo in t:
                    if tarinfo.isfile():
                        filename = tarinfo.name
                        if (".svn" not in filename and ".git" not in filename):
                            # make sure dProject is emptied each loop iteration
                            dProject = {
                                '_index': dConfig['es-index-name'],
                                '_type': dConfig['es-index-type'],
                                '_source': {
                                    'project-path': sProjectPath,
                                    'project-name': sProjectName,
                                    'crawl-time': dConfig['time-stamp']
                                }
                            }
                            # append file in tar to tar path
                            sFile = os.path.join(sFilePath, filename)
                            sRelPath = os.path.relpath(sFile, sProjectPath)
                            sDecodedFile = ''
                            sDecodedRelPath = ''
                            sEncodedWith = ''
                            # decode fallback chain: utf-8, then latin-1, then
                            # utf-16 (Python 2 byte-string semantics)
                            try:
                                sDecodedFile = sFile.decode('utf-8')
                                sDecodedRelPath = sRelPath.decode('utf-8')
                                sEncodedWith = 'utf-8'
                            except (ValueError, UnicodeDecodeError) as e:
                                try:
                                    sDecodedFile = sFile.decode('latin-1')
                                    sDecodedRelPath = sRelPath.decode('latin-1')
                                    sEncodedWith = 'latin-1'
                                except (ValueError, UnicodeDecodeError) as e:
                                    try:
                                        sDecodedFile = sFile.decode('utf-16')
                                        sDecodedRelPath = sRelPath.decode('utf-16')
                                        sEncodedWith = 'utf-16'
                                    except (ValueError, UnicodeDecodeError) as e:
                                        warning(
                                            'func findProjectFiles():',
                                            'sProjectPath:',
                                            dProject['_source']['project-path'],
                                            'sProjectName:',
                                            dProject['_source']['project-name'],
                                            'sFile:', sFile,
                                            'sRelPath:', sRelPath,
                                            'utf-8, latin-1, and utf-16 decoding failed',
                                            'exception:', e)
                                        print("decode failed")
                                        sDecodedFile = ''
                                        sDecodedRelPath = ''
                                        sEncodedWith = ''
                            # only index members whose names decoded cleanly
                            if sDecodedFile and sDecodedRelPath:
                                dProject['_source']['file'] = sDecodedFile
                                (_, sFileExt) = os.path.splitext(sDecodedFile)
                                if sFileExt:
                                    # store extension without the leading dot
                                    dProject['_source']['ext'] = sFileExt[1:].lower()
                                dProject['_source']['path'] = sDecodedRelPath
                                if dConfig['debug']:
                                    debug('func: findProjectFiles() dProject:',
                                          dProject, 'encoded with', sEncodedWith)
                                lProjectFiles.append(dProject)
        # prune VCS bookkeeping dirs from the walk in place
        lDirs[:] = [sDir for sDir in lDirs if sDir not in lIgnoreDirs]
    # ingest any stragglers remaining into elasticsearch
    (iSuccess, lResponse) = helpers.bulk(client=oES, actions=lProjectFiles,
                                         timeout="20m", request_timeout=120.)
    if iSuccess < len(lProjectFiles):
        warning('func: findProjectFiles() iSuccess:', iSuccess, ' expected:',
                len(lProjectFiles))
        warning('func: findProjectFiles()', type(lResponse),
                'returned by bulk api')
        warning('func: findProjectFiles()', json.dumps(lResponse, indent=4),
                'returned by bulk api')
    # del lProjectFiles[0 : len(lProjectFiles)]
    lProjectFiles = []
def usage():
    """Print command-line usage for buildProjectsByType.py."""
    sMessage = ('Usage: buildProjectsByType.py '
                '--queue-projects="configure.ac" --os="ubuntu14" '
                '--rebuild --debug')
    warning(sMessage)
def usage():
    """Print command-line usage for natesProjects.py."""
    sMessage = 'Usage: natesProjects.py --debug'
    warning(sMessage)
def parseBuildOutput(dArgs, bDebug=False):
    """Collect the log files produced by a finished container build.

    Reads the per-build summary logs from dArgs['dirs']['output'] plus one
    retcode.log.<N> per build target, and returns a dict holding the trimmed
    log contents, the per-target records under 'targets', and an overall
    'returnCode' -- the worst (max) return code across targets as a string,
    or '666' (the project's "unknown/bad outcome" sentinel) when none is
    available.

    Requires dArgs keys: dirs.output, targets, projectName, containerName.
    """
    dFiles = {
        'buildTime': 'runtime.log',
        'numObjectsPreBuild': 'numObjectsPreBuild.log',
        'numObjectsPostBuild': 'numObjectsPostBuild.log',
        'numObjectsGenerated': 'numObjectsGenerated.log',
        'numSources': 'numSources.log'
    }
    dBuffer = {}
    # .items() instead of the Python-2-only .iteritems() keeps this working
    # under both Python 2 and 3; a separate sLogPath avoids mutating the
    # loop variable as the original did
    for sFileType, sFileName in dFiles.items():
        sLogPath = os.path.join(dArgs['dirs']['output'], sFileName)
        if os.path.isfile(sLogPath):
            with open(sLogPath, 'r') as fBuilderFile:
                # get file input and trim unnecessary whitespace before/after
                dBuffer[sFileType] = (fBuilderFile.read()).strip()
        else:
            dBuffer[sFileType] = ''
            warning('func: parseBuildOutput() sFileType: ', sFileType,
                    ' missing for project:', dArgs['projectName'],
                    'container:', dArgs['containerName'])
    dBuffer['targets'] = []
    dTargetSpecificFiles = {
        'returnCode': 'retcode.log'  # ,
        # 'stdout' : 'stdout.log',
        # 'stderr' : 'stderr.log'
    }
    iCtr = 0
    lRetCodes = []
    for dTarget in dArgs['targets']:
        for sFileType, sFileName in dTargetSpecificFiles.items():
            # per-target logs are suffixed with the target's index
            sLogPath = os.path.join(dArgs['dirs']['output'],
                                    sFileName + '.' + str(iCtr))
            if os.path.isfile(sLogPath):
                dTarget[sFileType] = ''
                with open(sLogPath, 'r') as fBuilderFile:
                    # get file input and trim unnecessary whitespace before/after
                    dTarget[sFileType] = (fBuilderFile.read()).strip()
        if 'returnCode' in dTarget and isInt(dTarget['returnCode']):
            lRetCodes.append(int(dTarget['returnCode'].strip()))
        else:
            warning('func: parseBuildOutput() invalid return code encountered:',
                    json.dumps(dTarget, indent=4),
                    'project:', dArgs['projectName'],
                    'container:', dArgs['containerName'])
            dTarget['returnCode'] = 666
            lRetCodes.append(666)
        dBuffer['targets'].append(dTarget)
        iCtr += 1
    if len(lRetCodes) > 0:
        # report the worst (highest) return code across all targets
        dBuffer['returnCode'] = str(max(lRetCodes))
    else:
        dBuffer['returnCode'] = '666'
    if bDebug:
        debug('func: parseBuildOutput() dBuffer:', json.dumps(dBuffer, indent=4))
    return dBuffer
def processBuildTargets(tTup):
    """Worker loop: pop build jobs from redis and run them in a container.

    tTup is (iContainerId, dArgs, dConfig).  For each JSON build target pulled
    from the to-build queue this sets up per-container paths in dArgs, stages
    the project tarball, runs the build scripts, polls until completion, then
    parses/indexes/archives the results and removes the container.  Any
    exception is logged with a traceback and re-raised so the pool sees it.
    """
    try:
        (iContainerId, dArgs, dConfig) = tTup
        # dual queues -- primary for getting what project to build next,
        # secondary to mark what is being built
        qRedis = RedisQueue(name=dConfig['redis-queue-to-build'],
                            name2=dConfig['redis-queue-building'],
                            namespace='queue',
                            host=dConfig['redis-loc'],
                            port=dConfig['redis-port'])
        # set of existing builds for this os container used to prune out
        # projects already built with this container
        sExistingBuilds = RedisSet(name=dConfig['redis-already-built-nate'],
                                   namespace='set',
                                   host=dConfig['redis-loc'],
                                   port=dConfig['redis-port'])
        debug('func: processBuildTargets(), has ' + str(len(sExistingBuilds)) +
              ' built projects')
        iCtr = 0
        while 1:
            sBuildTarget = qRedis.getnpush(block=True, timeout=30)
            #sBuildTarget = qRedis.peek()
            # debug(sBuildTarget)
            if sBuildTarget:
                if dConfig['debug']:
                    debug('func: processBuildTargets() sBuildTarget:', sBuildTarget)
                dBuildTarget = json.loads(sBuildTarget)
                # initial setup
                #if 'projectName' not in dBuiltTarget: continue
                dArgs['projectName'] = dBuildTarget['projectName']
                # skip anything this container OS has already built
                if dArgs['projectName'] in sExistingBuilds:
                    warning('func: processBuildTargets() project:',
                            dArgs['projectName'], ' already built... skipping...')
                    continue
                #sProjectPath = os.path.relpath(dBuildTarget['projectPath'], '/data/corpus')
                #sProjectPath = os.path.join('/nfsbuild/nfsbuild', sProjectPath)
                #dArgs['buildPath'] = sProjectPath
                dArgs['targets'] = dBuildTarget['targets']
                if dConfig['debug']:
                    debug('func: processBuildTargets() targets:',
                          json.dumps(dArgs['targets'], indent=4))
                dArgs['containerId'] = str(iContainerId)
                # container name encodes image, OS, host and worker id
                dArgs['containerName'] = dConfig['containerImage'] + '-' + \
                    dArgs['containerOS'] + '-' + dConfig['hostname'] + '_' + \
                    str(iContainerId)
                dArgs['dirs'] = {}
                dArgs['dirs']['root'] = os.path.join(dConfig['containerPath'],
                                                     dArgs['containerName'])
                for sDir in dArgs['containerDirs']:
                    dArgs['dirs'][sDir] = os.path.join(dArgs['dirs']['root'], sDir)
                # /data/corpus on muse2 is mounted under /nfscorpus/nfscorpus
                # on all 3 servers (via mount-bind on muse2 and NFS on muse1
                # and muse3)
                debug('projectPath: ', dBuildTarget['projectPath'])
                # NOTE(review): sProjectPath/sBuildPath are only assigned when
                # the path contains '_8tof' or '_0to7' -- any other path will
                # raise NameError below (caught by the outer except).  Confirm
                # all queued paths match one of the two corpus roots.
                if "_8tof" in dBuildTarget['projectPath']:
                    sProjectPath = os.path.relpath(dBuildTarget['projectPath'],
                                                   '/data/corpus_8tof')
                    sBuildPath = os.path.join('/data/builder_SAN/outputCyber',
                                              sProjectPath)
                    sProjectPath = os.path.join('/data/corpus_8tof', sProjectPath)
                if "_0to7" in dBuildTarget['projectPath']:
                    sProjectPath = os.path.relpath(dBuildTarget['projectPath'],
                                                   '/data/corpus_0to7')
                    sBuildPath = os.path.join('/data/builder_SAN/outputCyber',
                                              sProjectPath)
                    sProjectPath = os.path.join('/data/corpus_0to7', sProjectPath)
                debug('projectPathDone: ', sProjectPath)
                dArgs['buildPath'] = sBuildPath
                '''
                # determine code root in project directory
                sCodePath = dBuildTarget['buildTargetPath']
                if sCodePath.startswith('./'):
                    sCodePath = dBuildTarget['buildTargetPath'][2:]
                sCodeRoot = sCodePath[:sCodePath.index(os.sep)] if os.sep in sCodePath else sCodePath
                '''
                # the project tarball is named <uuid>_code.tgz where <uuid> is
                # the last path component
                plist = sProjectPath.split('/')
                uuid = plist[len(plist) - 1]
                tar = uuid + ("_code.tgz")
                debug('tarball: ', tar)
                dArgs['projectPath'] = os.path.join(sProjectPath, tar)
                # add code root to project path
                # if dBuildTarget['codeDir']:
                # print('none')
                #dArgs['projectPath'] = os.path.join(sProjectPath, dBuildTarget['codeDir'])
                # else:
                #     warning('func: processBuildTargets() encountered project:', dBuildTarget['projectName'], ' with empty or NULL codeDir which is not supported. Project build skipped...')
                #     continue
                sTimeStamp = datetime.datetime.now().strftime('%Y%m%dT%H%M%S')
                dArgs['jsonName'] = 'build-' + sTimeStamp + '.json'
                dArgs['tarName'] = dArgs['projectName'] + '-' + sTimeStamp + '.tgz'
                dArgs['version'] = dBuildTarget['version']
                # setup container
                makeDirs(dArgs=dArgs, bDebug=dConfig['debug'])
                copySource(dArgs=dArgs, bDebug=dConfig['debug'])
                copyScripts(dArgs=dArgs, bDebug=dConfig['debug'])
                createBuildPlanScript(dArgs=dArgs, bDebug=dConfig['debug'])
                recordProjectName(dArgs=dArgs, bDebug=dConfig['debug'])
                startBuild(dArgs=dArgs, bDebug=dConfig['debug'])
                # sleep until build completes
                while pollBuild(dArgs=dArgs, bDebug=dConfig['debug']):
                    if dConfig['debug']:
                        debug('func: processBuildTargets() build not completed... sleeping')
                    time.sleep(10)
                # get container logs
                getBuildLogs(dArgs=dArgs, bDebug=dConfig['debug'])
                # get build output
                dBuffer = parseBuildOutput(dArgs=dArgs, bDebug=dConfig['debug'])
                # index build output
                postBuildStatusUpdates(dArgs=dArgs, dBuffer=dBuffer, dConfig=dConfig)
                # archive build artifacts
                tarUpContainerDirs(dArgs=dArgs, bDebug=dConfig['debug'])
                # remove container
                removeContainer(dArgs=dArgs, bDebug=dConfig['debug'])
                # remove project from "building" queue
                # qRedis.done(value=sBuildTarget)
                iCtr += 1
                # in debug mode only build a single project per worker
                if dConfig['debug'] and iCtr >= 1:
                    break
            else:
                # empty queue read (timeout hit) -- stop this worker
                break
        if dConfig['debug']:
            debug('func: processBuildTargets() sBuildTarget is either empty or none, likely since the redis queue is empty')
            debug('func: processBuildTargets() redis queue size:', qRedis.size())
        debug('func: processBuildTargets() exiting...')
    except Exception as e:
        warning('Caught exception in worker thread:', iContainerId)
        traceback.print_exc()
        raise e
def usage():
    """Print command-line usage for createJsonReports.py."""
    sMessage = 'Usage: createJsonReports.py --forks=5 --debug'
    warning(sMessage)
def indexSourceTargets(dConfig):
    """Consume ES queries from redis and load matching source targets into mysql.

    Each message on dConfig['redis-queue-source-targets'] is a JSON
    elasticsearch query.  The query is run through the scan/scroll API and
    every hit is turned into a "source target" row (projectName, projectPath,
    buildTargetPath plus one boolean flag per source type) which is
    bulk-inserted via MuseProjectDB.insertIntoSourceTargets().  Stops once the
    queue stays empty for the 30s timeout.
    """
    # setup mysql client
    dMp = MuseProjectDB(db=dConfig['mysql-db'], port=dConfig['mysql-port'],
                        user=dConfig['mysql-user'],
                        passwd=dConfig['mysql-passwd'],
                        loc=dConfig['mysql-loc'])
    dMp.open()
    # setup elasticsearch client
    oES = Elasticsearch(dConfig['es-instance-locs'])
    # setup source targets queue
    qRedis = RedisQueue(dConfig['redis-queue-source-targets'],
                        namespace='queue', host=dConfig['redis-loc'],
                        port=dConfig['redis-port'])
    # BUG FIX: the original referenced iCtr in the progress message below
    # without ever defining it, raising NameError on the first bulk flush
    # (compare findBuildTargets, which defines and increments it per hit).
    iCtr = 0
    while 1:
        sQuery = qRedis.get(block=True, timeout=30)
        if not sQuery:
            break
        dQuery = json.loads(sQuery)
        if dConfig['debug']:
            debug('func: indexSourceTargets() dQuery:', json.dumps(dQuery))
        lSourceFiles = []
        # scroll time set to 10 minutes, change as needed -- required for
        # consistent results, the scroll token expires at the end of scroll time
        dResponse = oES.search(index=dConfig['es-file-index-name'],
                               doc_type=dConfig['es-file-index-type'],
                               body=json.dumps(dQuery),
                               search_type='scan',
                               scroll='20m',
                               timeout='20m',
                               lowercase_expanded_terms=False)
        sScrollId = dResponse['_scroll_id']
        if dConfig['debug']:
            debug('func: indexSourceTargets() (after initial search) dResponse: ',
                  dResponse)
            debug('func: indexSourceTargets() search hits: ',
                  dResponse['hits']['total'])
        while ('timed_out' in dResponse and not dResponse['timed_out']
               and 'hits' in dResponse and 'total' in dResponse['hits']
               and dResponse['hits']['total'] > 0):
            dResponse = oES.scroll(scroll_id=sScrollId, scroll='20m')
            sScrollId = dResponse['_scroll_id']
            if ('hits' in dResponse['hits']) and (len(dResponse['hits']['hits']) > 0):
                if dConfig['debug']:
                    debug('func: indexSourceTargets() scroll_id:', sScrollId,
                          'number of hits:', len(dResponse['hits']['hits']))
                for dHit in dResponse['hits']['hits']:
                    # found matches
                    try:
                        if '_source' not in dHit:
                            continue
                        # strip the leading path so only the bare file name is kept
                        lPathParts = dHit['_source']['file'].split('/')
                        dHit['_source']['file'] = lPathParts[-1]
                        dProjectFound = {}
                        # initialize every known source-type flag to False
                        for sSourceType in dMp.getSourceTypes():
                            dProjectFound[sSourceType] = False
                        if 'file' in dHit['_source'] and dHit['_source']['file']:
                            (sFileName, sFileExt) = os.path.splitext(
                                dHit['_source']['file'])
                            if sFileExt.lower() in dConfig['source-targets'].keys():
                                # flag the matching source type
                                dProjectFound[dConfig['source-targets'][sFileExt.lower()]] = True
                            else:
                                warning('func indexSourceTargets() es returned an improper source target:',
                                        json.dumps(dHit['_source']))
                                continue
                        if 'project-name' in dHit['_source'] and dHit['_source']['project-name']:
                            dProjectFound['projectName'] = dHit['_source']['project-name']
                        if 'project-path' in dHit['_source'] and dHit['_source']['project-path']:
                            dProjectFound['projectPath'] = dHit['_source']['project-path']
                        if 'path' in dHit['_source'] and dHit['_source']['path']:
                            dProjectFound['buildTargetPath'] = verifyEncoding(
                                dHit['_source']['path'])
                        lSourceFiles.append(dProjectFound)
                        iCtr += 1
                        # flush in chunks; oversized bulk inserts were causing
                        # es reads to time out
                        if (len(lSourceFiles) > dConfig['mysql-bulk-statement-size']) and dConfig['mysql']:
                            dMp.insertIntoSourceTargets(lTargets=lSourceFiles,
                                                        bDebug=dConfig['debug'])
                            printMsg('func indexSourceTargets() loaded', iCtr,
                                     'source targets')
                            lSourceFiles = []
                    except (UnicodeDecodeError, UnicodeEncodeError) as e:
                        warning('func indexSourceTargets() encountered exception:', e)
                        #warning('func indexSourceTargets() with string: ', dHit['_source']['path'])
                        warning('func indexSourceTargets() full _source payload: ',
                                json.dumps(dHit['_source'], indent=4))
            else:
                break
        # flush any remaining rows for this query
        if (len(lSourceFiles) > 0) and dConfig['mysql']:
            dMp.insertIntoSourceTargets(lTargets=lSourceFiles,
                                        bDebug=dConfig['debug'])
            lSourceFiles = []
    dMp.close()
def usage():
    """Print command-line usage for labelProjectsByBuildType.py."""
    sMessage = 'Usage: labelProjectsByBuildType.py --forks=5 --debug'
    warning(sMessage)
def usage():
    """Print command-line usage for testLocks.py."""
    sMessage = 'Usage: testLocks.py'
    warning(sMessage)
def findBuildTargets(dConfig):
    """Find build files (configure/CMakeLists/Makefile) in ES and load them into mysql.

    Runs a fixed scan/scroll query against the file index for the classic
    build entry points, converts each hit into a row flagging its build type
    plus a ranking and path depth, and bulk-inserts rows via
    MuseProjectDB.insertIntoBuildTargets().  In debug mode processing stops
    after ~10 hits.
    """
    # setup mysql client
    dMp = MuseProjectDB(db=dConfig['mysql-db'], port=dConfig['mysql-port'],
                        user=dConfig['mysql-user'],
                        passwd=dConfig['mysql-passwd'],
                        loc=dConfig['mysql-loc'])
    dMp.open()
    # setup elasticsearch client
    oES = Elasticsearch(dConfig['es-instance-locs'], timeout=180,
                        max_retries=3, retry_on_timeout=True)
    # purge build targets queue -- considering if we need to split mysql
    # ingestion from elasticsearch queries... mysql may benefit from consumer
    # pool inserting statements concurrently
    # qRedis = RedisQueue(dConfig['redis-queue-build-targets'], namespace='queue', host=dConfig['redis-loc'], port=dConfig['redis-port'])
    lBuildFiles = []
    iCtr = 0
    # query: (one of the known build files) AND (under latest/ or content/)
    # AND restricted to the corpus_8tof tree
    dQuery = {
        "query": {
            "bool": {
                "must": [
                    {
                        "bool": {
                            "should": [
                                {"wildcard": {"file.raw": "*/configure.ac"}},
                                {"wildcard": {"file.raw": "*/configure.in"}},
                                {"wildcard": {"file.raw": "*/configure"}},
                                {"wildcard": {"file.raw": "*/CMakeLists.txt"}},
                                {"wildcard": {"file.raw": "*/Makefile"}}
                            ]
                        }
                    },
                    {
                        "bool": {
                            "should": [
                                {"match": {"path": "latest/*"}},
                                {"match": {"path": "content/*"}}
                            ]
                        }
                    },
                    # NOTE(review): this clause matches file.raw against an
                    # absolute /data/corpus_8tof/* pattern while the clauses
                    # above match basename-style patterns -- confirm the
                    # intended field
                    {"wildcard": {"file.raw": "/data/corpus_8tof/*"}}
                ]
            }
        }
    }
    if dConfig['debug']:
        debug('func: findBuildFiles() dQuery:', json.dumps(dQuery))
    # scroll time set to 10 minutes, change as needed -- required for
    # consistent results, the scroll token expires at the end of scroll time
    dResponse = oES.search(index=dConfig['es-file-index-name'],
                           doc_type=dConfig['es-file-index-type'],
                           body=json.dumps(dQuery),
                           search_type='scan',
                           scroll='20m',
                           timeout='20m',
                           lowercase_expanded_terms=False,
                           request_timeout=180,)
    sScrollId = dResponse['_scroll_id']
    if dConfig['debug']:
        debug('func: findBuildFiles() (after initial search) dResponse: ',
              dResponse)
    if dConfig['debug']:
        debug('func: findBuildFiles() search hits: ', dResponse['hits']['total'])
        # NOTE(review): duplicated debug call preserved from the original
        debug('func: findBuildFiles() search hits: ', dResponse['hits']['total'])
    while 'timed_out' in dResponse and not dResponse['timed_out'] and 'hits' in dResponse and 'total' in dResponse['hits'] and dResponse['hits']['total'] > 0:
        dResponse = oES.scroll(scroll_id=sScrollId, scroll='20m')
        sScrollId = dResponse['_scroll_id']
        if ('hits' in dResponse['hits']) and (len(dResponse['hits']['hits']) > 0):
            if dConfig['debug']:
                debug('func: findBuildFiles() scroll_id:', sScrollId,
                      'number of hits:', len(dResponse['hits']['hits']))
            # debug mode: bail out after ~10 processed hits
            if dConfig['debug'] and iCtr > 10:
                break
            for dHit in dResponse['hits']['hits']:
                iCtr += 1
                if dConfig['debug'] and iCtr > 10:
                    break
                # found matches
                try:
                    if '_source' in dHit:
                        #NATE added, remove leading path from found built targets
                        mBuildTarget = dHit['_source']['file'];
                        mBuildTarget = mBuildTarget.split('/')
                        dHit['_source']['file'] = mBuildTarget[len(mBuildTarget) - 1]
                        dProjectFound = {}
                        # initialize all build target types to false
                        lBuildTypes = dMp.getBuildTypes()
                        for sBuildType in lBuildTypes:
                            dProjectFound[sBuildType] = False
                        # mark relevant build target type true
                        if 'file' in dHit['_source'] and dHit['_source']['file'] and dHit['_source']['file'] in dConfig['build-targets'].keys():
                            if dConfig['debug']:
                                debug('func findBuildFiles() returned build target:',
                                      dHit['_source']['file'])
                            dProjectFound[dConfig['build-targets'][dHit['_source']['file']]['type']] = True
                            dProjectFound['ranking'] = dConfig['build-targets'][dHit['_source']['file']]['ranking']
                        else:
                            warning('func findBuildFiles() es returned an improper build target:',
                                    json.dumps(dHit['_source']))
                            continue
                        if 'project-name' in dHit['_source'] and dHit['_source']['project-name']:
                            dProjectFound['projectName'] = dHit['_source']['project-name']
                        if 'project-path' in dHit['_source'] and dHit['_source']['project-path']:
                            dProjectFound['projectPath'] = dHit['_source']['project-path']
                        if 'path' in dHit['_source'] and dHit['_source']['path']:
                            dProjectFound['buildTargetPath'] = verifyEncoding(dHit['_source']['path'])
                            dProjectFound['depth'] = depth(dProjectFound['buildTargetPath'])
                        # debug('func findBuildFiles()', json.dumps(dProjectFound))
                        lBuildFiles.append(dProjectFound)
                        # causing es reads to time out
                        if (len(lBuildFiles) > dConfig['mysql-bulk-statement-size']) and dConfig['mysql']:
                            dMp.insertIntoBuildTargets(lTargets=lBuildFiles,
                                                       bDebug=dConfig['debug'])
                            printMsg('func findBuildFiles() loaded', iCtr,
                                     'build targets')
                            lBuildFiles = []
                except (UnicodeDecodeError, UnicodeEncodeError) as e:
                    warning('func findBuildFiles() encountered exception:', e)
                    #warning('func findBuildFiles() with string: ', dHit['_source']['path'])
                    warning('func findBuildFiles() full _source payload: ',
                            json.dumps(dHit['_source'], indent=4))
        else:
            break
    # flush any remaining rows
    if (len(lBuildFiles) > 0) and dConfig['mysql']:
        dMp.insertIntoBuildTargets(lTargets=lBuildFiles, bDebug=dConfig['debug'])
        lBuildFiles = []
    dMp.close()
def queueUpSourceTargets(dConfig):
    """Group per-target mysql rows by project and push build jobs onto redis.

    Reads source-target rows (optionally only for unbuilt projects and/or a
    single site), collapses consecutive rows for the same project into one
    job dict {projectName, projectPath, version, targets, codeDir}, and puts
    each job JSON onto dConfig['redis-queue-to-build'].  Duplicate leading
    paths within a project are skipped.  No-op unless both dConfig['mysql']
    and dConfig['redis'] are set.
    """
    if dConfig['mysql'] and dConfig['redis']:
        dMp = MuseProjectDB(db=dConfig['mysql-db'], port=dConfig['mysql-port'],
                            user=dConfig['mysql-user'],
                            passwd=dConfig['mysql-passwd'],
                            loc=dConfig['mysql-loc'])
        # setup to-build queue
        qRedis = RedisQueue(dConfig['redis-queue-to-build'], namespace='queue',
                            host=dConfig['redis-loc'],
                            port=dConfig['redis-port'])
        dMp.open()
        # get projects first to iterate through (makes it easier to build
        # project specific dictionaries), limit if in debug mode
        iProjectCount = 0
        iTargetCount = 0
        iMultiTargets = 0
        sLimitClause = ''
        if dConfig['debug']:
            sLimitClause = '10'
        lLeadingPaths = []
        dProject = {}
        dCodeDirLookup = {}
        # BUG FIX: codedir2 was only assigned when a leading path contained a
        # '/', so the first project row with a flat path raised NameError when
        # building dProject below.  Default it to the empty string.
        codedir2 = ''
        lProjectRows = dMp.select(sSelectClause='projectName,codeDir',
                                  sTable='availableProjects',
                                  bDebug=dConfig['debug'])
        for tProjectRow in lProjectRows:
            (sProjectName, sCodeDir) = tProjectRow
            dCodeDirLookup[sProjectName] = sCodeDir
        lTargetRows = []
        # pick the source table based on the unbuilt-only and site filters
        if dConfig['unBuiltProjectsOnly']:
            if dConfig['queueSite']:
                lTargetRows = dMp.select(sSelectClause='projectName,projectPath,buildTargetPath',
                                         sTable='unBuiltSourceTargetsWithSite',
                                         sWhereClause='site=\'' + dConfig['queueSite'] + '\'',
                                         sOrderByClause='projectName',
                                         sLimitClause=sLimitClause,
                                         bDebug=dConfig['debug'])
            else:
                lTargetRows = dMp.select(sSelectClause='projectName,projectPath,buildTargetPath',
                                         sTable='unBuiltSourceTargets',
                                         sOrderByClause='projectName',
                                         sLimitClause=sLimitClause,
                                         bDebug=dConfig['debug'])
        else:
            if dConfig['queueSite']:
                lTargetRows = dMp.select(sSelectClause='projectName,projectPath,buildTargetPath',
                                         sTable='availableSourceTargetsWithSite',
                                         sWhereClause='site=\'' + dConfig['queueSite'] + '\'',
                                         sOrderByClause='projectName',
                                         sLimitClause=sLimitClause,
                                         bDebug=dConfig['debug'])
            else:
                lTargetRows = dMp.select(sSelectClause='projectName,projectPath,buildTargetPath',
                                         sTable='availableSourceTargets',
                                         sOrderByClause='projectName',
                                         sLimitClause=sLimitClause,
                                         bDebug=dConfig['debug'])
        dMp.close()
        for tTargetRow in lTargetRows:
            dTarget = {}
            (sProjectName, sProjectPath, dTarget['buildTargetPath'],) = tTargetRow
            (_, sFileExt) = os.path.splitext(os.path.basename(dTarget['buildTargetPath']))
            if sFileExt:
                sFileExt = sFileExt.lower()
                if sFileExt in dConfig['source-targets'].keys():
                    dTarget['buildType'] = dConfig['source-targets'][sFileExt]
                    (sLeadingPath, sTarget) = os.path.split(dTarget['buildTargetPath'])
                    # NATE remove leading tarball from path
                    sLeadingPath = re.sub(r'[a-zA-Z_0-9-_]*.tgz/', "", sLeadingPath)
                    dTarget['buildTargetPath'] = os.path.join(sLeadingPath, sTarget)
                    # NATE added to grab code directory from buildTargetPath
                    bPath = sLeadingPath.split('/')
                    if len(bPath) > 1:
                        codedir2 = bPath[0]
                    iTargetCount += 1
                    if 'projectName' in dProject:
                        if dProject['projectName'] != sProjectName:
                            # new project encountered, push old project onto queue
                            if dConfig['debug']:
                                debug('func: queueUpSourceTargets() queuing project:',
                                      json.dumps(dProject, indent=4))
                            qRedis.put(json.dumps(dProject))
                            iProjectCount += 1
                            if len(lLeadingPaths) > 1:
                                iMultiTargets += 1
                            dProject = {
                                'projectName': sProjectName,
                                'projectPath': sProjectPath,
                                'version': dConfig['version'],
                                'targets': [dTarget],
                                'codeDir': codedir2
                                #'codeDir': dCodeDirLookup[sProjectName]
                            }
                            lLeadingPaths = [sLeadingPath]
                        else:
                            # same project: keep only one target per leading path
                            if sLeadingPath not in lLeadingPaths:
                                dProject['targets'].append(dTarget)
                                lLeadingPaths.append(sLeadingPath)
                            else:
                                iTargetCount += -1
                                if dConfig['debug']:
                                    debug('func: queueUpSourceTargets() already encountered path:',
                                          sLeadingPath, 'not adding:',
                                          json.dumps(dTarget, indent=4))
                    else:
                        # very first project row seen
                        dProject = {
                            'projectName': sProjectName,
                            'projectPath': sProjectPath,
                            'version': dConfig['version'],
                            'targets': [dTarget],
                            'codeDir': dCodeDirLookup[sProjectName]
                        }
                        lLeadingPaths = [sLeadingPath]
                else:
                    warning('func: queueUpSourceTargets() unknown C/C++ file extension encountered:',
                            sFileExt, 'file-path:', dTarget['buildTargetPath'],
                            'for project:', sProjectName)
            else:
                warning('func: queueUpSourceTargets() missing file extension encountered file-path:')
                #,dTarget['buildTargetPath'],'for project:', sProjectName)
        # flush the final in-progress project; the guard avoids queuing an
        # empty {} when no target rows were returned at all
        if dProject:
            if dConfig['debug']:
                debug('func: queueUpSourceTargets() queuing project:',
                      json.dumps(dProject, indent=4))
            qRedis.put(json.dumps(dProject))
            iProjectCount += 1
            if len(lLeadingPaths) > 1:
                iMultiTargets += 1
        printMsg('func: queueUpSourceTargets()', str(iProjectCount),
                 'projects queued', str(iTargetCount), 'targets queued',
                 str(iMultiTargets), 'multi-target projects queued')
        printMsg('func: queueUpSourceTargets()', qRedis.size(),
                 'projects reported by redis')
def processProjects(dConfig):
    """Read project roots from redis, merge their JSON metadata, insert into mysql.

    For each project root popped from dConfig['redis-queue-project-paths'],
    builds an ES-style document from filter.json (bytecode flag), index.json
    (code dir, site, crawler metadata) and languages.json (per-language
    line counts), then bulk-inserts batches via
    MuseProjectDB.insertIntoProjects().  Returns lProjects -- NOTE(review):
    this list is emptied after each flush, so the return value is usually [].
    """
    qRedis = RedisQueue(dConfig['redis-queue-project-paths'], namespace='queue',
                        host=dConfig['redis-loc'], port=dConfig['redis-port'])
    dMp = MuseProjectDB(db=dConfig['mysql-db'], port=dConfig['mysql-port'],
                        user=dConfig['mysql-user'],
                        passwd=dConfig['mysql-passwd'],
                        loc=dConfig['mysql-loc'])
    dMp.open()
    lProjects = []
    iCount = 0
    while 1:
        sRoot = qRedis.get(block=True, timeout=30)
        if sRoot:
            dProject = {
                '_index': dConfig['es-project-index-name'],
                '_type': dConfig['es-project-index-type'],
                '_source': {}
            }
            # project id/name is the basename of the queued root path
            dProject['_id'] = os.path.basename(sRoot)
            dProject['_source']['name'] = os.path.basename(sRoot)
            # NOTE(review): this debug call is unconditional (duplicated just
            # below under the dConfig['debug'] guard) -- confirm intent
            debug('func: processProjects() projects-root:', sRoot)
            if dConfig['debug']:
                debug('func: processProjects() projects-root:', sRoot)
                debug('func: processProjects() projects _id and _source[name] :',
                      dProject['_id'])
                debug('func: processProjects() inserting project:',
                      dProject['_source']['name'])
            # filter.json: flags whether bytecode was crawled for this project
            if os.path.isfile(os.path.join(sRoot, 'filter.json')):
                with open(os.path.join(sRoot, 'filter.json')) as fProjectFilter:
                    dProjectFilter = json.load(fProjectFilter)
                    if 'hasBytecode' in dProjectFilter and dProjectFilter['hasBytecode'].lower() != 'none':
                        dProject['_source']['bytecode_available'] = True
            # index.json: main per-project metadata (code dir, site, crawler files)
            if os.path.isfile(os.path.join(sRoot, 'index.json')):
                with open(os.path.join(sRoot, 'index.json')) as fProjectIndex:
                    dProjectIndex = json.load(fProjectIndex)
                    if dConfig['debug']:
                        debug('func: processProjects() dProjectIndex.keys():',
                              json.dumps(dProjectIndex.keys(), indent=4))
                    '''
                    if 'bytecode_available' in dProjectIndex and dProjectIndex['bytecode_available']:
                        dProject['_source']['bytecode_available'] = True
                    '''
                    if 'code' in dProjectIndex:
                        dProject['_source']['source'] = True
                        dProject['_source']['codeDir'] = dProjectIndex['code']
                        # normalize away a leading './' on the code dir
                        if dProject['_source']['codeDir'].startswith('./'):
                            dProject['_source']['codeDir'] = dProject['_source']['codeDir'][len('./'):]
                    if 'site' in dProjectIndex:
                        dProject['_source']['site'] = dProjectIndex['site']
                    if 'crawler_metadata' in dProjectIndex:
                        for sMetaDataFile in dProjectIndex['crawler_metadata']:
                            # languages.json: per-language breakdown from the crawler
                            if 'languages.json' in sMetaDataFile:
                                sLanguageFile = os.path.join(sRoot, sMetaDataFile)
                                if os.path.isfile(sLanguageFile):
                                    with open(sLanguageFile) as fLanguageFile:
                                        dLanguageFile = json.load(fLanguageFile)
                                        if 'C' in dLanguageFile:
                                            dProject['_source']['c'] = dLanguageFile['C']
                                        if 'C++' in dLanguageFile:
                                            dProject['_source']['cpp'] = dLanguageFile['C++']
                                        if 'C#' in dLanguageFile:
                                            dProject['_source']['csharp'] = dLanguageFile['C#']
                                        if 'Java' in dLanguageFile:
                                            dProject['_source']['java'] = dLanguageFile['Java']
                                        if dConfig['debug']:
                                            debug('func: findProjects() dLanguageFile:',
                                                  json.dumps(dLanguageFile, indent=4))
                                else:
                                    warning('func: processProjects()',
                                            'languages.json file listed in index.json but does not exist for project:',
                                            dProject['_source']['name'],
                                            'at listed location:', sLanguageFile)
            else:
                warning('func: processProjects()',
                        'index.json not found for project:',
                        dProject['_source']['name'])
            lProjects.append(dProject)
            iCount += 1
            # flush a full batch to mysql
            if (iCount % dConfig['mysql-bulk-statement-size']) == 0:
                dMp.insertIntoProjects(lProjects=lProjects, bDebug=dConfig['debug'])
                lProjects = []
            # debug mode: stop after 100 projects
            if dConfig['debug'] and iCount >= 100:
                break
        else:
            break
    if dConfig['mysql']:
        if len(lProjects) > 0:
            dMp.insertIntoProjects(lProjects=lProjects, bDebug=dConfig['debug'])
            lProjects = []
    dMp.close()
    return lProjects
def usage():
    """Print command-line usage for identifyMultipleBuildTypeProjects.py."""
    sMessage = 'Usage: identifyMultipleBuildTypeProjects.py'
    warning(sMessage)
def usage():
    """Print command-line usage for queueProjectsToBuildByType.py."""
    sMessage = ('Usage: queueProjectsToBuildByType.py '
                '--corpus-dir-path=/data/corpus_0to7 --forks=5 '
                '--analyze-projects --crawl-projects --unbuilt-projects-only '
                '--queue-projects --debug')
    warning(sMessage)