def main(semanticsTestsFile, semanticOnly, performanceTestsFile, perfOnly):
  libLF.log('semanticsTestsFile {} semanticOnly {} performanceTestsFile {} perfOnly {}' \
    .format(semanticsTestsFile, semanticOnly, performanceTestsFile, perfOnly))

  #### Check dependencies
  libLF.checkShellDependencies(shellDeps)

  #### Load and run each test
  summary = []
  for testType, testsFile in [(TestSuite.SEMANTIC_TEST, semanticsTestsFile),
                              (TestSuite.PERF_TEST, performanceTestsFile)]:
    if perfOnly and testType != TestSuite.PERF_TEST:
      continue
    if semanticOnly and testType != TestSuite.SEMANTIC_TEST:
      continue

    libLF.log("Loading {} tests from {}".format(testType, testsFile))
    ts = TestSuite(testsFile, testType)

    libLF.log("Running {} tests".format(testType))
    nFailures = ts.run()

    summary.append("{} tests from {}: {} failures".format(
      testType, testsFile, nFailures))

  libLF.log("****************************************")
  for line in summary:
    libLF.log(" " + line)
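# For context, a sketch of the CLI wiring that would invoke this entry point,
# in the argparse style these scripts use. The flag names below are
# assumptions, not the script's actual interface.
if __name__ == '__main__':
  parser = argparse.ArgumentParser(description='Run the semantic and performance test suites')
  parser.add_argument('--semantics-tests-file', dest='semanticsTestsFile', required=True)
  parser.add_argument('--semantic-only', dest='semanticOnly', action='store_true')
  parser.add_argument('--performance-tests-file', dest='performanceTestsFile', required=True)
  parser.add_argument('--perf-only', dest='perfOnly', action='store_true')
  args = parser.parse_args()
  main(args.semanticsTestsFile, args.semanticOnly, args.performanceTestsFile, args.perfOnly)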
def checkRegistryDependencies(registry):
  libLF.checkShellDependencies(['cloc'], mustBeExecutable=True)

  paths = [
    registryToPaths[registry]['preprocessor'],
    *registryToPaths[registry]['instrumentor'].values(),
    registryToPaths[registry]['moduleRunner']
  ]
  libLF.log("Checking paths for registry {}: {}".format(registry, paths))
  libLF.checkShellDependencies(paths, mustBeExecutable=False)
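# For the lookups above to succeed, registryToPaths must map each registry to
# a preprocessor path, a dict of instrumentor paths, and a moduleRunner path.
# A sketch of that shape; the registry name and paths here are hypothetical.
registryToPaths = {
  'npm': {
    'preprocessor': '/path/to/npm-preprocessor',
    'instrumentor': {  # a dict, hence the .values() above
      'static': '/path/to/npm-static-instrumentor',
      'dynamic': '/path/to/npm-dynamic-instrumentor',
    },
    'moduleRunner': '/path/to/npm-module-runner',
  },
}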
def main(caseStudyFile):
  libLF.log('caseStudyFile {}' \
    .format(caseStudyFile))

  #### Check dependencies
  libLF.checkShellDependencies(shellDeps)

  #### Load data
  caseStudies = loadCaseStudies(caseStudyFile)

  #### Run
  # Narrow or widen this list to choose which case studies to run, e.g.
  #   ["REWZWA-1", "REWZWA-2", "REWBR-2", "Microsoft", "Cloudflare"]
  studiesToRun = ["REWBR-6"]

  nick2fname = {}
  for caseStudy in caseStudies:
    if caseStudy.nick in studiesToRun:
      nick2fname[caseStudy.nick] = caseStudy.run()

  for nick, fname in nick2fname.items():
    libLF.log("Case study {} -- See {}".format(nick, fname))
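# loadCaseStudies is not shown here. A minimal sketch assuming caseStudyFile
# is NDJSON (the format these scripts emit elsewhere) and that a CaseStudy
# class wraps each record -- both names are assumptions.
def loadCaseStudies(caseStudyFile):
  caseStudies = []
  with open(caseStudyFile, 'r') as f:
    for line in f:
      line = line.strip()
      if line:
        caseStudies.append(CaseStudy(json.loads(line)))  # hypothetical CaseStudy class
  return caseStudies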
def main(regexFile, outFile, parallelism):
  libLF.log('regexFile {} outFile {} parallelism {}' \
    .format(regexFile, outFile, parallelism))

  #### Check dependencies
  libLF.checkShellDependencies(shellDeps)

  #### Load data
  tasks = getTasks(regexFile)
  libLF.log('{} regexes'.format(len(tasks)))

  #### Process data
  # CPU-bound, no limits
  libLF.log('Submitting to map')
  results = libLF.parallel.map(tasks, parallelism,
    libLF.parallel.RateLimitEnums.NO_RATE_LIMIT, libLF.parallel.RateLimitEnums.NO_RATE_LIMIT,
    jitter=False)

  #### Emit results
  libLF.log('Writing results to {}'.format(outFile))
  nSuccesses = 0
  nExceptions = 0
  with open(outFile, 'w') as outStream:
    for msa in results:
      # Emit
      if type(msa) is libMemo.MemoizationStaticAnalysis:
        nSuccesses += 1
        outStream.write(msa.toNDJSON() + '\n')
      else:
        nExceptions += 1
        libLF.log("Error message: " + str(msa))
  libLF.log('Successfully performed MemoizationStaticAnalysis on {} regexes, {} exceptions'.format(nSuccesses, nExceptions))

  #### Analysis
  # TODO Any preliminary analysis
  return
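# getTasks presumably builds libLF.parallel.ParallelTask instances (cf. the
# MyTask subclass elsewhere in this repo). A minimal sketch of what one such
# task might look like; the class name and the analysis helper it calls are
# hypothetical, not this script's actual implementation.
class HypotheticalStaticAnalysisTask(libLF.parallel.ParallelTask):
  def __init__(self, regex):
    self.regex = regex

  def run(self):
    # parallel.map surfaces whatever run() returns, so returning the
    # exception (rather than raising) lets main() count failures.
    try:
      return analyzeRegex(self.regex)  # hypothetical helper returning a libMemo.MemoizationStaticAnalysis
    except BaseException as err:
      return err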
def main(rustc, rustFile, dumpTokenTree):
  libLF.checkShellDependencies([rustc])

  if fileMightContainRegexes(rustFile):
    libLF.log('File might contain regexes, proceeding...')
    try:
      libLF.log('Getting token tree')
      tokenTree = getTokenTree(rustc, rustFile)
    except BaseException as err:
      libLF.log('Error getting token tree: {}'.format(err))
      sys.exit(1)

    try:
      libLF.log('Walking token tree')
      visitor = FrontierVisitor()
      walkTokenTree(tokenTree, visitor)
      patterns = visitor.getRegexPatterns()
      libLF.log('Extracted {} patterns'.format(len(patterns)))
    except BaseException as err:
      libLF.log('Error walking token tree: {}'.format(err))
      sys.exit(1)
  else:
    libLF.log('File does not contain "Regex", no regexes possible')
    patterns = []

  regexes = [{'pattern': p, 'flags': ''} for p in patterns]
  sfwr = libLF.SimpleFileWithRegexes()
  sfwr.initFromRaw(fileName=rustFile, language='rust', couldParse=1, regexes=regexes)
  print(sfwr.toNDJSON())

  if dumpTokenTree:
    # "Pretty" JSON makes it easier for humans to decode
    asJSON = json.dumps(tokenTree, indent=2, separators=(',', ':'))
    libLF.log('\n' + asJSON)
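# fileMightContainRegexes is a cheap pre-filter: the else-branch log implies
# it checks for the literal substring "Regex" (Rust regexes are built via
# regex::Regex::new). A minimal sketch under that assumption.
def fileMightContainRegexes(rustFile):
  with open(rustFile, 'r', errors='replace') as f:
    return 'Regex' in f.read()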
def main(regexFile, useCSharpToFindMostEI, perfPumps, maxAttackStringLen, queryPrototype, runSecurityAnalysis, nTrialsPerCondition, queryProductionEngines, timeSensitive, parallelism, outFile):
  libLF.log('regexFile {} useCSharpToFindMostEI {} perfPumps {} maxAttackStringLen {} queryPrototype {} runSecurityAnalysis {} nTrialsPerCondition {} queryProductionEngines {} timeSensitive {} parallelism {} outFile {}' \
    .format(regexFile, useCSharpToFindMostEI, perfPumps, maxAttackStringLen, queryPrototype, runSecurityAnalysis, nTrialsPerCondition, queryProductionEngines, timeSensitive, parallelism, outFile))

  #### Check dependencies
  libLF.checkShellDependencies(shellDeps)

  #### Load data
  taskConfig = TaskConfig(useCSharpToFindMostEI, queryPrototype, runSecurityAnalysis, queryProductionEngines)
  tasks = getTasks(regexFile, perfPumps, maxAttackStringLen, nTrialsPerCondition, taskConfig)
  nRegexes = len(tasks)

  #### Collect data
  df = None
  nSL = 0
  nNonSL = 0
  nExceptions = 0

  nWorkers = 1 if timeSensitive else parallelism
  libLF.log("timeSensitive {}, so using {} workers".format(timeSensitive, nWorkers))
  results = libLF.parallel.map(tasks, nWorkers,
    libLF.parallel.RateLimitEnums.NO_RATE_LIMIT, libLF.parallel.RateLimitEnums.NO_RATE_LIMIT,
    jitter=False)

  if runSecurityAnalysis:
    allSL = [res for res in results if res != MyTask.NOT_SL]
    nSucceeded = len([res for res in results if res])
    nFailed = len([res for res in results if not res])
    libLF.log("{} succeeded in sec'ty analysis, {} failed".format(nSucceeded, nFailed))
    sys.exit(0)

  for t, res in zip(tasks, results):
    if type(res) is pd.DataFrame:
      nSL += 1
      if df is None:
        df = res
      else:
        df = pd.concat([df, res])  # DataFrame.append was removed in pandas 2.0
    elif type(res) is type(MyTask.NOT_SL) and res == MyTask.NOT_SL:
      nNonSL += 1
    else:
      libLF.log("Exception on /{}/: {}".format(t.regex.pattern, res))
      nExceptions += 1
  libLF.log("{} regexes were SL, {} non-SL, {} exceptions".format(nSL, nNonSL, nExceptions))

  #### Emit results
  libLF.log('Writing results to {}'.format(outFile))
  df.to_pickle(outFile)
  libLF.log("Data columns: {}".format(df.columns))
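# Downstream, the pickle emitted above can be reloaded with pd.read_pickle,
# the standard pandas counterpart to DataFrame.to_pickle. The file name here
# is a hypothetical placeholder for outFile.
import pandas as pd

df = pd.read_pickle('sl-regex-measurements.pkl')
print(df.columns)
print(df.describe())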
import os
import sys
import re

sys.path.append('{}/lib'.format(os.environ['REGEX_GENERALIZABILITY_PROJECT_ROOT']))
import libLF

import argparse
import subprocess
import json
import stat

# Prefer the mvn on PATH; a hardcoded install was used previously:
#   /home/davisjam/local-install/apache-maven-3.6.0/bin/mvn
MAVEN_CLI = 'mvn'
libLF.checkShellDependencies([MAVEN_CLI], mustBeExecutable=True)

GRADLE_USER_HOME = '/tmp/.gradle'

#########
# Classes to drive the Maven and Gradle build systems

class BuildSystem:
  BUILD_SYSTEM_MAVEN = "maven"
  BUILD_SYSTEM_GRADLE = "gradle"

  def __init__(self):
    self.name = None
    self.cli = None
    self.buildFile = None
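# One way a concrete driver might fill in these fields; the subclass name and
# its build() method are hypothetical, not this module's actual API.
class MavenBuildSystem(BuildSystem):
  def __init__(self):
    super().__init__()
    self.name = BuildSystem.BUILD_SYSTEM_MAVEN
    self.cli = MAVEN_CLI
    self.buildFile = 'pom.xml'

  def build(self, projectDir):
    # -DskipTests: we want the artifacts, not the project's own test results
    rc = subprocess.run([self.cli, 'package', '-DskipTests'], cwd=projectDir).returncode
    return rc == 0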
def checkRegistryDeps(registry):
  dependenciesToCheck = []
  for l in registryToLangs[registry]:
    dependenciesToCheck.append(langToExtractorPath[l.lower()])
  libLF.checkShellDependencies(dependenciesToCheck, mustBeExecutable=False)
AUTOMATACLI_BATCH_SIZE = 10
AUTOMATACLI_MAX_SECONDS_PER_REGEX = 5
AUTOMATACLI_TIMEOUT_SEC = AUTOMATACLI_BATCH_SIZE * AUTOMATACLI_MAX_SECONDS_PER_REGEX

LIMIT_SIMPLE_PATHS = True
# Based on a sample of 70K regexes, the distribution is heavily weighted
# towards 1-10 paths per regex. <=100 regexes fall above 50K simple paths.
# No need to exhaustively count for these outliers.
SIMPLE_PATH_COUNT_LIMIT = 5000
SIMPLE_PATH_TIME_LIMIT = 5 # seconds

# Dependencies
WINDOWS_OS = os.name == 'nt'
WINE_PATH = shutil.which("wine")
AutomataCLI = os.path.join(os.environ['REGEX_GENERALIZABILITY_PROJECT_ROOT'], 'bin', 'AutomataCLI.exe')
if WINDOWS_OS:
  # Workaround for broken symlink
  AutomataCLI = os.path.join(os.environ['REGEX_GENERALIZABILITY_PROJECT_ROOT'],
    'measurement-instruments', 'automata', 'AutomataCLI.exe')
else:
  libLF.checkShellDependencies([WINE_PATH], mustBeExecutable=True)
libLF.checkShellDependencies([AutomataCLI], mustBeExecutable=False)

# Control analysis
class AnalysisStages:
  ANALYZE_AUTOMATON = 'automaton'
  ANALYZE_SIMPLE_PATHS = 'simple paths'
  ANALYZE_WORST_CASE = 'worst case'

# Misc
reg2lang = {
  'npm': 'JavaScript', # TypeScript is evaluated on a JS engine
  'crates.io': 'Rust',
  'packagist': 'PHP',
  'pypi': 'Python',
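# AutomataCLI.exe is a .NET binary, so on non-Windows hosts it runs under
# wine. A sketch of the invocation pattern this setup implies; the batch-file
# argument is an assumption about the CLI's interface, and subprocess is
# assumed to be imported.
def runAutomataCLI(batchFile):
  cmd = [AutomataCLI, batchFile] if WINDOWS_OS else [WINE_PATH, AutomataCLI, batchFile]
  return subprocess.run(cmd, capture_output=True, timeout=AUTOMATACLI_TIMEOUT_SEC)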
import os
import sys
import subprocess
import tempfile
import argparse
import shutil
import shlex
import re

# libLF is loaded from the project's lib directory (repo convention)
sys.path.append('{}/lib'.format(os.environ['ECOSYSTEM_REGEXP_PROJECT_ROOT']))
import libLF

################
# Dependencies
################

RESCUE_PATH = os.path.join(os.environ['ECOSYSTEM_REGEXP_PROJECT_ROOT'], 'bin', 'ReScueInputGenerator.jar')
libLF.checkShellDependencies([RESCUE_PATH], mustBeExecutable=False)

CROSSOVER_PROBABILITY = 10
MUTATE_PROBABILITY = 10

################
# Helpers
################

def getReScueInputs(pattern, timeout):
  """Return inputs: str[]"""
  # Use subprocess.run directly instead of libLF.runcmd because the regex
  # is delivered on the command line and might be unescaped, contain
  # newlines, etc. Also, we want to be able to capture stderr cleanly
  # with a timeout.
  libLF.log('Command: ' + str([
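# The comment in getReScueInputs describes the core pattern: subprocess.run
# with a timeout, capturing stderr cleanly. A minimal standalone sketch of
# that guard; the helper name is hypothetical.
def _runWithTimeout(cmd, timeout):
  try:
    res = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                         timeout=None if timeout < 0 else timeout)
    return res.stdout.decode('utf-8', errors='replace')
  except subprocess.TimeoutExpired:
    libLF.log('Timed out after {} seconds'.format(timeout))
    return ''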
import os
import sys
import subprocess
import tempfile
import argparse
import shutil
import shlex
import re

# libLF is loaded from the project's lib directory (repo convention)
sys.path.append('{}/lib'.format(os.environ['ECOSYSTEM_REGEXP_PROJECT_ROOT']))
import libLF

################
# Dependencies
################

BRICS_PATH = os.path.join(os.environ['ECOSYSTEM_REGEXP_PROJECT_ROOT'], 'bin', 'BricsInputGenerator.jar')
libLF.checkShellDependencies([BRICS_PATH], mustBeExecutable=False)

MAX_STRING_LEN = 128
PROB_EXCESSIVE_STRINGS = 0

################
# Helpers
################

def convertPatternToBrics(pattern):
  """Convert to Brics style

  The Brics language is fairly minimal:
    http://www.brics.dk/automaton/doc/index.html?dk/brics/automaton/RegExp.html
  In particular, it supports *no* character classes like \d or \s.
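# Because Brics has no \d, \s, or \w, a converter must expand such shorthands
# into explicit character classes. A minimal sketch of that expansion; the
# script's real converter presumably handles more of the syntax than this.
BRICS_EXPANSIONS = [
  (r'\d', '[0-9]'),
  (r'\s', '[ \t\n\r\f]'),  # literal whitespace characters inside the class
  (r'\w', '[A-Za-z0-9_]'),
]

def _expandShorthands(pattern):
  for shorthand, charClass in BRICS_EXPANSIONS:
    pattern = pattern.replace(shorthand, charClass)
  return pattern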
import os
import sys
import subprocess
import tempfile
import argparse
import shutil
import shlex
import re

# libLF is loaded from the project's lib directory (repo convention)
sys.path.append('{}/lib'.format(os.environ['ECOSYSTEM_REGEXP_PROJECT_ROOT']))
import libLF

################
# Dependencies
################

MUTREX_PATH = os.path.join(os.environ['ECOSYSTEM_REGEXP_PROJECT_ROOT'], 'bin', 'MutRexInputGenerator.jar')
libLF.checkShellDependencies([MUTREX_PATH], mustBeExecutable=False)

################
# Helpers
################

def getMutRexInputs(pattern, timeout):
  """Return inputs: str[]"""
  # Build command to run
  cmd = ["java", "-jar", MUTREX_PATH, pattern]
  libLF.log('cmd: ' + " ".join(cmd))

  # Get inputs, guarded by a timeout
  tmo = None if timeout < 0 else timeout
libLF.log('Config:\n INPUT_GENERATOR {}'.format(INPUT_GENERATOR))

langCLIDir = os.path.join(os.environ['ECOSYSTEM_REGEXP_PROJECT_ROOT'], 'bin')
lang2cli = {
  'go': os.path.join(langCLIDir, 'check-regex-behavior-in-go'),
  'java': os.path.join(langCLIDir, 'check-regex-behavior-in-java.pl'),
  'javascript': os.path.join(langCLIDir, 'check-regex-behavior-in-node.js'),
  'perl': os.path.join(langCLIDir, 'check-regex-behavior-in-perl.pl'),
  'php': os.path.join(langCLIDir, 'check-regex-behavior-in-php.php'),
  'python': os.path.join(langCLIDir, 'check-regex-behavior-in-python.py'),
  'ruby': os.path.join(langCLIDir, 'check-regex-behavior-in-ruby.rb'),
  'rust': os.path.join(langCLIDir, 'check-regex-behavior-in-rust'),
}
libLF.log('Config:\n language CLIs: {}'.format(json.dumps(lang2cli)))
libLF.checkShellDependencies([INPUT_GENERATOR] + list(lang2cli.values()))

class MyTask(libLF.parallel.ParallelTask):
  def __init__(self, regex, maxInputsPerGenerator, rngSeed, timeoutPerGenerator):
    self.regex = regex
    self.maxInputsPerGenerator = maxInputsPerGenerator
    self.rngSeed = rngSeed
    self.timeoutPerGenerator = timeoutPerGenerator

  def _queryRegexInLang(self, pattern, queryFile, language):
    """Query behavior of <pattern, input[]> in language

    pattern: str: regex pattern
    queryFile: str: name of file containing the query to use