def loadRegexFile(regexFile, setStaticToAll):
  """Return a list of Regex's loaded from an NDJSON file.

  Args:
    regexFile: path to a file with one libLF.Regex NDJSON record per line
    setStaticToAll: if True, copy useCount_registry_to_nModules into
      useCount_registry_to_nModules_static. Raises ValueError if the
      static counts are already populated.

  Returns:
    list of libLF.Regex. Malformed lines are logged and skipped.
  """
  regexes = []
  libLF.log('Loading regexes from {}'.format(regexFile))
  with open(regexFile, 'r') as inStream:
    for line in inStream:
      line = line.strip()
      if not line:
        continue

      try:
        # Build a Regex
        regex = libLF.Regex()
        regex.initFromNDJSON(line)

        # Filter: discard records without a usable pattern string.
        # isinstance instead of `type(...) is str` so str subclasses pass too.
        if not isinstance(regex.pattern, str) or len(regex.pattern) < 1:
          continue

        # Populate static langs used in if it is not set.
        # This should only be because it was not set during the LF project.
        if setStaticToAll:
          if len(regex.useCount_registry_to_nModules_static) != 0:
            raise ValueError("Error, you told me to setStaticToAll but it looks like static language use is non-empty")
          regex.useCount_registry_to_nModules_static = regex.useCount_registry_to_nModules

        regexes.append(regex)
      except KeyboardInterrupt:
        raise
      except BaseException as err:
        # Best-effort: log the bad record and keep going.
        libLF.log('Exception parsing line:\n {}\n {}'.format(line, err))
        traceback.print_exc()

    libLF.log('Loaded {} regexes from {}'.format(len(regexes), regexFile))
    return regexes
def initFromNDJSON(self, jsonStr):
  """Initialize this object from one NDJSON record.

  Args:
    jsonStr: an NDJSON string as produced by the matching toNDJSON()

  Returns:
    self, matching the project's Obj().initFromNDJSON(...) chaining style.
  """
  obj = libLF.fromNDJSON(jsonStr)

  self.regex = libLF.Regex().initFromDict(obj['regex'])

  # Older records may predate these fields; fall back to class defaults.
  self.slTimeout = obj.get('slTimeout', self.MATCH_TIMEOUT_SEC)
  self.powerPumps = obj.get('powerPumps', self.POW_PUMPS)

  self.detectorOpinions = [
    SLRegexDetectorOpinion().initFromDict(doDict)
    for doDict in obj['detectorOpinions']
  ]

  # Get the lang_validPattern dict.
  # The keys are bools, easy conversion.
  self.lang_validPattern = obj['lang_validPattern']

  # Get the lang_pump2timedOut dict.
  # The keys on pump2timedOut should be integers, but they may have been
  # converted to strings (JSON object keys are always strings).
  # Convert back again.
  self.lang_pump2timedOut = obj['lang_pump2timedOut']
  for lang in self.lang_pump2timedOut:
    pump2timedOut = self.lang_pump2timedOut[lang]
    # Bug fix: iterate over a snapshot of the keys. The original iterated
    # the dict directly while adding/deleting entries, which raises
    # "RuntimeError: dictionary changed size during iteration" as soon as
    # any key needs conversion.
    for k in list(pump2timedOut.keys()):
      if isinstance(k, str):
        pump2timedOut[int(k)] = pump2timedOut[k]
        del pump2timedOut[k]

  return self
def loadRegexFile(regexFile):
  """Return a list of Regex's"""
  libLF.log('Loading regexes from {}'.format(regexFile))

  loaded = []
  with open(regexFile, 'r') as inStream:
    for rawLine in inStream:
      record = rawLine.strip()
      # Ignore blank lines.
      if not record:
        continue

      try:
        # Parse one NDJSON record into a Regex.
        parsed = libLF.Regex()
        parsed.initFromNDJSON(record)
        loaded.append(parsed)
      except KeyboardInterrupt:
        raise
      except BaseException as err:
        # Log the offending record and carry on with the rest of the file.
        libLF.log('Exception parsing line:\n {}\n {}'.format(record, err))
        traceback.print_exc()

  libLF.log('Loaded {} regexes from {}'.format(len(loaded), regexFile))
  return loaded
def main(internetPatternsFile, realPatternsFile, writingDifficultyThreshold):
  """Count how many sufficiently-difficult real regexes also appear in the internet corpus."""
  with ExitStack() as stack:
    internetStream = stack.enter_context(open(internetPatternsFile, 'r'))
    realStream = stack.enter_context(open(realPatternsFile, 'r'))

    internetPatternsDict = getInternetPatternsDict(internetStream)

    # Tallies over the real-regex corpus.
    nRegexesMatchingInternetRegex = 0
    nRegexes = 0
    nRealRegexesAtLeastXDifficult = 0

    for line in realStream:
      # Skip blank lines
      if re.match(r'^\s*$', line):
        continue

      try:
        realRegex = libLF.Regex().initFromNDJSON(line)
        nRegexes += 1

        # Discard patterns that could be independently derived.
        difficulty = libLF.scorePatternWritingDifficulty(realRegex.pattern)
        if difficulty < writingDifficultyThreshold:
          continue
        nRealRegexesAtLeastXDifficult += 1

        if realRegex.pattern not in internetPatternsDict:
          if VERBOSE:
            libLF.log(
              'realPattern /{}/ does not match internet source'.format(
                realRegex.pattern))
          continue

        libLF.log(
          'realPattern /{}/ matches internet source'.format(realRegex.pattern))
        nRegexesMatchingInternetRegex += 1
      except Exception as e:
        libLF.log("Exception?: {}".format(e))
        pass

    # Count how many internet patterns clear the same difficulty bar.
    nInternetRegexesAtLeastXDifficult = sum(
      1 for pat in internetPatternsDict
      if libLF.scorePatternWritingDifficulty(pat) >= writingDifficultyThreshold)

    # Print summary
    print(
      '{}/{} real regexes matched any of the {} internet regexes (among the {} real regexes and {} internet regexes at least {} difficult)'
      .format(nRegexesMatchingInternetRegex, nRegexes,
              len(internetPatternsDict), nRealRegexesAtLeastXDifficult,
              nInternetRegexesAtLeastXDifficult, writingDifficultyThreshold))
def main(regexFiles, outFile):
  """Merge the unique regex patterns from several regexUsage files into outFile.

  Args:
    regexFiles: list of paths, each an NDJSON file of regexUsage records
    outFile: path to write the deduplicated libLF.Regex NDJSON records
  """
  from collections import Counter  # stdlib; used for cross-file duplicate detection

  # Load
  nRegexUsages = 0
  file2regexUsages = {}
  for f in regexFiles:
    file2regexUsages[f] = loadRegexUsages(f)
    nRegexUsages += len(file2regexUsages[f])
  libLF.log('Loaded {} regexUsage\'s from {} files'.format(
    nRegexUsages, len(regexFiles)))

  # Identify unique regexes in each file.
  nPerFileUniquePatterns = 0
  file2patterns = {}
  for f in file2regexUsages:
    file2patterns[f] = {ru.pattern for ru in file2regexUsages[f]}
    # 'DYNAMIC' is a placeholder for patterns not statically known; drop it.
    file2patterns[f].discard('DYNAMIC')
    nPerFileUniquePatterns += len(file2patterns[f])
    libLF.log('{} unique patterns in {}:\n{}' \
      .format(len(file2patterns[f]), f, pprint.pformat(sorted(file2patterns[f]))))
  libLF.log('Counting unique regexes per file, got {} unique regexes'.format(
    nPerFileUniquePatterns))

  # Identify global unique regexes.
  uniqPatterns = set()
  for f in file2patterns:
    uniqPatterns |= file2patterns[f]
  libLF.log('Globally, got {} unique regexes'.format(len(uniqPatterns)))

  # Did we find any intersections among files? Pigeonhole principle.
  if len(uniqPatterns) < nPerFileUniquePatterns:
    # Each per-file set contributes each pattern at most once, so a count > 1
    # means the pattern appeared in more than one file.
    # (Counter is O(n); the old list.count() scan was O(n^2).)
    patternCounts = Counter(
      p for perFilePatterns in file2patterns.values() for p in perFilePatterns)
    duplicates = {p for p, count in patternCounts.items() if count > 1}
    libLF.log('{} regexes appeared in multiple files: {}'.format(
      len(duplicates), duplicates))
  else:
    # (Removed a no-op .format() call -- the message has no placeholders.)
    libLF.log('Each unique regex appeared in only 1 file')

  # Emit
  regexes = [libLF.Regex().initFromRaw(p, {}, {}) for p in uniqPatterns]
  libLF.log('Emitting to {}'.format(outFile))
  with open(outFile, 'w') as outStream:
    for regex in regexes:
      outStream.write(regex.toNDJSON() + '\n')
def main(regexFile, outFile, seed, nInputs, timeout):
  """Run Rex on the regex in regexFile and write the generated inputs to outFile."""
  libLF.log('regexFile {} outFile {} seed {} nInputs {} timeout {}' \
    .format(regexFile, outFile, seed, nInputs, timeout))

  # Get the libLF.Regex
  with open(regexFile, 'r') as inStream:
    ndjsonRecord = inStream.read()
  theRegex = libLF.Regex().initFromNDJSON(ndjsonRecord)
  libLF.log('Generating inputs for regex /{}/'.format(theRegex.pattern))

  # Query Rex
  producerToStrings = getRexInputs(theRegex.pattern, seed, nInputs, timeout)

  # Emit
  result = libLF.RegexPatternAndInputs().initFromRaw(theRegex.pattern,
                                                     producerToStrings)
  libLF.log('Rex generated {} unique inputs for regex /{}/ ({} including duplicates)' \
    .format(len(result.getUniqueInputs()), theRegex.pattern, result.getNTotalInputs()))
  with open(outFile, 'w') as outStream:
    outStream.write(result.toNDJSON())
def main(regexFile, outFile, timeout):
  """Run ReScue on the regex in regexFile and write the generated inputs to outFile."""
  libLF.log('regexFile {} outFile {} timeout {}' \
    .format(regexFile, outFile, timeout))

  # Get the libLF.Regex
  with open(regexFile, 'r') as inStream:
    theRegex = libLF.Regex().initFromNDJSON(inStream.read())
  libLF.log('Generating inputs for regex /{}/'.format(theRegex.pattern))

  # Query ReScue
  generatedInputs = getReScueInputs(theRegex.pattern, timeout)
  libLF.log('ReScue generated {} inputs for regex /{}/'.format(
    len(generatedInputs), theRegex.pattern))

  # Emit
  producerToStrings = {"ReScue": generatedInputs}
  result = libLF.RegexPatternAndInputs().initFromRaw(theRegex.pattern,
                                                     producerToStrings)
  with open(outFile, 'w') as outStream:
    outStream.write(result.toNDJSON())
def main(internetPatternsFile, realPatternsFile, writingDifficultyThreshold):
  """Log which sufficiently-difficult real regexes appear in the internet corpus.

  Args:
    internetPatternsFile: path to the internet-sourced patterns file
    realPatternsFile: path to an NDJSON file of libLF.Regex records
    writingDifficultyThreshold: discard patterns scoring below this
  """
  with ExitStack() as stack:
    internetPatternsStream = stack.enter_context(
      open(internetPatternsFile, 'r'))
    realPatternsStream = stack.enter_context(open(realPatternsFile, 'r'))

    internetPatternsDict = getInternetPatternsDict(internetPatternsStream)

    nRegexesMatchingInternetRegex = 0
    for line in realPatternsStream:
      # Skip blank lines
      if re.match(r'^\s*$', line):
        continue
      try:
        regex = libLF.Regex().initFromNDJSON(line)

        # Discard patterns that could be independently derived.
        if libLF.scorePatternWritingDifficulty(
            regex.pattern) < writingDifficultyThreshold:
          continue

        if regex.pattern in internetPatternsDict:
          libLF.log(
            'realPattern /{}/ matches internet source'.format(
              regex.pattern))
          nRegexesMatchingInternetRegex += 1
        else:
          # Bug fix: was obj['pattern'], but no `obj` exists in this scope.
          # The resulting NameError was silently swallowed by the bare
          # `except`, so non-matching patterns were never logged.
          libLF.log(
            'realPattern /{}/ does not match internet source'.format(
              regex.pattern))
      except KeyboardInterrupt:
        raise
      except Exception:
        # Best-effort: skip records that fail to parse or score.
        # Narrowed from a bare `except` so Ctrl-C still interrupts.
        pass

    libLF.log('{} regexes matched internet sources'.format(
      nRegexesMatchingInternetRegex))
def run(self):
  """Analyze self.regexList and return a list of RegexMetrics (one per regex).

  Pipeline: translate each regex to C#, then (depending on self.analyses)
  run the automaton CLI, compute graph metrics, and predict worst-case
  Spencer (backtracking) performance; finally zip everything into
  RegexMetrics records.

  NOTE(review): on any failure this returns self.regexList (libLF.Regex
  objects), not RegexMetrics -- callers must tolerate the mixed return
  type; confirm this is intended.
  """
  try:
    # Obtain C# patterns
    libLF.log("Generating C# patterns")
    # Replace u flag with i for compatibility with C# and to preserve the
    # presence or absence of flags.
    csharpPatterns = [
      libLF.RegexTranslator.translateRegex(regex.pattern, "", "C#", altUnicodeFlag='i')
      for regex in self.regexList
    ]
    for r, c in zip(self.regexList, csharpPatterns):
      libLF.log("MyTask: /{}/ -> /{}/".format(r.pattern, c))

    # Run the analyses.
    # Each per-regex list defaults to a sentinel (-1 or {}) when its
    # analysis stage is skipped, so the final zip always lines up.
    if AnalysisStages.ANALYZE_AUTOMATON in self.analyses:
      libLF.log("ANALYZE_AUTOMATON")
      automataMeasures = self.runAutomataCLI(csharpPatterns)
      # Graph metrics need non-empty automaton measures AND the stage enabled.
      if len(automataMeasures) and AnalysisStages.ANALYZE_SIMPLE_PATHS in self.analyses:
        libLF.log("ANALYZE_SIMPLE_PATHS")
        nSimplePathsList, averageOutDegreeDensityList = self.computeGraphMetrics(automataMeasures)
      else:
        libLF.log("{} automataMeasures, analyses {} -- skipping computeGraphMetrics".format(len(automataMeasures), self.analyses))
        nSimplePathsList = [ -1 for i in range(len(self.regexList)) ]
        averageOutDegreeDensityList = [ -1 for i in range(len(self.regexList)) ]
    else:
      automataMeasures = [ {} for i in range(len(self.regexList)) ]
      nSimplePathsList = [ -1 for i in range(len(self.regexList)) ]
      averageOutDegreeDensityList = [ -1 for i in range(len(self.regexList)) ]

    if AnalysisStages.ANALYZE_WORST_CASE in self.analyses:
      libLF.log("ANALYZE_WORST_CASE")
      # Perform worst-case analysis on the C#-translated regexes
      regexes_csharp = [
        libLF.Regex().initFromRaw(csharpPattern, {}, {})
        for csharpPattern in csharpPatterns
      ]
      worstCaseSpencerList = self.predictWorstCaseSpencerPerformance(regexes_csharp)
    else:
      worstCaseSpencerList = [
        libLF.SLRegexDetectorOpinion.PRED_COMPLEXITY_UNKNOWN
        for i in range(len(self.regexList))
      ]

    libLF.log("Asserting lengths")
    # NOTE(review): averageOutDegreeDensityList is not length-checked here,
    # unlike its siblings -- presumably an oversight; confirm.
    assert(len(self.regexList) == len(csharpPatterns))
    assert(len(self.regexList) == len(automataMeasures))
    assert(len(self.regexList) == len(nSimplePathsList))
    assert(len(self.regexList) == len(worstCaseSpencerList))

    # Prep and return RegexMetrics[]
    libLF.log("Prepping regexMetricsList")
    regexMetricsList = []
    for regex, csharpPattern, autMeasure, nSimplePaths, averageOutDegreeDensity, worstCaseSpencer in zip(
        self.regexList, csharpPatterns, automataMeasures,
        nSimplePathsList, averageOutDegreeDensityList, worstCaseSpencerList):
      # Prep members for a RegexMetrics
      csharpRegexLen = len(csharpPattern)
      if AnalysisStages.ANALYZE_AUTOMATON in self.analyses:
        validInCSharp = autMeasure['validCSharpRegex']
        if validInCSharp:
          featureVector = autMeasure['featureVector']
          automatonMetrics = autMeasure['automataMeasures']
        else:
          featureVector = {}
          automatonMetrics = {}
        # Re-apply the -1 sentinel if simple paths were not analyzed.
        if AnalysisStages.ANALYZE_SIMPLE_PATHS not in self.analyses:
          nSimplePaths = -1
      else:
        validInCSharp = False
        featureVector = {}
        automatonMetrics = {}

      # Misc metrics
      # Count features with a positive usage count.
      nDistinctFeaturesUsed = 0
      for v in featureVector.values():
        if v is not None and v > 0:
          nDistinctFeaturesUsed += 1
      # Lookaheads/lookbehinds/backreferences can trigger super-linear engines.
      usesSuperLinearFeatures = False
      for abbrv in ["NLKA", "LKA", "NLKB", "LKB", "BKR"]:
        if abbrv in featureVector and featureVector[abbrv] > 0:
          usesSuperLinearFeatures = True

      regexMetrics = RegexMetrics(
        regex.pattern,
        regex.langsUsedInStatic(),
        regex.langsUsedInDynamic(),
        csharpPattern,
        csharpRegexLen,
        validInCSharp,
        featureVector,
        automatonMetrics,
        nSimplePaths,
        nDistinctFeaturesUsed,
        worstCaseSpencer,
        averageOutDegreeDensity,
        usesSuperLinearFeatures
      )
      regexMetricsList.append(regexMetrics)

    libLF.log("Returning regexMetricsList")
    return regexMetricsList
  except BaseException as e:
    libLF.log("Uh oh, hit an exception")
    libLF.log(e)
    traceback.print_exc()
    return self.regexList
def wangPatternToLibLFRegex(pattern):
  """Wrap a raw Wang et al. pattern string in a libLF.Regex (no usage counts)."""
  noStaticUses = {}
  noDynamicUses = {}
  return libLF.Regex().initFromRaw(pattern, noStaticUses, noDynamicUses)