def extractRegexesFromJS(jsFile):
    """Extract regexes from this JS file.

    Runs the module-level `regexExtractor` program with `jsFile` as its only
    argument (via libLF.chkcmd) and loads the program's output as NDJSON.

    Args:
        jsFile: Path of the JavaScript file to scan.

    Returns:
        A libLF.SimpleFileWithRegexes object initialised from the
        extractor's output.
    """
    # Function-scope import so this block does not depend on the file header.
    import shlex

    # Extract.
    # shlex.quote (rather than wrapping in literal single quotes) keeps the
    # command well-formed even when a path itself contains a quote character.
    cmd = "{} {}".format(
        shlex.quote(str(regexExtractor)), shlex.quote(str(jsFile)))
    out = libLF.chkcmd(cmd)

    # Object-ify
    sfwr = libLF.SimpleFileWithRegexes()
    sfwr.initFromNDJSON(out)
    return sfwr
def runExtractor(sourceFile, extractor, registry):
    """Run one regex extractor on one source file.

    The extractor is invoked through libLF.chkcmd with stderr discarded; an
    extractor whose path ends in ".jar" is launched with "java -jar". The
    extractor's output is loaded as NDJSON into a libLF.SimpleFileWithRegexes
    and each of its regexes is converted into a libLF.RegexUsage.

    Args:
        sourceFile: Mapping with at least a 'name' key holding the path of
            the file to scan.
        extractor: Path of the extractor program (str).
        registry: Unused by this function; kept for interface compatibility.

    Returns:
        A list of libLF.RegexUsage on success. On any failure other than
        KeyboardInterrupt the error is logged and None is returned.

    Raises:
        KeyboardInterrupt: always propagated so the user can abort a run.
    """
    # Function-scope import so this block does not depend on the file header.
    import shlex

    fileName = sourceFile['name']
    libLF.log('Extracting regexes from {} using {}'.format(fileName, extractor))

    try:
        # Any special invocation recipe?
        cmdParts = ['java', '-jar'] if extractor.endswith(".jar") else []
        # shlex.quote keeps the command well-formed even for paths that
        # contain quote characters or other shell metacharacters.
        cmdParts.append(shlex.quote(extractor))
        cmdParts.append(shlex.quote(str(fileName)))

        # Extract
        cmd = "{} 2>/dev/null".format(' '.join(cmdParts))
        out = libLF.chkcmd(cmd)

        try:
            sfwr = libLF.SimpleFileWithRegexes()
            sfwr.initFromNDJSON(out)
            if not sfwr.couldParse:
                # Logged only; conversion below still proceeds with whatever
                # regexes the SFWR holds.
                libLF.log('Could not parse: {}'.format(fileName))

            # TODO ruList = libLF.sfwrToRegexUsageList(sfwr)
            basePath = os.path.basename(fileName)  # same for every regex
            ruList = []
            for regex in sfwr.regexes:
                ru = libLF.RegexUsage()
                ru.initFromRaw(regex['pattern'], regex['flags'],
                               None, None, fileName, basePath)
                ruList.append(ru)

            libLF.log('Got {} regexes from {}'.format(len(ruList), fileName))
            return ruList
        except KeyboardInterrupt:
            raise
        except Exception as err:
            libLF.log(
                'Error converting output from SFWR to RU: {}\n  {}'.format(
                    out, err))
    except KeyboardInterrupt:
        raise
    except BaseException as err:
        # Deliberately broad: one failing extractor run is logged rather
        # than allowed to abort the caller.
        libLF.log('Error extracting regexes from {} using {}: {}'.format(
            fileName, extractor, err))

    # Reached only after a logged failure.
    return None
def main(tsFile):
    """Print, as NDJSON, the regexes found in a TypeScript file.

    The file is transpiled to a temporary JavaScript file, regexes are
    extracted from that JavaScript, and the result is relabelled with the
    original TypeScript file name. If any step fails, the error is logged
    and a record with couldParse=0 and no regexes is printed instead.

    Args:
        tsFile: Path of the TypeScript file to process.
    """
    checkDependencies([transpiler, regexExtractor])

    # mkstemp returns an already-open descriptor; only the path is needed,
    # so close it right away instead of leaking it.
    tmpFd, jsTmpFile = tempfile.mkstemp(suffix='.js')
    os.close(tmpFd)

    sfwr = libLF.SimpleFileWithRegexes()
    try:
        # Get regexes from JS version
        transpile(tsFile, jsTmpFile)
        sfwr = extractRegexesFromJS(jsTmpFile)

        # Tweak result a bit -- real file name, not temp file
        sfwr.fileName = tsFile
    except BaseException as err:
        libLF.log('Error: {}'.format(err))
        sfwr.initFromRaw(fileName=tsFile, language='typescript',
                         couldParse=0, regexes=[])
    finally:
        # Clean up on success and failure alike. A failed removal is logged
        # but does not discard regexes that were extracted successfully.
        try:
            os.remove(jsTmpFile)
        except OSError as err:
            libLF.log('Error: {}'.format(err))

    print(sfwr.toNDJSON())
def retrieveRegexes(regexOutputFileName):
    """Load the regexes recorded in an extractor output file.

    Each line of the file is parsed with libLF.fromNDJSON and is expected to
    yield an object with 'file', 'pattern' and 'flags' entries. Lines that
    cannot be parsed are logged and skipped.

    Since regexOutputFileName contains regexes from multiple source files,
    multiple files are represented in the returned list. Duplicates by
    <file, pattern> are removed; the last record seen for a pair wins.

    Args:
        regexOutputFileName: Path of the NDJSON file to read.

    Returns:
        libLF.RegexUsage[]
    """
    libLF.log("Loading regexes from {}".format(regexOutputFileName))

    # Bin by file, removing duplicates
    file2uniqRegexes = {}  # x[filename][pattern] = record
    with open(regexOutputFileName, mode='r') as regexStream:
        for line in regexStream:
            # Try to parse as NDJSON.
            # In Java we rely on a "poor man's JSON" implementation which may
            # sometimes produce malformed strings. In other languages, this
            # should always work.
            try:
                obj = libLF.fromNDJSON(line)
            except Exception:
                # Not a bare `except:` -- Ctrl-C and SystemExit must not be
                # mistaken for a malformed line.
                libLF.log("Could not fromNDJSON line: {}".format(line))
                continue

            file2uniqRegexes.setdefault(obj['file'], {})[obj['pattern']] = {
                'pattern': obj['pattern'],
                'flags': obj['flags'],
            }

    # Convert to libLF.RegexUsage[] via libLF.SimpleFileWithRegexes
    ruList = []
    for fileName, uniqRegexes in file2uniqRegexes.items():
        # Construct first, then initialise, so this does not depend on
        # whatever initFromRaw happens to return.
        sfwr = libLF.SimpleFileWithRegexes()
        sfwr.initFromRaw(fileName, "XXX", True, list(uniqRegexes.values()))
        ruList += libLF.sfwrToRegexUsageList(sfwr)

    return ruList
def main(rustc, rustFile, dumpTokenTree):
    """Print, as NDJSON, the regex patterns found in a Rust source file.

    If fileMightContainRegexes(rustFile) is false the file is not analysed
    and an empty regex list is reported. Otherwise the token tree is
    obtained with getTokenTree and walked with a FrontierVisitor to collect
    the patterns. Every reported regex has empty flags.

    Args:
        rustc: The rustc program to use; checked with
            libLF.checkShellDependencies and passed to getTokenTree.
        rustFile: Path of the Rust file to process.
        dumpTokenTree: If truthy, also log the token tree as indented JSON
            (only when a token tree was actually built).

    Raises:
        SystemExit: with status 1 if getting or walking the token tree fails.
    """
    libLF.checkShellDependencies([rustc])

    # Stays None when the file is skipped, so the optional dump below cannot
    # hit an unbound local.
    tokenTree = None
    patterns = []

    if fileMightContainRegexes(rustFile):
        libLF.log('File might contain regexes, proceeding...')
        try:
            libLF.log('Getting token tree')
            tokenTree = getTokenTree(rustc, rustFile)
        except BaseException as err:
            libLF.log('Error getting token tree: {}'.format(err))
            sys.exit(1)

        try:
            libLF.log('Walking token tree')
            visitor = FrontierVisitor()
            walkTokenTree(tokenTree, visitor)
            patterns = visitor.getRegexPatterns()
            libLF.log('Extracted {} patterns'.format(len(patterns)))
        except BaseException as err:
            libLF.log('Error walking token tree: {}'.format(err))
            sys.exit(1)
    else:
        libLF.log('File does not contain "Regex", no regexes possible')

    regexes = [{'pattern': p, 'flags': ''} for p in patterns]
    sfwr = libLF.SimpleFileWithRegexes()
    sfwr.initFromRaw(fileName=rustFile, language='rust',
                     couldParse=1, regexes=regexes)
    print(sfwr.toNDJSON())

    if dumpTokenTree:
        if tokenTree is None:
            libLF.log('No token tree to dump')
        else:
            # "Pretty" JSON makes it easier for humans to decode
            asJSON = json.dumps(tokenTree, indent=2, separators=(',', ':'))
            libLF.log('\n' + asJSON)