def get_command_list(cmd, verbose=False):
    """Parse a bash command string into a list of bashlex "command" nodes.

    Falls back to TokenOverlap's rule-based splitter whenever bashlex
    cannot handle the input.

    :param cmd: the bash command string
    :param verbose: print a note when an unsupported structure is skipped
    :return: list of bashlex command nodes (falsy entries filtered out)
    """
    try:
        parse = bashlex.parse(cmd)
    except (bashlex.errors.ParsingError,
            bashlex.tokenizer.MatchedPairError,
            NotImplementedError,
            AttributeError):
        # Four identical handlers collapsed into one tuple: bashlex chokes
        # on some inputs; use the rule-based fallback for all of them.
        return TokenOverlap.get_command_list_rule_based(cmd)
    command_list = []
    if parse[0].kind == "pipeline":
        for node in parse[0].parts:
            if node.kind == "command":
                command_list.append(node)
    elif parse[0].kind == "command":
        command_list = [parse[0]]
    elif parse[0].kind == "list":
        if len(parse[0].parts) == 2:
            # command + operator
            for node in parse[0].parts:
                if node.kind == "command":
                    command_list.append(node)
        else:
            if verbose:
                # print(...) works identically under Python 2 and 3 for a
                # single argument; the bare print statement was py2-only.
                print("Skipped: ground truth contains multiple statements")
            # return TokenOverlap.get_command_list_rule_based(cmd)
    else:
        if verbose:
            print("Unrecognized node type: " + parse[0].kind)
        # return TokenOverlap.get_command_list_rule_based(cmd)
    return [command for command in command_list if command]
def parse(self, script: str) -> Optional[AST]:
    """Normalize curly quotes in *script*, then parse it into an AST.

    Returns None when bashlex or the AST conversion fails for any reason.
    """
    # Replace typographic double quotes with plain ASCII ones so bashlex
    # tokenizes them as quoting characters.
    normalized = script.replace('”', '"').replace('“', '"')
    try:
        first_node = bashlex.parse(normalized)[0]
        return bashlex_ast_to_ast(normalized, first_node, self.split_value)
    except Exception:  # noqa
        return None
def eval_cmdline(cmdline, debug_file=None):
    """Parse and evaluate a command line, replaying captured output.

    Backslash-newline continuations are collapsed before parsing.  Each
    top-level AST is dumped to *debug_file*, evaluated with a fresh
    BashEvalVisitor, and the visitor's captured stdout/stderr are written
    to the real streams, followed by the exit code.
    """
    joined = cmdline.replace('\\\n', ' ')
    for tree in bashlex.parse(joined):
        print(tree.dump(), file=debug_file)
        evaluator = BashEvalVisitor(os.environ)
        evaluator.visit(tree)
        sys.stdout.write(evaluator.stdout.getvalue())
        sys.stderr.write(evaluator.stderr.getvalue())
        print('$?: {}'.format(evaluator.exitcode), file=debug_file)
def bash_to_gast(bash_input):
    """Parse *bash_input* with bashlex and convert it to a gast tree.

    :param bash_input: bash source code as a string
    :return: the converted gast tree, or an error string when parsing fails
    """
    try:
        input_ast = bashlex.parse(bash_input)
    except Exception:
        # Was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt; `Exception` keeps the same error reporting
        # without masking interpreter-level signals.
        return 'Error: code could not compile'
    return bash_router.node_to_gast(input_ast)
def start(self, code):
    """Parse *code*, visit every top-level node, and return the markers."""
    stripped = code.rstrip()
    self.code = stripped
    print(stripped)
    # Parse before resetting markers so a parse failure leaves them untouched.
    parsed = bashlex.parse(stripped)
    self.markers = []
    for top_node in parsed:
        self.visit(top_node)
    return self.markers
def get_cmd_tokens(cmd):
    """Yield bashlex nodes for *cmd*, flattening top-level 'list' nodes."""
    for node in bashlex.parse(cmd):
        if node.kind == 'list':
            # A 'list' node groups several commands; yield its parts directly.
            for part in node.parts:
                yield part
        else:
            yield node
def load_data_for_info_from_man_page(cmd_text):
    """
    retrieve info about the currently selected cmd from the man page

    :param cmd_text: the bash cmd string
    :return: [True, structured list with info for each cmd and flags]
             [False, error string]
    """
    # here the man search and parse
    parser = BashParser()
    # create a result var to fill
    flags_for_info_cmd = list()
    # parse the cmd string
    try:
        # the system may not have bashlex installed
        from bashlex import parse
        cmd_parsed = parse(cmd_text)
    except ImportError:
        return [False, "install bashlex to enable this"]
    except Exception:
        # Was a bare `except:`; `Exception` keeps the same "unparsable
        # input" fallback without catching SystemExit/KeyboardInterrupt.
        return [False, "bashlex cannot read this command"]

    # find all flags for each commands
    parser.get_flags_from_bash_node(cmd_parsed, flags_for_info_cmd)

    # for each cmd and flag find the meaning from the man page
    man_parsed = ManParser()
    for item in flags_for_info_cmd:
        cmd_main = item[BashParser.INDEX_CMD]
        cmd_flags = item[BashParser.INDEX_FLAGS]
        if man_parsed.load_man_page(cmd_main[BashParser.INDEX_VALUE]):
            # save cmd meaning
            cmd_main[BashParser.INDEX_MEANING] = man_parsed.get_cmd_meaning()
            # cmd meaning found in the man page
            if cmd_main[BashParser.INDEX_MEANING]:
                cmd_flags_updated = list()
                # iterate flags directly (the old index variable was unused)
                for flag in cmd_flags:
                    flag[BashParser.INDEX_MEANING] = \
                        man_parsed.get_flag_meaning(flag[BashParser.INDEX_VALUE])
                    # if flag found in the man page
                    if flag[BashParser.INDEX_MEANING]:
                        cmd_flags_updated.append(flag)
                    else:
                        # try to check if flag is concatenated
                        conc_flags = BashParser.decompose_possible_concatenated_flags(
                            flag[BashParser.INDEX_VALUE])
                        for conc_flag in conc_flags:
                            conc_flag_meaning = man_parsed.get_flag_meaning(
                                conc_flag)
                            cmd_flags_updated.append(
                                [conc_flag, conc_flag_meaning])
                # set the updated flags as new list of flags, the old list is deleted
                item[BashParser.INDEX_FLAGS] = cmd_flags_updated
    return [True, flags_for_info_cmd]
def _bashlex_wrapper(self, command):
    """Parse *command* with bashlex, falling back to a crude split.

    bashlex fails to parse some syntax combinations; in that case the
    command is split on <space>, <comma> and <curly_brackets> and the raw
    fragments are returned (to be added directly to the counter as keys).

    :param command: shell command string
    :return: list of bashlex nodes, or list of string fragments on failure
    """
    try:
        return bashlex.parse(command)
    except (bashlex.errors.ParsingError, NotImplementedError, TypeError):
        self.ERR += 1
        # re.split already returns the list the old loop rebuilt by hand.
        return re.split(r" |,|{|}", command)
def bashlex_parse(src):
    # NOTE(review): the `src` parameter is immediately overwritten by the
    # contents of the (presumably global) `fn` — the argument is effectively
    # ignored.  Confirm whether this should operate on `src` instead.
    # NOTE(review): the file handle is never closed; consider `with open(fn)`.
    src = open(fn).read()
    # sub1/sub2 are compiled-regex .subn callables defined elsewhere;
    # each returns (new_string, replacement_count).
    src, srcr = sub1('', src)
    src, srcr = sub2('\n', src.strip())
    #sub3 = re.compile(r'(?<![{\(])(\s*\n\s*)+', re.M).subn
    #src, srcr = sub3(';', src.strip())
    print(src)
    #src = """
    #test -n "$scriptpath" || export scriptpath="$(pwd -P)"
    #""".strip()
    import bashlex
    parts = bashlex.parse(src)
    # Dump the parse tree of every top-level statement.
    for ast in parts:
        print(ast.dump())
def extract(self, node):
    """Extract regions containing `self.cmdname` commands from an HTML node."""
    ''' Save the original command, with carated arguments escaped. '''
    orig_text = node.text
    orig_text_safe = self._replace_carats(orig_text)
    ''' Make a copy of the node and split on <br> tags. '''
    node_copy = copy.copy(node)
    for br in node_copy.select('br'):
        br.replace_with(RARE_CHARACTER)
    splittable_text = node_copy.text
    text_blocks = splittable_text.split(RARE_CHARACTER)
    regions = []
    # Running character offset of the current block within the full text;
    # used to map bashlex positions back into `orig_text_safe`.
    offset = 0
    for text in text_blocks:
        text = self._replace_carats(text)
        text = self._replace_leading_redirects(text)
        text = self._clean_for_bashlex(text)
        if not text.isspace():
            try:
                tree = bashlex.parse(text)
                valid_script = True
            except bashlex.errors.ParsingError:
                valid_script = False
            except Exception as e:
                # Unexpected bashlex failure: skip this block but log it.
                valid_script = False
                logging.error("Bash parsing error: %s, for script %s",
                              str(e), text)
            if valid_script:
                nodes = get_descendants(tree)
                commands = [n for n in nodes if n.kind == 'command']
                for c in commands:
                    # Keep only commands matching the target name that
                    # actually carry arguments.
                    if self._is_target_command(
                            c, self.cmdname) and self._has_arguments(c):
                        start_char = offset + self._get_start(c, self.cmdname)
                        # bashlex pos is an exclusive end; -1 makes it inclusive
                        end_char = offset + c.pos[1] - 1
                        string = orig_text_safe[start_char:end_char + 1]
                        r = Region(node, start_char, end_char, string)
                        regions.append(r)
        offset += len(text)
    return regions
def start(self, code):
    """Parse *code* and visit each top-level node.

    :param code: bash source text (trailing whitespace is stripped)
    :return: (markers, command_range); both empty when parsing fails
    """
    self.command_range = {}
    self.markers = []
    code = code.rstrip()
    self.code = code
    try:
        parts = bashlex.parse(code)
    except Exception:
        # Was a bare `except:` — narrowed to `Exception` so SystemExit /
        # KeyboardInterrupt are not swallowed.
        # todo: handle bash parsing error
        # raise ValueError('Error parsing the bash script')
        return self.markers, self.command_range
    for node in parts:
        self.visit(node)
    return self.markers, self.command_range
def is_assignment(line):
    """Return ['NAME', 'value', ...] if *line* contains a bash assignment.

    If this is an assignment node we need to add it to the env dict for
    later replacement in commands.  Returns None for anything that is not
    an assignment (or that bashlex cannot parse).
    """
    try:
        parts = bashlex.parse(line)
    except Exception:
        # bashlex does not get along well with some inline conditionals
        # and may emit ParsingError; if that's the case, it's not an
        # assignment, so move along.  (Was a bare `except:`; `Exception`
        # avoids swallowing KeyboardInterrupt/SystemExit.)
        return None
    for ast in parts:
        if ast.kind != 'compound':
            # ignore multi part commands
            for part in ast.parts:
                if part.kind == 'assignment':
                    return part.word.split('=')
    return None
def get_arguments(command, cmd_pattern):
    """Given a single command, return the list of its argument words.

    Arguments are the word nodes that appear after the first word
    matching *cmd_pattern*.
    """
    first_node = bashlex.parse(command)[0]
    seen_command = False
    args = []
    for part in first_node.parts:
        if not seen_command:
            # Arguments only start once the command name itself is found.
            if part.kind == 'word' and re.match(cmd_pattern, part.word):
                seen_command = True
        elif part.kind == 'word':
            args.append(part.word)
    return args
def parse(doc):
    """Parse *doc* into a Block, ignoring blank lines and edge whitespace."""
    # Strip every line and drop the empty ones before handing to bashlex.
    kept = [line.strip() for line in doc.strip().split('\n') if line.strip()]
    parsed = bashlex.parse('\n'.join(kept))
    blk = odsh_ast.Block()
    for node in parsed:
        Transformer(blk).transform_node(node)
    return blk
def test_parse(self):
    # Round-trip a set of bash command lines through bashlex and
    # BashParser.get_flags_from_bash_node, comparing against the expected
    # [[cmd, meaning], [[flag, meaning], ...]] structure per command
    # (meanings are None because no man-page lookup happens here).
    self.parser = bashParser.BashParser()
    test_list = [
        [
            "ls -a -b -l; ls -a -v ",
            [[['ls', None],
              [['-a', None], ['-b', None], ['-l', None], ['-v', None]]]]
        ],
        [
            "sudo blkid -trv | grep swap -r",
            [[['blkid', None], [['-trv', None]]],
             [['grep', None], [['swap', None], ['-r', None]]]]
        ],
        [
            "srm -lsv /media/cecss/ # comment",
            [[['srm', None], [['-lsv', None], ['/media/cecss/', None]]]]
        ],
        [
            'cat "$(ls)" -v',
            [[['cat', None], [['$(ls)', None], ['-v', None]]]]
        ],
        [
            'while true; do lsof /path/to/file; done;',
            [[['lsof', None], [['/path/to/file', None]]]]
        ],
        [
            "echo -e '\e]8;;htts://twitter.com/\aTwitter link\e]8;;\a'",
            [[['echo', None], [['-e', None], [
                '\\e]8;;htts://twitter.com/\x07Twitter link\\e]8;;\x07',
                None
            ]]]]
        ]  # this break regex engine
    ]
    for test in test_list:
        result = list()
        cmd_parsed = bashlex.parse(test[0])
        self.parser.get_flags_from_bash_node(cmd_parsed, result)
        self.assertEqual(result, test[1])
def _is_not_prose(self, cmdtext):
    """Heuristic: does *cmdtext* look like an actual wget invocation
    rather than prose that merely mentions wget?"""
    urls = 0
    flags = 0
    has_var = False
    seen_cmdname = False
    head = bashlex.parse(cmdtext)[0]
    for part in head.parts:
        has_word = hasattr(part, 'word')
        if seen_cmdname and has_word:
            if part.word.startswith('-'):
                flags += 1
            else:
                urls += 1
                if part.word.startswith('$'):
                    has_var = True
        # The command name itself may appear anywhere in the node list.
        if has_word and re.match(WGET_PATT, part.word):
            seen_cmdname = True
    return has_var or flags > 0 or urls == 1
def checkDepedencies(pipeline):
    """Scan a pipeline script for shell commands and report missing ones.

    NOTE(review): the name keeps the original "Depedencies" typo;
    renaming would break existing callers.

    :param pipeline: path to a Python pipeline script
    :return: (deps, check_path_failures) where deps maps program name ->
             call count and check_path_failures lists names not on PATH
    """
    # check existence of pipeline script
    if not os.access(pipeline, os.R_OK):
        raise IOError("pipeline %s was not found\n" % pipeline)

    if os.path.isdir(pipeline):
        raise IOError("The given input is a folder, and must be a script\n")

    # parse pipeline script
    with open(pipeline) as f:
        tree = ast.parse(f.read())

    # list to store all statements = ''' <commands> '''
    statements = []

    # inspired by
    # https://docs.python.org/3/library/ast.html#module-ast
    # http://bit.ly/2rDf5xu
    # http://bit.ly/2r0Uv9t
    # really helpful, used astviewer (installed in a conda-env) to inspect examples
    # https://github.com/titusjan/astviewer
    for node in ast.walk(tree):
        statement = ""
        if is_cgat_statement(node) or \
           is_cgat_executable(node) or \
           is_cgat_executable_name(node):
            statement = get_cmd_string(node)
        elif is_cgat_append(node):
            statement = get_append_string(node)
        if len(statement) > 0 and not statement.startswith(' -'):
            #print(statement)
            statement = cleanup_statement(statement)
            statements.append(statement)

    # dictionary where:
    # key = program name
    # value = number of times it has been called
    deps = {}

    # set of names that are not proper deps
    exceptions = ['create', 'drop', 'select', 'attach', 'insert', 'module',
                  '%s', 'tool', 'cmd-farm', 'cmd-sql', 'cmd_extract', 'cmds',
                  'compress', 'conda_env', 'filter_cmd', 'load_statement',
                  'match_executable', 'rscript', 'paired_processing',
                  'executable', 'transform', 'uncompress', 'execglam2',
                  'execglam2scan', 'CGATparameter', 'checkpoint', 'for']

    for statement in statements:
        # use bashlex to parse statements
        commands = []
        try:
            #print(statement)
            parts = bashlex.parse(statement)
            get_cmd_names(parts[0], commands)
        except bashlex.errors.ParsingError:
            # statements that are not valid bash are simply skipped
            pass
        for command in commands:
            #print(command)
            if command.lower() not in exceptions:
                if command not in deps:
                    deps[command] = 1
                else:
                    deps[command] += 1

    # list of unmet dependencies
    check_path_failures = []

    # print dictionary ordered by value
    for k in sorted(deps, key=deps.get, reverse=True):
        if shutil.which(k) is None:
            check_path_failures.append(k)

    return deps, check_path_failures
# This file is used to parse the bash script
# The input is a bash file.
import sys
sys.path.append("../bashlex")
import bashlex

if len(sys.argv) < 2:
    print("usage: %s <infile.sh>" % sys.argv[0])
    exit(0)

f = sys.argv[1]
# Read the whole file at once: the old readlines()-and-concatenate loop
# rebuilt the string line by line (quadratic) for the same result.
with open(f) as infile:
    cmd = infile.read()

parts = bashlex.parse(cmd)
for ast in parts:
    print(ast.dump())
import bashlex

# Command to convert into C beep structs, e.g. "beep -f 440 -l 200 -n ...".
# NOTE(review): beep_cmd is empty here — bashlex.parse("")[0] will fail;
# presumably the command string is meant to be pasted in before running.
beep_cmd = ""
# Skip the leading "beep" word itself; keep only option/value tokens.
parsed = bashlex.parse(beep_cmd)[0].parts[1:]
beeps = []
current_structure = {"freq": 0, "len": 100, "delay": 0}
# (expects_value, key of current_structure the next token fills)
next_is_val = (False, "")
for x in parsed:
    word = x.word
    if word == "-n":
        # "-n" terminates the current beep entry and starts a new one.
        beeps.append(current_structure)
        current_structure = {"freq": 0, "len": 100, "delay": 0}
    elif word == "-f":
        next_is_val = (True, "freq")
    elif word == "-l":
        next_is_val = (True, "len")
    elif word == "-D":
        next_is_val = (True, "delay")
    else:
        if not next_is_val[0]:
            # A bare value with no preceding option flag is an error.
            print("invalid syntax")
            exit(1)
        current_structure[next_is_val[1]] = int(word)
# The last entry is never followed by "-n", so flush it explicitly.
beeps.append(current_structure)
# Emit C designated-initializer struct literals.
for x in beeps:
    print("{.freq = " + str(x["freq"]) + ", .len = " + str(x["len"]) +
          ", .delay = " + str(x["delay"]) + "},")
import bashlex
import json

# Sample script exercising several bash constructs (lists, pipes,
# assignments, subshells); dump the parse tree of each statement.
SCRIPT = '''
ls /
echo "Hello world!" && true
ls / | grep etc
ENV="aaa=b"
exit
(a && b) && c
'''.strip()

for tree in bashlex.parse(SCRIPT):
    print(tree.dump())
def checkDepedencies(pipeline):
    """Scan a pipeline script for shell commands and report missing ones.

    NOTE(review): the name keeps the original "Depedencies" typo;
    renaming would break existing callers.

    :param pipeline: path to a Python pipeline script
    :return: (deps, check_path_failures) where deps maps program name ->
             call count and check_path_failures lists names not on PATH
    """
    # check existence of pipeline script
    if not os.access(pipeline, os.R_OK):
        raise IOError("Pipeline %s was not found\n" % pipeline)

    if os.path.isdir(pipeline):
        raise IOError("The given input is a folder, and must be a script\n")

    # parse pipeline script
    with open(pipeline) as f:
        tree = ast.parse(f.read())

    # list to store all statements = ''' <commands> '''
    statements = []

    # inspired by
    # https://docs.python.org/3/library/ast.html#module-ast
    # http://bit.ly/2rDf5xu
    # http://bit.ly/2r0Uv9t
    # really helpful, used astviewer (installed in a conda-env) to inspect examples
    # https://github.com/titusjan/astviewer
    for node in ast.walk(tree):
        statement = ""
        if is_cgat_statement(node) or \
           is_cgat_executable(node) or \
           is_cgat_executable_name(node) or \
           is_cgat_cmd(node):
            statement = get_cmd_string(node)
        elif is_cgat_append(node):
            statement = get_append_string(node)
        if len(statement) > 0 and not statement.startswith(' -'):
            #print(statement)
            statement = cleanup_statement(statement)
            statements.append(statement)

    # dictionary where:
    # key = program name
    # value = number of times it has been called
    deps = {}

    # set of names that are not proper deps
    exceptions = ['create', 'drop', 'select', 'attach', 'insert', 'module',
                  '%s', 'tool', 'cmd-farm', 'cmd-sql', 'cmd_extract', 'cmds',
                  'compress', 'conda_env', 'filter_cmd', 'load_statement',
                  'match_executable', 'rscript', 'paired_processing',
                  'executable', 'transform', 'uncompress', 'execglam2',
                  'execglam2scan', 'checkpoint', 'for']

    for statement in statements:
        # use bashlex to parse statements
        commands = []
        try:
            #print(statement)
            parts = bashlex.parse(statement)
            get_cmd_names(parts[0], commands)
        except bashlex.errors.ParsingError:
            # statements that are not valid bash are simply skipped
            pass
        for command in commands:
            #print(command)
            if command.lower() not in exceptions:
                if command not in deps:
                    deps[command] = 1
                else:
                    deps[command] += 1

    # list of unmet dependencies
    check_path_failures = []

    # print dictionary ordered by value
    for k in sorted(deps, key=deps.get, reverse=True):
        if shutil.which(k) is None:
            check_path_failures.append(k)

    return deps, check_path_failures
# try: # if (parts[0].parts[i].word) == 'rm' or (parts[0].parts[i].word) == 'unlinlk': # print(bashCommand) # # i = i + 1 # except IndexError: # break # print(type(parts)) # print(parts[0]) # print(parts) # import json # print(json.load(open('commands.json',mode='r'))) # Finding Malacious commands # import bashlex # to check if a command is trying to update a temp directory. parts = bashlex.parse('while true; do echo \'Ctrl c to kill\'; sleep 1; done') # parts = bashlex.parse('ls > /tmp/foo') # print(parts[0].dump()) # print(parts[0].parts[2].type) # # print(parts[0].parts[2].word) # print(parts[0].parts[2].output.word) print(parts[0].dump()) print(parts[0].list[0]) # s = '/tmp' # if s in '/tmp/foo': # print("hey")
# %%
# pip install bashlex
import bashlex

# Demonstrates parsing of process substitution nested inside a list.
for tree in bashlex.parse('true && cat <(echo $(echo foo))'):
    print(tree.dump())
def parse(cmd):
    """Parse *cmd* and convert its first top-level node (with no parent)."""
    first_node = bashlex.parse(cmd)[0]
    return parseNode(first_node, None)
def parsefile(filepath, fff):
    """Parse a Dockerfile: collect ENV variables into env_dict, then walk
    every RUN instruction's bash AST with nodevisitor.

    NOTE(review): the file handle from open(filepath, ...) is never
    closed, and the RUN handler catches BaseException (swallowing even
    KeyboardInterrupt) — both worth revisiting.
    """
    env_dict.clear()
    parsed_df = parseDockerfile(open(filepath, encoding="UTF-8").read())
    for dic in parsed_df:
        if dic.get("ENV") != None:
            s = dic["ENV"]
            if len(s.split("=")) > 1:
                # "KEY=value" form: pad each '=' with spaces so it becomes
                # a standalone token, then scan token-by-token.
                sp_eq = s.split("=")
                s = ""
                for i in range(len(sp_eq)):
                    if i == len(sp_eq) - 1:
                        s += sp_eq[i]
                    else:
                        s += sp_eq[i] + " = "
                lexes = s.split()
                ii = 0
                while ii < len(lexes):
                    # print(lexes[ii])
                    if lexes[ii] == "=":
                        # Collect the value tokens up to the next "KEY ="
                        # pair (or end of line).
                        r = ""
                        jj = ii + 1
                        while jj < len(lexes) and lexes[jj] != "=":
                            if jj + 1 < len(lexes) and lexes[jj + 1] == "=":
                                break
                            r += (" " + lexes[jj])
                            jj += 1
                        print("Inserting to Env dict: " + lexes[ii - 1] +
                              " " + r.strip('"\''))
                        env_dict[lexes[ii - 1]] = r.strip('"').strip("'")
                    ii += 1
            else:
                # No '=' present: space-separated "KEY value..." form.
                if len(s.split()) > 1:
                    if len(s.split()) == 2:
                        if len(s.split()[0].split("=")) > 1 and len(
                                s.split()[1].split("=")) > 1:
                            # Two "k=v" pairs separated by a space.
                            for ss in s.split():
                                print("Inserting to Env dict: " +
                                      ss.split("=")[0] + " " +
                                      ss.split("=")[1])
                                env_dict[ss.split("=")[0].strip()] = ss.split(
                                    "=")[1].strip()
                        else:
                            # Plain "KEY value".
                            print("Inserting to Env dict: " +
                                  dic["ENV"].split()[0] + " " +
                                  dic["ENV"].split()[1])
                            env_dict[dic["ENV"].split()
                                     [0]] = dic["ENV"].split()[1]
                    else:
                        # "KEY several value words": join the tail.
                        ss = ""
                        for i in range(1, len(s.split())):
                            ss += s.split()[i] + " "
                        print("Inserting to Env dict: " + s.split()[0] +
                              " " + ss.strip())
                        env_dict[s.split()[0]] = ss.strip()
                else:
                    # Single token: only useful if it is itself "k=v".
                    if len(dic["ENV"].split("=")) > 1:
                        print("Inserting to Env dict: " +
                              dic["ENV"].split("=")[0] + " " +
                              dic["ENV"].split("=")[1])
                        env_dict[dic["ENV"].split("=")[0].strip(
                        )] = dic["ENV"].split("=")[1].strip()
        if dic.get("RUN") != None:
            try:
                parts = bashlex.parse(dic["RUN"])
                positions = []
                for ast in parts:
                    visitor = nodevisitor(positions, fff)
                    visitor.visit(ast)
            except BaseException as e:
                # NOTE(review): BaseException is very broad here.
                print(e)
def parse(self, code, **kwargs):
    """Parse *code* with bashlex, dump the parse tree, and use it to
    dynamically update this AstModule (extra **kwargs are accepted
    but unused)."""
    tree = bashlex.parse(code)
    dumped = dump_bash(tree)
    return self.load(dumped)
def value_to_str(ast: Union[AST, List[AST]]) -> Optional[str]:
    """Render a parsed bash AST (or list of ASTs) back into source text.

    :param ast: a Node, a Leaf (token), or a list of ASTs
    :return: the reconstructed source string, or None when any sub-node
             cannot be rendered (or an unexpected error occurs)
    """
    try:
        if isinstance(ast, Node):
            return _node_to_str(ast)
        elif isinstance(ast, Leaf):  # Token
            return cast(str, ast.value)
        elif isinstance(ast, list):
            return "".join(map(lambda x: str(value_to_str(x)), ast))
    except:  # noqa
        pass
    return None


def _join_values(values: list, sep: str) -> Optional[str]:
    # Render every child and join with `sep`; fail (None) if any child fails.
    elems = [value_to_str(p) for p in values]
    if None in set(elems):
        return None
    return sep.join([token for token in elems if token is not None])


# Node types that simply delegate to their first field's value.
_DELEGATE = {"Operator", "Pipe", "Literal", "Tilde", "ReservedWord", "Heredoc"}
# Node types whose children are concatenated with no separator.
_CONCAT = {"List", "Pipeline", "For", "Word", "Assign"}
# Node types whose children are joined with single spaces.
# TODO: deal with newline handling for If/While/Until.
_SPACED = {"If", "While", "Until", "Command"}


def _node_to_str(node: Node) -> Optional[str]:
    # Dispatch on the node's declared type name.
    n = node.type_name
    if n in _DELEGATE:
        return value_to_str(node.fields[0].value)
    if n in _CONCAT:
        return _join_values(cast(list, node.fields[0].value), "")
    if n in _SPACED:
        return _join_values(cast(list, node.fields[0].value), " ")
    if n == "Compound":
        body = _join_values(cast(list, node.fields[0].value), "")
        if body is None:
            return None
        redirects = _join_values(cast(list, node.fields[1].value), "")
        if redirects is None:
            return None
        return f"{body} {redirects}"
    if n == "Function":
        # fields[1] holds the body, fields[0] the function name.
        body = _join_values(cast(list, node.fields[1].value), "")
        if body is None:
            return None
        name = value_to_str(cast(AST, node.fields[0].value))
        if name is None:
            return None
        return f"function {name}()" + "{" + body + "}"
    if n == "Parameter":
        p = value_to_str(node.fields[0].value)
        if p is None:
            return None
        return "${" + p + "}"
    if n == "Redirect":
        return _redirect_to_str(node)
    if n == "ProcessSubstitution":
        command = value_to_str(node.fields[0].value)
        t = value_to_str(node.fields[1].value)
        if command is None or t is None:
            return None
        return f"{t}({command})"
    if n == "CommandSubstitution":
        command = value_to_str(node.fields[0].value)
        if command is None:
            return None
        # Prefer $(...) syntax, but fall back to backticks when the
        # substituted command is not reparseable in that form.
        try:
            bashlex.parse(f"$({command})")
            return f"$({command})"
        except:  # noqa
            return f"`{command}`"
    if n == "None":
        return ""
    # Unknown node type: the AssertionError is caught by value_to_str's
    # top-level handler and turned into None (as in the original).
    assert (False)


def _redirect_to_str(node: Node) -> Optional[str]:
    # fields: 0 = redirect operator, 1 = heredoc, 2 = source, 3 = target.
    t = value_to_str(node.fields[0].value)
    if t is None:
        return None
    if cast(AST, node.fields[1].value).get_type_name() != "None":
        heredoc = value_to_str(node.fields[1].value)
    else:
        heredoc = ""
    if heredoc is None:
        return None
    if cast(AST, node.fields[2].value).get_type_name() != "None":
        source = value_to_str(node.fields[2].value)
    else:
        source = ""
    if source is None:
        return None
    if cast(AST, node.fields[3].value).get_type_name() != "None":
        target = value_to_str(node.fields[3].value)
    else:
        target = ""
    if target is None:
        return None
    value = f"{source}{t}{target}"
    if heredoc != "":
        value = f"{value}\n{heredoc}"
    return value