def abstract_code(code, build_dir=None, build_src_path=None):
    """Tokenize *code* and return it re-emitted (detokenized) in abstracted form.

    A build-aware CTokenizer is tried first; if constructing it or tokenizing
    with build context fails for any reason, a context-free CTokenizer is used
    as a best-effort fallback.
    """
    try:
        tokenizer = CTokenizer(build_dir=build_dir, build_src_path=build_src_path)
        abstract_tokens = tokenizer.tokenize(code)
    except Exception:
        # Fallback: retry without any build context rather than failing.
        tokenizer = CTokenizer()
        abstract_tokens = tokenizer.tokenize(code)
    return tokenizer.detokenize(abstract_tokens)
def abstract_code(code, build_dir=None, build_src_path=None):
    """Tokenize *code* and return its detokenized (abstracted) form.

    Prefers a build-aware CTokenizer; falls back to a context-free one when
    the build-aware path raises for any reason.
    """
    try:
        c_tokenizer = CTokenizer(build_dir=build_dir, build_src_path=build_src_path)
        tokens = c_tokenizer.tokenize(code)
    except Exception:
        # Best-effort fallback: tokenize without build context.
        c_tokenizer = CTokenizer()
        tokens = c_tokenizer.tokenize(code)
    return c_tokenizer.detokenize(tokens)


# Alternate roots kept for switching between decompiled-code and source-code runs:
#INPUT_DIR = "/dev/shm/output/abstracted_decompiled_code"
#OUTPUT_DIR = "/dev/shm/output/abstracted_base64_decompiled_code"
INPUT_DIR = "/dev/shm/output/abstracted_src_code"
OUTPUT_DIR = "/dev/shm/output/abstracted_base64_src_code"

# os.makedirs replaces the previous `os.system("mkdir -p %s" % OUTPUT_DIR)`
# shell-out: no subprocess, and failures raise instead of being silently ignored.
os.makedirs(OUTPUT_DIR, exist_ok=True)

for fname in tqdm.tqdm(os.listdir(INPUT_DIR), smoothing=0, dynamic_ncols=True):
    fpath = os.path.join(INPUT_DIR, fname)
    with open(fpath) as f:
        code = f.read()
    # Fresh tokenizer per file, as in the original; CTokenizer may carry state
    # across tokenize() calls -- TODO confirm before hoisting out of the loop.
    c_tokenizer = CTokenizer()
    tokens = c_tokenizer.tokenize(code)
    # Base64-encode each token so the token stream survives being written as a
    # single space-separated line (tokens may themselves contain whitespace).
    encoded_tokens = [base64.b64encode(x.encode()).decode() for x in tokens]
    output_fpath = os.path.join(OUTPUT_DIR, fname)
    with open(output_fpath, 'w') as f:
        f.write(" ".join(encoded_tokens))
default='', help='The binary to tokenize.') parser.add_argument('-n', '--func-name', default='', help='The function name in the binary to tokenize.') args = parser.parse_args() sanitize_input(args) # choose a proper tokenizer if args.input_file: with io.open(args.input_file, encoding='utf-8') as f: source = f.read() tokenizer = CTokenizer(build_dir=args.build_dir, build_src_path=args.build_src_path) tokens = tokenizer.tokenize(source) print(source) print("------") elif args.input_bin: tokenizer = IDATokenizer() tokens = tokenizer.tokenize(args.input_bin, args.func_name) else: raise print(tokens) print("------") new_code = tokenizer.detokenize(tokens) print(new_code)
# Alternate roots kept from an earlier decompiled-code run:
#INPUT_DIR = "/dev/shm/output/abstracted_decompiled_code"
#OUTPUT_DIR = "/dev/shm/output/abstracted_base64_decompiled_code"
INPUT_DIR_BASE = "./full_base64"
OUTPUT_DIR_BASE = "./full_base64_decoded"

for sub_dir in ["sinput", "abstracted_base64_src_code", "result"]:
    input_dir = os.path.join(INPUT_DIR_BASE, sub_dir)
    output_dir = os.path.join(OUTPUT_DIR_BASE, sub_dir)
    # os.makedirs replaces the previous `os.system("mkdir -p %s" % ...)`
    # shell-outs: no subprocess, and errors raise instead of being dropped.
    os.makedirs(input_dir, exist_ok=True)
    os.makedirs(output_dir, exist_ok=True)
    for fname in tqdm.tqdm(os.listdir(input_dir), smoothing=0, dynamic_ncols=True):
        fpath = os.path.join(input_dir, fname)
        with open(fpath) as f:
            code = f.read()
        # Each whitespace-separated field is one base64-encoded token; fields
        # that fail to decode degrade to '<unk>' rather than aborting the file.
        tokens = []
        for x in code.split():
            try:
                y = base64.b64decode(x).decode('utf8', 'ignore')
            except Exception:
                y = '<unk>'
            tokens.append(y)
        c_tokenizer = CTokenizer()
        output_code = c_tokenizer.detokenize(tokens)
        output_fpath = os.path.join(output_dir, fname)
        with open(output_fpath, 'w') as f:
            f.write(output_code)