def test_branches(program, df, infile, args):
    """For each labeled branch in *df*, find the input bytes whose mutation
    actually changes the branch's compared operand values.

    Records a baseline branch trace by running the instrumented *program*
    once on *infile*, then delegates per-branch byte testing to
    ``test_branch``. All work happens inside a temporary directory under
    /dev/shm (tmpfs) to keep the many re-executions fast.

    Returns the labeled dataframe grouped by (file_id, inst_id) with a new
    'valid_bytes' column holding the confirmed byte indices per branch.
    """
    df_l = get_labeled(df)
    _, ext = os.path.splitext(infile)
    tmpfile = 'input' + ext
    df_l_g = None
    work_dir = '/dev/shm/'  # in-memory tmpfs; presumably chosen for speed
    infile = os.path.abspath(infile)
    fsize = os.path.getsize(infile)
    cwd = os.getcwd()
    with tempfile.TemporaryDirectory(prefix=work_dir) as run_dir:
        os.chdir(run_dir)
        # Record the baseline trace from the unmodified input.
        print(infile, tmpfile)
        run(['cp', infile, tmpfile])
        env = os.environ.copy()
        env['DFSAN_OPTIONS'] = "func_logfile='',always_record_branches=1"  # turn off func recording
        print(programs.get_cmd(program, programs.GRAD, tmpfile))
        res = run(programs.get_cmd(program, programs.GRAD, tmpfile), env=env, stdout=DEVNULL)
        if res.returncode:
            print('ERROR RUNNING BASELINE:', res.returncode)
            # BUG FIX: stderr is only captured when redirected to a pipe;
            # without this guard, res.stderr is None here and the error
            # path itself crashed with AttributeError.
            if res.stderr is not None:
                print(res.stderr.decode('utf-8', 'ignore'))
        baseline_trace = pd.read_csv('branches.csv')
        result = []
        # Collapse per-byte label rows into one row per (file_id, inst_id).
        df_l_g = df_l.groupby(['file_id', 'inst_id']).agg(lambda x: tuple(x))
        for (f_id, b_id), row in tqdm(df_l_g.iterrows(), total=df_l_g.shape[0]):
            # NOTE(review): the labeled deriv_byte candidates (row.deriv_byte)
            # are ignored; every byte of the file is tested for every branch.
            test_bytes = list(range(fsize))
            valid_bytes = test_branch(program, f_id, b_id, baseline_trace,
                                      test_bytes, infile)
            result.append(valid_bytes)
        df_l_g['valid_bytes'] = pd.Series(result, df_l_g.index)
        # Leave the temp dir before it is removed by the context manager.
        os.chdir(cwd)
    return df_l_g
def test_branch(program, f_id, b_id, orig_trace, byte_inds, infile):
    """Return the subset of *byte_inds* whose mutation perturbs branch
    (*f_id*, *b_id*): for each candidate byte, try a set of value changes,
    re-run the program, and keep the byte as soon as any re-run shows a
    different lhs/rhs operand value than the baseline *orig_trace*.
    """
    # Mutations to try per byte: force-to-0, force-to-255, then single-bit flips.
    changes = [0, 255] + [1,2,4,8,16,32,64,128]# + list(range(1,254))
    valid_byte_inds = []
    # Baseline rows for just this branch.
    orig_trace_b = orig_trace[(orig_trace.file_id == f_id) & (orig_trace.inst_id == b_id)]
    _, ext = os.path.splitext(infile)
    tmpfile = 'input' + ext
    # Read the pristine input once; each mutation starts from a fresh copy.
    with open(infile, 'rb') as src:
        pristine = bytearray(src.read())
    for byte_ind in byte_inds:
        hit = False
        for change in changes:
            mutated = bytearray(pristine)
            # 0/255 are absolute overwrites; everything else is a bit flip.
            if change in [0, 255]:
                mutated[byte_ind] = change
            else:
                mutated[byte_ind] ^= change
            with open(tmpfile, 'wb') as dst:
                dst.write(mutated)
            env = os.environ.copy()
            env['DFSAN_OPTIONS'] = "func_logfile='',always_record_branches=1"  # turn off func recording
            run(programs.get_cmd(program, programs.GRAD, tmpfile),
                env=env, stdout=DEVNULL, stderr=DEVNULL)
            # Reload the trace and compare this branch's operand values
            # position-by-position against the baseline.
            trace = pd.read_csv('branches.csv')
            trace_b = trace[(trace.file_id == f_id) & (trace.inst_id == b_id)]
            common = min(len(trace_b), len(orig_trace_b))
            for i in range(common):
                new_row = trace_b.iloc[i]
                base_row = orig_trace_b.iloc[i]
                if (new_row.lhs_val != base_row.lhs_val
                        or new_row.rhs_val != base_row.rhs_val):
                    valid_byte_inds.append(byte_ind)
                    hit = True
                    break
            if hit:
                break  # one perturbing change suffices for this byte
    return valid_byte_inds
def test_bytes(program, infile, working_dir='/tmp/'):
    """Evaluate every byte of *infile*: for each byte index, record a taint
    baseline trace and ask ``test_byte`` which branches that byte can
    actually influence.

    Returns a DataFrame with columns
    ['deriv_byte', 'file_id', 'inst_id', 'actual'].
    """
    cwd = os.getcwd()
    print(infile)
    infile = os.path.abspath(infile)
    print(infile)
    df_actual = pd.DataFrame()
    with tempfile.TemporaryDirectory(prefix=working_dir) as run_dir:
        os.chdir(run_dir)
        _, ext = os.path.splitext(infile)
        tmpfile = 'input' + ext
        run(['cp', infile, tmpfile])
        result = []
        infile_len = os.path.getsize(infile)
        print('evaluating input bytes for', infile)
        for byte_ind in tqdm(range(infile_len)):
            # Baseline trace with only this byte tainted.
            cmd = programs.get_cmd(program, programs.TAINT, infile, byte_ind)
            run(cmd, stdout=DEVNULL, stderr=DEVNULL)
            baseline_trace = pd.read_csv('branches.csv')
            actual_branches = test_byte(program, byte_ind, baseline_trace, infile)
            result.extend(actual_branches)
        # BUG FIX: build the frame with explicit columns. Assigning
        # .columns after construction raised ValueError (length mismatch)
        # whenever result was empty; this form handles both cases.
        df_actual = pd.DataFrame(
            result, columns=['deriv_byte', 'file_id', 'inst_id', 'actual'])
        # Leave the temp dir before the context manager removes it.
        os.chdir(cwd)
    return df_actual
def test_byte(program, byte_ind, orig_trace, infile):
    """For a single *byte_ind*, determine which branches in *orig_trace*
    actually change their compared operand values under mutation.

    Tries force-to-0/255 and single-bit flips of the byte, re-runs the
    instrumented *program*, and compares operand values per branch.

    Returns a list of (byte_ind, file_id, inst_id, 1) tuples, one per
    branch the byte was shown to influence.
    """
    changes = [0, 255] + [1,2,4,8,16,32,64,128]# + list(range(1,254))
    actual_branches = []
    # Unique (file_id, inst_id) pairs present in the baseline trace.
    # Renamed from 'test_branches' — the old name shadowed the sibling
    # module-level function test_branches().
    branch_ids = [ind for ind, _ in
                  orig_trace.groupby(['file_id', 'inst_id']).agg(lambda x: ()).iterrows()]
    _, ext = os.path.splitext(infile)
    tmpfile = 'input' + ext
    for change in changes:
        # Start from the pristine input each time.
        with open(infile, 'rb') as tf:
            blob_data = bytearray(tf.read())
        if change in [0, 255]:
            blob_data[byte_ind] = change
        else:
            blob_data[byte_ind] = blob_data[byte_ind] ^ change
        with open(tmpfile, 'wb') as outfile:
            outfile.write(blob_data)
        env = os.environ.copy()
        env['DFSAN_OPTIONS'] = "func_logfile=''"  # turn off func recording
        run(programs.get_cmd(program, programs.TAINT, tmpfile, byte_ind),
            env=env, stdout=DEVNULL, stderr=DEVNULL)
        # Load the new trace and check each not-yet-confirmed branch.
        try:
            trace = pd.read_csv('branches.csv')
            for (f_id, b_id) in branch_ids:
                if (f_id, b_id) in actual_branches:
                    continue  # already found this branch
                orig_trace_b = get_branch(orig_trace, f_id, b_id)
                trace_b = get_branch(trace, f_id, b_id)
                for i in range(len(trace_b)):
                    if i < len(orig_trace_b):
                        row = trace_b.iloc[i]
                        origrow = orig_trace_b.iloc[i]
                        if (row.lhs_val != origrow.lhs_val
                                or row.rhs_val != origrow.rhs_val):
                            actual_branches.append((f_id, b_id))
                            break
        except Exception as e:
            # Deliberate best-effort: a crashed/odd run just skips this change.
            print('Branch check error!', e)
    actual_branches = [(byte_ind, f_id, b_id, 1)
                       for (f_id, b_id) in actual_branches]
    return actual_branches
def eval_file(program, infile, mode, label=True, opt=True):
    """Benchmark *program* on *infile* in the given *mode* and return the
    mean wall-clock time per byte-iteration, in milliseconds.

    Builds a bash loop that runs the command once per byte index (``$i``
    substituted into the command when *label* is set), times it over
    several samples, and averages. Returns 0 on failure.
    """
    env = os.environ.copy()
    env['GR_MODE_PERF'] = '1'
    if opt:
        env['DFSAN_OPTIONS'] = "reuse_labels=0"
    _, ext = os.path.splitext(infile)
    tmpfile = 'input' + ext
    run(['cp', infile, tmpfile])
    cmd = programs.get_cmd(program, mode, infile, 0)
    if mode in [programs.TAINT, programs.GRAD]:
        if label:
            cmd[-2] = "$i"  # let the bash loop variable supply the byte index
        else:
            # Drop the byte-index argument entirely.
            del cmd[-2]
            del cmd[-2]
    cmd = " ".join(cmd)
    size = os.path.getsize(infile)
    iters = min(size - 1, MAX_ITER)
    print(infile, iters)
    try:
        fullcmd = "for i in {0.." + str(iters) + "}; do " + cmd + ">/dev/null; done;"
        if (iters < 1000):
            # BUG FIX: was {0..5}, which bash expands to SIX repetitions
            # while iters was only multiplied by 5, inflating the reported
            # per-byte time by 6/5. {1..5} matches the x5 multiplier.
            fullcmd = "for j in {1..5}; do " + fullcmd + "done;"
            iters = iters * 5
            nsamples = 5
        else:
            nsamples = 2
        print(fullcmd)
        total_time = 0.0
        for i in range(nsamples):
            # Untimed warm-up run to prime any caches.
            run_res = run(cmd, shell=True, env=env, executable='/bin/bash',
                          capture_output=True)
            start = time.time()
            run_res = run(fullcmd, shell=True, env=env, executable='/bin/bash',
                          capture_output=True)
            elapsed = time.time() - start
            total_time += elapsed
            if run_res.returncode:
                print('Error', str(run_res.returncode) + ':',
                      " ".join(programs.get_cmd(program, mode, infile)))
                print('stderr:', run_res.stderr.decode('utf-8', errors='ignore'))
    except Exception as e:
        print('Error:', " ".join(programs.get_cmd(program, mode, infile)))
        print(e)
        return 0
    meantime = total_time / nsamples * 1000.0  # convert to ms
    # NOTE(review): {0..iters} executes iters+1 times; dividing by iters
    # keeps the historical metric — confirm before changing.
    per_byte_time = meantime / iters
    return per_byte_time