def compare_histograms(self):
    """Compare the histograms common to both reports and write one
    difference table per histogram file to self.out.

    A feature row is emitted when its POST count grew (or, with
    args.same / args.smaller, stayed equal / shrank).
    """
    b1_histograms = set(self.b1.histogram_files())
    b2_histograms = set(self.b2.histogram_files())
    common_histograms = b1_histograms.intersection(b2_histograms)
    out = self.out
    if self.mode == 'html':
        # Linked table of contents for the per-histogram anchors below.
        out.write("<ul>\n")
        # BUG FIX: the original iterated over the undefined name
        # 'histogram_files' (NameError); the intended set is
        # common_histograms.
        for histogram_file in sorted(common_histograms):
            out.write("<li><a href='#%s'>%s</a></li>\n" % (histogram_file, histogram_file))
        out.write("</ul>\n<hr/>\n")
    for histogram_file in sorted(common_histograms):
        diffcount = 0               # features whose counts differ in this file
        if args.html:
            out.write('<h2><a name="%s">%s</a></h2>\n' % (histogram_file, histogram_file))
        t = ttable.ttable()
        t.set_col_alignment(0, t.RIGHT)
        t.set_col_alignment(1, t.RIGHT)
        t.set_col_alignment(2, t.RIGHT)
        t.set_col_alignment(3, t.LEFT)
        t.set_title(histogram_file)
        t.append_head(['# in PRE', '# in POST', '∆', 'Value'])
        (b1, b2) = self.getab()
        b1.hist = b1.read_histogram(histogram_file)
        b2.hist = b2.read_histogram(histogram_file)
        b1.keys = set(b1.hist.keys())
        b2.keys = set(b2.hist.keys())
        # Create the output, then sort below on delta, value, POST, PRE.
        data = []
        for feature in b1.keys.union(b2.keys):
            v1 = b1.hist.get(feature, 0)
            v2 = b2.hist.get(feature, 0)
            if v1 != v2:
                diffcount += 1
            if v2 > v1 or (v2 == v1 and args.same) or (v2 < v1 and args.smaller):
                data.append((v1, v2, v2 - v1, feature.decode('utf-8')))

        # Sort according to the diff first (descending), then alphabetically
        # on value, then v2 amount, then v1 amount.
        def mysortkey(a):
            return (-a[2], a[3], a[1], a[0])

        if data:
            for row in sorted(data, key=mysortkey):
                t.append_data(row)
            out.write(t.typeset(mode=self.mode))
        if diffcount == 0 and args.both:
            # The original duplicated this identical write in an
            # if args.html / else — collapsed to one statement.
            out.write("{}: No differences\n".format(histogram_file))
def summary(self):
    """Write a table of per-file feature counts that differ between reports."""
    (a, b) = self.getab()
    t = ttable.ttable()
    t.append_head(['file', a.name, b.name])
    for fn in sorted(a.files.union(b.files)):
        # A report that lacks the file shows 'n/a' instead of a count.
        counts = []
        for rpt in (a, b):
            if fn in rpt.files:
                counts.append(sum(1 for _ in rpt.read_features(fn)))
            else:
                counts.append('n/a')
        if counts[0] != counts[1]:
            t.append_data([fn] + counts)
    self.out.write(t.typeset(mode=self.mode))
def print_data():
    """Print a per-filetype table of training-block statistics.

    For each file type, reports the number of files yielding at least one
    training block plus min/max/average/total block counts.

    Raises:
        RuntimeError: if a file type is unknown to sceadan.
    """
    print("Data directory:", datadir())
    t = ttable()
    t.append_head(["FTYPE", "Files", "min blks", "max blks", "avg blks", "total"])
    for col in range(1, 6):
        t.set_col_alignment(col, t.RIGHT)
    blocks_per_type = {}
    for ftype in filetypes():
        if sceadan_type_for_name(ftype) == -1:
            raise RuntimeError("file type {} is invalid".format(ftype))
        blocks = [os.path.getsize(fn) // args.train_blocksize for fn in ftype_files(ftype)]
        blocks = [b for b in blocks if b > 0]
        if blocks:
            t.append_data((ftype, len(blocks), min(blocks), max(blocks),
                           sum(blocks) / len(blocks), sum(blocks)))
        else:
            # BUG FIX: min()/max() raised ValueError and the average divided
            # by zero when a file type had no usable blocks; report zeros.
            t.append_data((ftype, 0, 0, 0, 0, 0))
        blocks_per_type[ftype] = sum(blocks)
    print(t.typeset(mode="text"))
def print_data():
    """Print a per-filetype table of training-block statistics.

    Counts the blocks of size args.train_blocksize available per file type
    and prints min/max/average/total per type.

    Raises:
        RuntimeError: if a file type is unknown to sceadan.
    """
    print("Data directory:", datadir())
    t = ttable()
    t.append_head(["FTYPE", "Files", "min blks", "max blks", "avg blks", "total"])
    t.set_col_alignment(1, t.RIGHT)
    t.set_col_alignment(2, t.RIGHT)
    t.set_col_alignment(3, t.RIGHT)
    t.set_col_alignment(4, t.RIGHT)
    t.set_col_alignment(5, t.RIGHT)
    blocks_per_type = {}
    for ftype in filetypes():
        if sceadan_type_for_name(ftype) == -1:
            raise RuntimeError("file type {} is invalid".format(ftype))
        blocks = [os.path.getsize(fn) // args.train_blocksize for fn in ftype_files(ftype)]
        blocks = [v for v in blocks if v > 0]
        if blocks:
            t.append_data((ftype, len(blocks), min(blocks), max(blocks),
                           sum(blocks) / len(blocks), sum(blocks)))
        else:
            # BUG FIX: avoid ValueError from min()/max() and ZeroDivisionError
            # from the average when no file of this type yields a full block.
            t.append_data((ftype, 0, 0, 0, 0, 0))
        blocks_per_type[ftype] = sum(blocks)
    print(t.typeset(mode='text'))
def compare_files(self):
    """Report the files that exist in only one of the two reports."""
    out = self.out
    if self.both:
        hdr = ttable.ttable()
        hdr.append_data(['bulk_diff.py Version:', __version__])
        hdr.append_data(['PRE Image:', self.b1.image_filename()])
        hdr.append_data(['POST Image:', self.b2.image_filename()])
        out.write(hdr.typeset(mode=self.mode))
    # Run both directions: files only in PRE, then files only in POST.
    # NOTE(review): this method checks self.both above but args.both below —
    # presumably the same flag; confirm against the surrounding class.
    for which in (1, 2):
        (a, b) = self.getab(which)
        only_in_a = a.files.difference(b.files)
        total_diff = sum(a.count_lines(f) for f in only_in_a if ".txt" in f)
        total_other = sum(1 for f in only_in_a if ".txt" not in f)
        if total_diff > 0 or total_other > 0 or args.both:
            print("Files only in {}:".format(a.name), file=out)
            for f in only_in_a:
                if ".txt" in f:
                    print(" %s (%d lines)" % (f, a.count_lines(f)), file=out)
                else:
                    print(" %s" % (f), file=out)
def compare():
    """Compare per-type accuracy of this experiment against args.compare.

    Reads classification tallies from the current experiment database (db)
    and from the comparison experiment's shelve, computes an integer
    accuracy percentage per file type, and prints a table with the
    relative change.
    """
    comp = shelve.open(compname("experiment"), writeback=False)
    try:
        if 'RESULTS' not in comp:
            print("Compare fail: %s did not store its scores in the database" % (args.compare))
            return
        print("\nAccuracy comparison between %s and %s" % (args.exp, args.compare))
        t = ttable()
        t.append_head(['type', args.compare, args.exp, ' % change'])
        t.set_col_alignment(1, t.CENTER)
        t.set_col_alignment(2, t.CENTER)
        t.set_col_alignment(3, t.CENTER)

        def scores(results):
            # Percent of blocks of each type classified as that same type.
            out = {}
            for key, value in results.items():
                tally = value.copy()['TALLY']
                count = sum(tally.values())
                out[key] = int(tally.get(key, 0) * 100 / count)
            return out

        expScore = scores(db['RESULTS'])
        compScore = scores(comp['RESULTS'])
        for key in sorted(expScore):
            if key in compScore and compScore[key] != 0:
                change = "{:3.2f}%".format(
                    (expScore[key] - compScore[key]) / float(compScore[key]) * 100)
                data = [key, compScore[key], expScore[key], change]
            else:
                # BUG FIX: the original indexed compScore[key] in its else
                # branch — guaranteed KeyError precisely when key is missing —
                # and put the experiment score in the wrong column. Also
                # guards against a zero comparison score (division by zero).
                data = [key, compScore.get(key, 'N/A'), expScore[key], 'N/A']
            t.append_data(data)
        print(t.typeset(mode='text'))
    finally:
        comp.close()    # BUG FIX: the shelve was never closed
1.05) # make it a little bigger than needed plt.ylim(-5, 105) plt.legend(loc='lower center', fontsize=9) plt.xticks(rotation=20) plt.axes().get_xaxis().set_major_formatter( mp.ticker.FuncFormatter(lambda x, p: format(int(x), ','))) plt.savefig(fname) if __name__ == "__main__": # # Write the data to a LaTeX file # topcode = 100000 tab = ttable() tab.append_head(['parental income', 'grade']) tab.append_data(ttable.HR) for row in zip(income, grades): tab.append_data(row) tab.save_table("toy_regression_data.tex", mode='latex') # First show the regression with the real dat e = Stats(income, grades) e.plot_data() e.regress() e.savefig("toy_regression.pdf") true_fit = copy.deepcopy(e.fit) # Now show the 4 regressions with a point the explores the state space
def process(out, dname1, dname2):
    """Diff two bulk_extractor report directories and write results to out.

    Reports (1) files present in only one run, (2) a difference table for
    every histogram common to both runs, and (3) with options.features,
    features present in only one run.
    """
    mode = 'text'
    if options.html:
        mode = 'html'
    b1 = bulk_extractor_reader.BulkReport(dname1)
    b2 = bulk_extractor_reader.BulkReport(dname2)
    t = ttable.ttable()
    t.append_data(['bulk_diff.py Version:', bulk_diff_version])
    t.append_data(['PRE Image:', b1.image_filename()])
    t.append_data(['POST Image:', b2.image_filename()])
    out.write(t.typeset(mode=mode))
    for i in [1, 2]:
        if i == 1:
            a = b1
            b = b2
        else:
            b = b1
            a = b2
        r = a.files.difference(b.files)
        if r:
            print("Files only in {}:".format(a.name))
            for f in r:
                if ".txt" in f:
                    print(" %s (%d lines)" % (f, a.count_lines(f)))
                else:
                    print(" %s" % (f))
    # Report interesting differences based on the histograms.
    # Output Example:
    """
   # in PRE   # in POST      ∆  Feature
         10          20     10  [email protected]
          8          17      9  [email protected]
         11          16      5  [email protected]
    """
    b1_histograms = set(b1.histogram_files())
    b2_histograms = set(b2.histogram_files())
    common_histograms = b1_histograms.intersection(b2_histograms)
    if options.html:
        out.write("<ul>\n")
        # BUG FIX: the original looped over the undefined name
        # 'histogram_files' here (NameError); use common_histograms.
        for histogram_file in sorted(common_histograms):
            out.write("<li><a href='#%s'>%s</a></li>\n" % (histogram_file, histogram_file))
        out.write("</ul>\n<hr/>\n")
    for histogram_file in sorted(common_histograms):
        diffcount = 0               # features with differing counts in this file
        if options.html:
            out.write('<h2><a name="%s">%s</a></h2>\n' % (histogram_file, histogram_file))
        else:
            out.write('\n')
        t = ttable.ttable()
        t.set_col_alignment(0, t.RIGHT)
        t.set_col_alignment(1, t.RIGHT)
        t.set_col_alignment(2, t.RIGHT)
        t.set_col_alignment(3, t.LEFT)
        t.set_title(histogram_file)
        t.append_head(['# in PRE', '# in POST', '∆', 'Value'])
        b1.hist = b1.read_histogram(histogram_file)
        b2.hist = b2.read_histogram(histogram_file)
        b1.keys = set(b1.hist.keys())
        b2.keys = set(b2.hist.keys())
        # Create the output, then sort on delta, value, POST, PRE.
        data = []
        for feature in b1.keys.union(b2.keys):
            v1 = b1.hist.get(feature, 0)
            v2 = b2.hist.get(feature, 0)
            if v1 != v2:
                diffcount += 1
            if v2 > v1 or (v2 == v1 and options.same) or (v2 < v1 and options.smaller):
                data.append((v1, v2, v2 - v1, feature.decode('utf-8')))

        # Sort according to the diff first (descending), then alphabetically
        # on value, then v2 amount, then v1 amount.
        def mysortkey(a):
            return (-a[2], a[3], a[1], a[0])

        if data:
            for row in sorted(data, key=mysortkey):
                t.append_data(row)
            out.write(t.typeset(mode=mode))
        if diffcount == 0:
            # Identical write in both branches of the original if/else —
            # collapsed to one statement.
            out.write("{}: No differences\n".format(histogram_file))
    if options.features:
        for feature_file in b1.feature_files():
            if feature_file not in b2.feature_files():
                continue
            print("Compare features", feature_file)
            for p in [1, 2]:
                if p == 1:
                    a = b1
                    b = b2
                else:
                    # BUG FIX: the original did "a = b2; b = a", which set
                    # both names to b2 and compared the POST report against
                    # itself on the second pass.
                    a = b2
                    b = b1
                a_features = {}
                for line in a.open(feature_file):
                    r = bulk_extractor_reader.parse_feature_line(line)
                    if not r:
                        continue
                    a_features[r[0]] = r[1]
                for line in b.open(feature_file):
                    r = bulk_extractor_reader.parse_feature_line(line)
                    if not r:
                        continue
                    if r[0] not in a_features:
                        # BUG FIX: a feature read from b but absent from a is
                        # only in b, not a; the original printed a.name.
                        print("{} {} is only in {}".format(r[0], r[1], b.name))
files += glob.glob(fn) else: files += [fn] drive_encoding_counts = {} for fn in files: print("") d = Drive(fn) d.process() for ff in d.f_encoding_counts: if ff not in drive_encoding_counts: drive_encoding_counts[ff] = defaultdict(statbag) for encoding in d.f_encoding_counts[ff]: drive_encoding_counts[ff][encoding].addx(d.f_encoding_counts[ff][encoding]) # Now that the data have been collected, typeset the big table t = ttable.ttable() t.latex_colspec = "lrrrrr" t.append_head(('', 'Drives with','Feature', 'avg', 'max', '')) t.append_head(('Feature / Coding','coding' ,'Count','per drive','per drive','$\\sigma$')) t.set_col_alignment(1,t.LEFT) t.set_col_alignment(2,t.RIGHT) t.set_col_alignment(3,t.RIGHT) t.set_col_alignment(4,t.RIGHT) t.set_col_alignment(5,t.RIGHT) rep = [] # report will be sorted by second column print("\n"*4) for ff in sorted(drive_encoding_counts.keys()): for enc in sorted(drive_encoding_counts[ff].keys()): k = ff + " / " + str(enc)
def process(out, dname1, dname2):
    """Diff two bulk_extractor report directories and write results to out.

    Reports files unique to each run, then a difference table for every
    histogram file common to both runs.
    """
    mode = 'text'
    if options.html:
        mode = 'html'
    b1 = bulk_extractor_reader.BulkReport(dname1)
    b2 = bulk_extractor_reader.BulkReport(dname2)
    t = ttable.ttable()
    t.append_data(['bulk_diff.py Version:', bulk_diff_version])
    t.append_data(['PRE Image:', b1.image_filename()])
    t.append_data(['POST Image:', b2.image_filename()])
    out.write(t.typeset(mode=mode))
    if b1.files.difference(b2.files):
        print("Files only in %s:\n %s" % (b1.name, " ".join(b1.files.difference(b2.files))))
    if b2.files.difference(b1.files):
        print("Files only in %s:\n %s" % (b2.name, " ".join(b2.files.difference(b1.files))))
    # Report interesting differences based on the histograms.
    # Output Example:
    """
   # in PRE   # in POST      ∆  Feature
         10          20     10  [email protected]
          8          17      9  [email protected]
         11          16      5  [email protected]
    """
    common_files = b1.files.intersection(b2.files)
    # BUG FIX: filter() returns a one-shot iterator in Python 3; the HTML
    # table of contents consumed it and the main loop below then saw an
    # empty sequence. Materialize it once, sorted.
    histogram_files = sorted(f for f in common_files if "histogram" in f)
    if options.html:
        out.write("<ul>\n")
        for histogram_file in histogram_files:
            out.write("<li><a href='#%s'>%s</a></li>\n" % (histogram_file, histogram_file))
        out.write("</ul>\n<hr/>\n")
    for histogram_file in histogram_files:
        # BUG FIX: diffcount was initialized once before this loop, so the
        # per-file "No differences" message reflected the running total over
        # all earlier histogram files; reset it per file.
        diffcount = 0
        if options.html:
            out.write('<h2><a name="%s">%s</a></h2>\n' % (histogram_file, histogram_file))
        else:
            out.write('\n')
        t = ttable.ttable()
        t.set_col_alignment(0, t.RIGHT)
        t.set_col_alignment(1, t.RIGHT)
        t.set_col_alignment(2, t.RIGHT)
        t.set_col_alignment(3, t.LEFT)
        t.set_title(histogram_file)
        t.append_head(['# in PRE', '# in POST', '∆', 'Value'])
        b1.hist = b1.read_histogram(histogram_file)
        b2.hist = b2.read_histogram(histogram_file)
        b1.keys = set(b1.hist.keys())
        b2.keys = set(b2.hist.keys())
        # Create the output, then sort on delta, value, POST, PRE.
        data = []
        for feature in b1.keys.union(b2.keys):
            v1 = b1.hist.get(feature, 0)
            v2 = b2.hist.get(feature, 0)
            if v1 != v2:
                diffcount += 1
            if v2 > v1 or (v2 == v1 and options.same) or (v2 < v1 and options.smaller):
                data.append((v1, v2, v2 - v1, feature))

        # Sort according to the diff first (descending), then alphabetically
        # on value, then v2 amount, then v1 amount.
        def mysortkey(a):
            return (-a[2], a[3], a[1], a[0])

        if data:
            for row in sorted(data, key=mysortkey):
                t.append_data(row)
            out.write(t.typeset(mode=mode))
        if diffcount == 0:
            # Identical write in both branches of the original if/else.
            out.write("{}: No differences\n\n".format(histogram_file))
def generate_confusion():
    """Generate the confusion matrix for the current model.

    Scores every file type (optionally in parallel with args.j workers),
    prints/writes a ttable where each row is a true type and each column
    the percentage classified as that type, and records summary accuracy
    figures in db.
    """
    if os.path.exists(expname("confusion.txt")):
        print("Confusion matrix already exists")
        return
    if not os.path.exists(model_file()):
        raise RuntimeError("Cannot generate confusion matrix. Model file {} does not exist".format(model_file()))
    print("Generating confusion matrix")
    t = ttable()
    db["test_blocksize"] = args.test_blocksize
    # Fan out the per-filetype scoring with a process pool, or a dummy
    # (threaded) pool of one when -j is not requested.
    if args.j > 1:
        import multiprocessing
        pool = multiprocessing.Pool(args.j)
    else:
        import multiprocessing.dummy
        pool = multiprocessing.dummy.Pool(1)
    sceadan_score_rows = {}
    files_per_type = {}
    blocks_per_type = {}
    classified_types = set()
    for (ftype, tally, file_count, block_count) in pool.imap_unordered(get_sceadan_score_for_filetype, filetypes()):
        sceadan_score_rows[ftype] = tally
        classified_types = classified_types.union(set(tally.keys()))
        files_per_type[ftype] = file_count
        blocks_per_type[ftype] = block_count
    pool.close()    # BUG FIX (resource leak): the pool was never shut down
    # All types that appeared in any tally; renamed loop var so it does not
    # shadow the ttable t above.
    classtypes = [ct.upper() for ct in sorted(classified_types)]
    t.append_head([" ", "file ", "block"])
    t.append_head(["type", "count", "count"] + classtypes)
    t.set_col_alignment(2, t.RIGHT)
    t.set_col_alignment(3, t.RIGHT)
    total_events = 0
    total_correct = 0
    percent_correct_sum = 0
    rowcounter = 0
    for ftype in filetypes():
        FTYPE = ftype.upper()
        tally = sceadan_score_rows[ftype]
        count = sum(tally.values())
        total_events += count
        total_correct += tally.get(FTYPE, 0)
        percent_correct = tally.get(FTYPE, 0) * 100.0 / count
        percent_correct_sum += percent_correct
        # BUG FIX: files_per_type / blocks_per_type are keyed by ftype
        # exactly as yielded by filetypes() (see the imap loop above);
        # indexing with the uppercased FTYPE raised KeyError whenever the
        # two differ.
        data = [FTYPE, files_per_type[ftype], blocks_per_type[ftype]] + [
            "{:3.0f}".format(tally.get(f, 0) * 100.0 / count) for f in classtypes
        ]
        t.append_data(data)
        rowcounter += 1
        if rowcounter % 5 == 0:
            t.append_data([""])     # blank separator row every five types
    f = openexp("confusion.txt", "w")
    f.write(t.typeset(mode="text"))
    print(t.typeset(mode="text"))
    db["overall_accuracy"] = (total_correct * 100.0) / total_events
    db["average_accuracy_per_class"] = percent_correct_sum / len(filetypes())

    def info(n):
        """Print a value from the database and record it in confusion.txt"""
        try:
            v = str(db[n])[0:120]
            val = "{}: {}".format(n.replace("_", " "), v)
            f.write(val + "\n")
            print(val)
        except KeyError:
            f.write("key {} not found\n".format(n))
            print("Keys in database:", list(db.keys()))

    info("train_blocksize")
    info("test_blocksize")
    info("liblinear_train_command")
    info("overall_accuracy")
    info("average_accuracy_per_class")
    f.close()   # BUG FIX (resource leak): the report file was never closed
def process(out, dname1, dname2):
    """Diff two bulk_extractor report directories and write results to out.

    Earlier variant of the diff tool: reports files unique to each run and
    a table of histogram differences (growth only, unless options.smaller).
    """
    mode = 'text'
    if options.html:
        mode = 'html'
    b1 = bulk_extractor_reader.BulkReport(dname1)
    b2 = bulk_extractor_reader.BulkReport(dname2)
    t = ttable.ttable()
    t.append_data(['bulk_diff.py Version:', bulk_diff_version])
    t.append_data(['PRE Image:', b1.imagefile()])
    t.append_data(['POST Image:', b2.imagefile()])
    out.write(t.typeset(mode=mode))
    if b1.files.difference(b2.files):
        print("Files only in %s:\n %s" % (b1.name, " ".join(b1.files.difference(b2.files))))
    if b2.files.difference(b1.files):
        print("Files only in %s:\n %s" % (b2.name, " ".join(b2.files.difference(b1.files))))
    # Report interesting differences based on the histograms.
    # Output Example:
    """
   # in PRE   # in POST      ∆  Feature
         10          20     10  [email protected]
          8          17      9  [email protected]
         11          16      5  [email protected]
    """
    common_files = b1.files.intersection(b2.files)
    # BUG FIX: filter() is a one-shot iterator in Python 3; the HTML table
    # of contents consumed it and the main loop then iterated nothing.
    histogram_files = sorted(f for f in common_files if "histogram" in f)
    if options.html:
        out.write("<ul>\n")
        for histogram_file in histogram_files:
            out.write("<li><a href='#%s'>%s</a></li>\n" % (histogram_file, histogram_file))
        out.write("</ul>\n<hr/>\n")
    for histogram_file in histogram_files:
        # BUG FIX: diffcount was initialized once before the loop, so the
        # per-file "No differences" message used a cumulative total.
        diffcount = 0
        if options.html:
            out.write('<h2><a name="%s">%s</a></h2>\n' % (histogram_file, histogram_file))
        else:
            out.write('\n')
        t = ttable.ttable()
        t.set_col_alignment(0, t.RIGHT)
        t.set_col_alignment(1, t.RIGHT)
        t.set_col_alignment(2, t.RIGHT)
        t.set_col_alignment(3, t.LEFT)
        t.set_title(histogram_file)
        t.append_head(['# in PRE', '# in POST', '∆', 'Value'])
        b1.hist = b1.read_histogram(histogram_file)
        b2.hist = b2.read_histogram(histogram_file)
        b1.keys = set(b1.hist.keys())
        b2.keys = set(b2.hist.keys())
        # Create the output, then sort on delta, value, POST, PRE.
        data = []
        for feature in b1.keys.union(b2.keys):
            v1 = b1.hist.get(feature, 0)
            v2 = b2.hist.get(feature, 0)
            if v1 != v2:
                diffcount += 1
                # Skip non-growing counts unless --smaller was requested.
                # NOTE(review): the original's collapsed formatting is
                # ambiguous about whether this guard was nested under
                # v1 != v2; nesting matches the later revision — confirm.
                if v2 <= v1 and not options.smaller:
                    continue
                data.append((v1, v2, v2 - v1, feature))

        # Sort according to the diff first (descending), then alphabetically
        # on value, then v2 amount, then v1 amount.
        def mysortkey(a):
            return (-a[2], a[3], a[1], a[0])

        if data:
            for row in sorted(data, key=mysortkey):
                t.append_data(row)
            out.write(t.typeset(mode=mode))
        if diffcount == 0:
            # Identical write in both branches of the original if/else.
            out.write("{}: No differences\n\n".format(histogram_file))
def process_files(fn):
    """Collect per-extension and per-partition statistics from a fiwalk
    report (XML file) or a disk image, and typeset them.

    NOTE(review): everything after the first exit(0) is unreachable
    staged scaffolding from earlier experiments; it is preserved as-is.
    """
    drive_files = {}            # index of drives (unused in the live path)
    all_parts = []
    all_files = []
    files_by_md5 = {}           # dict of sets of fiobject, indexed by md5
    extension_len_histogram = histogram2d()
    extension_fragments_histogram = histogram2d()
    partition_histogram = histogram2d()

    def cb(fi):
        """fiwalk callback: accumulate stats for each non-empty file."""
        if fi.is_file() and fi.filesize():
            # BUG FIX: dict.get(key, set()) returned a fresh throwaway set
            # when the key was absent, so .add() stored nothing; setdefault
            # inserts and returns the set that lives in the dict.
            files_by_md5.setdefault(fi.md5, set()).add(fi)
            ext = fi.ext()
            if not ext:
                print(fi.meta_type(), fi)
            extension_len_histogram.add(ext, fi.filesize())
            extension_fragments_histogram.add(ext, fi.fragments())
            partition_histogram.add(fi.partition(), fi.filesize())

    if fn.endswith('xml'):
        fiwalk.fiwalk_using_sax(xmlfile=open(fn), callback=cb)
    else:
        fiwalk.fiwalk_using_sax(imagefile=open(fn), callback=cb)
    #
    # Typeset the information
    #
    tab = ttable()
    tab.header = "File extension popularity and average size (suppressing 0-len files)"
    tab.col_headings = [['Ext', 'Count', 'Average Size', 'Max', 'Std Dev']]
    tab.omit_row = [[0, '']]
    extension_len_histogram.statcol = ['iaverage', 'maxx', 'istddev']
    print(extension_len_histogram.typeset(tab=tab))
    #
    # Information about fragmentation patterns
    #
    tab = ttable()
    tab.header = "Fragmentation pattern by file system and file type:"
    tab.col_headings = [['Ext', 'Count', 'Average Size', 'Max', 'Std Dev']]
    tab.omit_row = [[0, '']]
    extension_fragments_histogram.statcol = ['iaverage', 'maxx', 'istddev']
    print(extension_fragments_histogram.typeset(tab=tab))
    exit(0)
    # --- unreachable below: staged experiments kept for reference ---
    for fstype in fstypes:
        for ftype in ['jpg', 'pdf', 'doc', 'txt']:
            len1stats = statbag()
            len2stats = statbag()
            delta_hist = histogram()
            # Two dash-separated run pairs, e.g. "100-200 300-400".
            # NOTE(review): whitespace inside this pattern may have been
            # mangled in transit — confirm against the original file.
            delta_re = re.compile(r"(\d+)\-?(\d+)? ?(\d+)\-?(\d+)?")
            # NOTE(review): f.fragments is compared without calling it here,
            # though it is called as fi.fragments() above — confirm intent.
            for i in filter(lambda f: f.ext() == ftype and f.fragments == 2, all_files):
                runs = False
                if hasattr(i, 'block_runs'):
                    runs = i.block_runs
                if hasattr(i, 'sector_runs'):
                    runs = i.sector_runs
                if not runs:
                    continue
                m = delta_re.search(runs)
                r = []
                for j in range(1, 5):
                    try:
                        r.append(int(m.group(j)))
                    except TypeError:
                        # Optional group missing: fall back to prior group.
                        r.append(int(m.group(j - 1)))
                len1 = r[1] - r[0] + 1
                len2 = r[3] - r[2] + 1
                delta = r[2] - r[1]
                len1stats.addx(len1)
                len2stats.addx(len2)
                delta_hist.add(delta)
            if len1stats.count() > 0:
                print("\n\n")
                print("fstype:", fstype, " ftype:", ftype)
                print("len1 average: %f stddev: %f" % (len1stats.average(), len1stats.stddev()))
                print("len2 average: %f stddev: %f" % (len2stats.average(), len2stats.stddev()))
                print("delta average: %f" % delta_hist.average())
                print("delta histogram:")
                delta_hist.print_top(10)
    exit(0)
    print("Partition histogram:")
    partition_histogram.print_top(n=100)
    print("Counts by extension:")
    extension_len_histogram.print_top(n=100)
    print("Fragments by extension:")
    extension_fragments_histogram.print_top(n=100)
    exit(0)
    for fstype in fstypes:
        if fstype == '(unrecognized)':
            continue
        print(fstype, "Partitions:")

        def isfstype(x):
            return x.fstype == fstype

        these_parts = filter(isfstype, all_parts)
        these_files = []
        for part in these_parts:
            these_files.extend(part.files)
        print(fragmentation_table(these_files))
    exit(0)
    sys.exit(0)
    #
    # Typeset information about file extensions
    #
    hist_exts = histogram2d()
    hist_exts.topn = 20
    for i in all_files:
        if i.size > 0 and i.fragments > 0:
            hist_exts.add(i.ext(), i.size)
    tab = table()
    tab.header = "File extension popularity and average size (suppressing 0-len files)"
    tab.col_headings = ['Ext', 'Count', 'Average Size', 'Max', 'Std Dev']
    tab.omit_row = [[0, '']]
    hist_exts.statcol = ['iaverage', 'maxx', 'istddev']
    print(hist_exts.typeset(t=tab))
    hist_exts = histogram2d()
    hist_exts.topn = 20
    for i in all_files:
        if i.fragments > 0:
            hist_exts.add(i.ext(), i.fragments)
    tab = table()
    tab.header = "Fragmentation by file extension (suppressing files with 0 fragments)"
    tab.col_headings = ['Ext', 'Count', 'Avg Fragments', 'Max', 'Std Dev']
    tab.omit_row = [[0, '']]
    hist_exts.statcol = ['average', 'maxx', 'stddev']
    print(hist_exts.typeset(t=tab))
    print("===========================")
    #
    # Typeset the File Systems on Drives table
    #
    tab = table()
    tab.header = "File Systems on Drives"
    tab.col_headings = ["FS Type", "Drives", "MBytes"]
    tab.col_totals = [1, 2]
    fstypeh.statcol = 'sumx'
    print(fstypeh.typeset(t=tab))
    #
    # Typeset overall fragmentation stats
    #
    print(fragmentation_table(all_files))
def generate_confusion():
    """Generate the confusion matrix from tallies already stored in db.

    Reads per-type classification tallies from db['RESULTS'], prints/writes
    a ttable where each row is a true type and each column the percentage
    classified as that type, and records summary accuracy figures in db.
    """
    if os.path.exists(expname("confusion.txt")):
        print("Confusion matrix already exists")
        return
    if not os.path.exists(model_file()):
        raise RuntimeError("Cannot generate confusion matrix. Model file {} does not exist".format(model_file()))
    print("Generating confusion matrix")
    t = ttable()
    db['test_blocksize'] = args.test_blocksize
    sceadan_score_rows = {}
    files_per_type = {}
    blocks_per_type = {}
    classified_types = set()
    for key, value in db['RESULTS'].items():
        temp = value.copy()
        sceadan_score_rows[key] = temp['TALLY']
        classified_types = classified_types.union(set(temp['TALLY'].keys()))
        files_per_type[key] = temp['FILE_COUNT']
        blocks_per_type[key] = temp['BLOCK_COUNT']
    # Renamed the loop variable so it does not shadow the ttable t above.
    classtypes = [ct.upper() for ct in sorted(classified_types)]
    t.append_head([' ', 'file ', 'block'])
    t.append_head(['type', 'count', 'count'] + classtypes)
    t.set_col_alignment(2, t.RIGHT)
    t.set_col_alignment(3, t.RIGHT)
    total_events = 0
    total_correct = 0
    percent_correct_sum = 0
    rowcounter = 0
    for ftype in filetypes():
        FTYPE = ftype.upper()
        tally = sceadan_score_rows[ftype]
        count = sum(tally.values())
        total_events += count
        total_correct += tally.get(FTYPE, 0)
        percent_correct = tally.get(FTYPE, 0) * 100.0 / count
        percent_correct_sum += percent_correct
        # Generate each row of the output.
        data = [FTYPE, files_per_type.get(FTYPE, 0), blocks_per_type.get(FTYPE, 0)] + \
               ["{:3.0f}".format(tally.get(f, 0) * 100.0 / count) for f in classtypes]
        t.append_data(data)
        rowcounter += 1
        if rowcounter % 5 == 0:
            t.append_data([''])     # blank separator row every five types
    f = openexp("confusion.txt", "w")
    f.write(t.typeset(mode='text'))
    print(t.typeset(mode='text'))
    db['overall_accuracy'] = (total_correct * 100.0) / total_events
    db['average_accuracy_per_class'] = percent_correct_sum / len(filetypes())

    def info(n):
        """Print a value from the database and print in confusion.txt"""
        try:
            v = str(db[n])[0:120]
            val = "{}: {}".format(n.replace("_", " "), v)
            f.write(val + "\n")
            print(val)
        except KeyError:
            f.write("key {} not found\n".format(n))
            print("Keys in database:", list(db.keys()))

    info('train_blocksize')
    info('test_blocksize')
    info('liblinear_train_command')
    info('overall_accuracy')
    info('average_accuracy_per_class')
    f.close()   # BUG FIX (resource leak): the report file was never closed
def process_files(fn): drive_files = {} # index of drives all_parts = [] all_files = [] files_by_md5 = {} # a dictionary of sets of fiobject, indexed by md5 extension_len_histogram = histogram2d() extension_fragments_histogram = histogram2d() partition_histogram = histogram2d() def cb(fi): # add the md5 to the set if fi.is_file() and fi.filesize(): files_by_md5.get(fi.md5, set()).add(fi) ext = fi.ext() if not ext: print fi.meta_type(), fi extension_len_histogram.add(ext, fi.filesize()) extension_fragments_histogram.add(ext, fi.fragments()) partition_histogram.add(fi.partition(), fi.filesize()) if fn.endswith('xml'): fiwalk.fiwalk_using_sax(xmlfile=open(fn), callback=cb) else: fiwalk.fiwalk_using_sax(imagefile=open(fn), callback=cb) # # Typeset the information # tab = ttable() tab.header = "File extension popularity and average size (suppressing 0-len files)" tab.col_headings = [['Ext', 'Count', 'Average Size', 'Max', 'Std Dev']] tab.omit_row = [[0, '']] extension_len_histogram.statcol = ['iaverage', 'maxx', 'istddev'] print extension_len_histogram.typeset(tab=tab) # # Information about fragmentation patterns # tab = ttable() tab.header = "Fragmentation pattern by file system and file type:" tab.col_headings = [['Ext', 'Count', 'Average Size', 'Max', 'Std Dev']] tab.omit_row = [[0, '']] extension_fragments_histogram.statcol = ['iaverage', 'maxx', 'istddev'] print extension_fragments_histogram.typeset(tab=tab) exit(0) for fstype in fstypes: for ftype in ['jpg', 'pdf', 'doc', 'txt']: len1stats = statbag() len2stats = statbag() delta_hist = histogram() delta_re = re.compile("(\d+)\-?(\d+)? 
?(\d+)\-?(\d+)?") for i in filter((lambda (f): f.ext() == ftype and f.fragments == 2), all_files): runs = False if (hasattr(i, 'block_runs')): runs = i.block_runs if (hasattr(i, 'sector_runs')): runs = i.sector_runs if not runs: continue m = delta_re.search(runs) r = [] for j in range(1, 5): try: r.append(int(m.group(j))) except TypeError: r.append(int(m.group(j - 1))) len1 = r[1] - r[0] + 1 len2 = r[3] - r[2] + 1 delta = r[2] - r[1] len1stats.addx(len1) len2stats.addx(len2) delta_hist.add(delta) if len1stats.count() > 0: print "\n\n" print "fstype:", fstype, " ftype:", ftype print "len1 average: %f stddev: %f" % (len1stats.average(), len1stats.stddev()) print "len2 average: %f stddev: %f" % (len2stats.average(), len2stats.stddev()) print "delta average: %f" % delta_hist.average() print "delta histogram:" delta_hist.print_top(10)
def process_files(fn): drive_files = {} # index of drives all_parts = [] all_files = [] files_by_md5 = {} # a dictionary of sets of fiobject, indexed by md5 extension_len_histogram = histogram2d() extension_fragments_histogram = histogram2d() partition_histogram = histogram2d() def cb(fi): # add the md5 to the set if fi.is_file() and fi.filesize(): files_by_md5.get(fi.md5,set()).add(fi) ext = fi.ext() if not ext: print fi.meta_type(),fi extension_len_histogram.add(ext,fi.filesize()) extension_fragments_histogram.add(ext,fi.fragments()) partition_histogram.add(fi.partition(),fi.filesize()) if fn.endswith('xml'): fiwalk.fiwalk_using_sax(xmlfile=open(fn),callback=cb) else: fiwalk.fiwalk_using_sax(imagefile=open(fn),callback=cb) # # Typeset the information # tab = ttable() tab.header = "File extension popularity and average size (suppressing 0-len files)" tab.col_headings = [['Ext','Count','Average Size','Max','Std Dev']] tab.omit_row = [[0,'']] extension_len_histogram.statcol = ['iaverage','maxx','istddev'] print extension_len_histogram.typeset(tab=tab) # # Information about fragmentation patterns # tab = ttable() tab.header="Fragmentation pattern by file system and file type:" tab.col_headings = [['Ext','Count','Average Size','Max','Std Dev']] tab.omit_row = [[0,'']] extension_fragments_histogram.statcol = ['iaverage','maxx','istddev'] print extension_fragments_histogram.typeset(tab=tab) exit(0) for fstype in fstypes: for ftype in ['jpg','pdf','doc','txt']: len1stats = statbag() len2stats = statbag() delta_hist = histogram() delta_re = re.compile("(\d+)\-?(\d+)? 
?(\d+)\-?(\d+)?") for i in filter( (lambda(f): f.ext()==ftype and f.fragments==2),all_files): runs = False if(hasattr(i,'block_runs')): runs = i.block_runs if(hasattr(i,'sector_runs')): runs = i.sector_runs if not runs: continue m = delta_re.search(runs) r = [] for j in range(1,5): try: r.append(int(m.group(j))) except TypeError: r.append(int(m.group(j-1))) len1 = r[1] - r[0] + 1 len2 = r[3] - r[2] + 1 delta = r[2]-r[1] len1stats.addx(len1) len2stats.addx(len2) delta_hist.add(delta) if len1stats.count()>0: print "\n\n" print "fstype:",fstype," ftype:",ftype print "len1 average: %f stddev: %f" % (len1stats.average(),len1stats.stddev()) print "len2 average: %f stddev: %f" % (len2stats.average(),len2stats.stddev()) print "delta average: %f" % delta_hist.average() print "delta histogram:" delta_hist.print_top(10)