def __init__(self, dname1, dname2, *, out, both=False, mode='text'):
    self.b1 = bulk_extractor_reader.BulkReport(dname1)
    self.b2 = bulk_extractor_reader.BulkReport(dname2)
    self.out = out
    self.both = both
    self.mode = mode
    self.only_features = set()
    self.only_features.update(self.b1.feature_files())
    self.only_features.update(self.b2.feature_files())
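# Usage sketch for the constructor above (illustration only; the enclosing
# class name "ReportComparer" and the report directory paths are hypothetical):
#
#     cmp = ReportComparer("pre_report", "post_report", out=sys.stdout, mode='html')
#     # cmp.only_features now holds every feature file seen in either report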
def datacheck_checkreport(outdir):
    """Reports on whether the output in outdir matches the datacheck report"""
    print("opening ", outdir)
    b = bulk_extractor_reader.BulkReport(outdir)
    found_features = {}
    print("Feature files:", list(b.feature_files()))
    print("Histogram files:", list(b.histogram_files()))
    for fn in b.feature_files():
        print("Reading feature file {}".format(fn))
        for (pos, feature, context) in b.read_features(fn):
            found_features[pos] = feature
    print("Now reading features from data_check.txt")
    not_found = {}
    report_mismatches = False
    found_count = 0
    for line in open("data_check.txt", "rb"):
        y = bulk_extractor_reader.parse_feature_line(line)
        if y:
            (pos, feature, context) = y
            if pos in found_features:
                found_count += 1
                # print("{} found".format(pos.decode('utf-8')))
                if found_features[pos] != feature and report_mismatches:
                    if found_features[pos] != b'<CACHED>' and feature != b'<CACHED>':
                        print("   {} != {}".format(feature, found_features[pos]))
            else:
                not_found[pos] = feature
    for pos in sorted(not_found):
        print("{} not found {}".format(pos, not_found[pos]))
    print("Total features found: {}".format(found_count))
    print("Total features not found: {}".format(len(not_found)))
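# Illustration only: bulk_extractor feature files are tab-separated
# "offset<TAB>feature<TAB>context" lines, which is why parse_feature_line()
# yields (pos, feature, context) triples of bytes. A minimal sketch of that
# parsing (the real parse_feature_line() in bulk_extractor_reader also
# handles encodings and more edge cases):
def _parse_feature_line_sketch(line):
    if line.startswith(b'#'):
        return None                        # comment line
    parts = line.rstrip(b'\r\n').split(b'\t')
    if len(parts) < 2:
        return None                        # malformed line
    while len(parts) < 3:
        parts.append(b'')                  # pad a missing context field
    return (parts[0], parts[1], parts[2])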
def make_zip(dname):
    archive_name = dname + ".zip"
    b = bulk_extractor_reader.BulkReport(dname)
    z = zipfile.ZipFile(archive_name, compression=zipfile.ZIP_DEFLATED, mode="w")
    print("Creating ZIP archive {}".format(archive_name))
    for fname in b.all_files:
        print("  adding {} ...".format(fname))
        z.write(os.path.join(dname, fname), arcname=os.path.basename(fname))
    z.close()    # finalize the archive's central directory
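# Usage sketch ("be_output" is a hypothetical report directory):
#
#     make_zip("be_output")    # writes be_output.zip next to the directory
#
# BulkReport() can open the resulting .zip directly, which is why
# validate_report() below accepts either a directory or a .zip file.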
def validate_report(fn):
    """Make sure all of the lines in all of the files in the outdir are UTF-8
    and that the feature files have 3 or more fields on each line."""
    import glob, os.path
    print("\nValidate Report: ", fn)
    if os.path.isdir(fn) or fn.endswith(".zip"):
        b = bulk_extractor_reader.BulkReport(fn)
        for ffn in b.feature_files():    # renamed so the report name 'fn' is not shadowed
            if os.path.basename(ffn) in str(args.ignore):
                print("** ignore {} **".format(ffn))
                continue
            validate_file(b.open(ffn, 'rb'), FEATURE_FILE)
    else:
        validate_file(open(fn, 'rb'))
def process(report, fsc):
    b1 = bulk_extractor_reader.BulkReport(report, do_validate=False)
    print("Reading email.txt")
    try:
        for line in b1.open("email.txt"):
            fsc.write(line)
    except KeyError:
        pass
    try:
        h = b1.read_histogram("email_histogram.txt")
        for a in h:
            all_emails.add(a)
    except KeyError:
        pass
    print("Processed {}; now {} unique emails".format(report, len(all_emails)))
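# Hypothetical driver for process() above: it assumes a module-level
# all_emails set and writes the combined email.txt lines (bytes) to fsc.
#
#     all_emails = set()
#     with open("combined_email.txt", "wb") as fsc:
#         for report in sys.argv[1:]:
#             process(report, fsc)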
def read_filemap():
    if args.xmlfile:
        rundb.read_xmlfile(args.xmlfile)
        if len(rundb) == 0:
            raise RuntimeError(
                "\nERROR: No files detected in XML file {}\n".format(args.xmlfile))
        return
    if args.image_filename:
        imagefile = args.image_filename
    else:
        imagefile = bulk_extractor_reader.BulkReport(
            args.bulk_extractor_report).image_filename()
    rundb.read_imagefile(imagefile)
    if len(rundb) == 0:
        raise RuntimeError(
            "\nERROR: No files detected in image file {}\n".format(imagefile))
def analyze_outdir(outdir):
    """Print statistics about an output directory"""
    print("Analyze {}".format(outdir))
    b = bulk_extractor_reader.BulkReport(outdir)
    print("bulk_extractor version: {}".format(b.version()))
    print("Filename: {}".format(b.image_filename()))

    # Print which scanners were run and how long they took
    analyze_reportxml(b.xmldoc)

    hfns = list(b.histogram_files())    # was b.histograms(); renamed to match the reader API used elsewhere
    print("")
    print("Histogram Files:        {}".format(len(hfns)))

    def get_firstline(fn):
        """Returns the first line that is not a comment"""
        for line in b.open(fn, 'rb'):
            if bulk_extractor_reader.is_comment_line(line):
                continue
            return line[:-1]

    for fn in sorted(hfns):
        h = b.read_histogram(fn)
        firstline = get_firstline(fn)
        if isinstance(firstline, bytes):
            firstline = firstline.decode('utf-8')
        print("  {:>25} entries: {:>10,}  (top: {})".format(fn, len(h), firstline))

    fnpart = ".".join(b.image_filename().split('/')[-1].split('.')[:-1])
    ffns = list(b.feature_files())
    print("")
    print("Feature Files:        {}".format(len(ffns)))
    for fn in sorted(ffns):
        lines = 0
        for line in b.open(fn, 'rb'):
            if not bulk_extractor_reader.is_comment_line(line):
                lines += 1
        print("  {:>25} features: {:>12,}  {}".format(
            fn, lines, analyze_warning(fnpart, fn, lines)))
def process(out, dname1, dname2):
    mode = 'text'
    if options.html:
        mode = 'html'
    b1 = bulk_extractor_reader.BulkReport(dname1)
    b2 = bulk_extractor_reader.BulkReport(dname2)

    t = ttable.ttable()
    t.append_data(['bulk_diff.py Version:', bulk_diff_version])
    t.append_data(['PRE Image:', b1.image_filename()])
    t.append_data(['POST Image:', b2.image_filename()])
    out.write(t.typeset(mode=mode))

    # Report files present in only one of the two reports
    for i in [1, 2]:
        if i == 1:
            a = b1; b = b2
        else:
            a = b2; b = b1
        r = a.files.difference(b.files)
        if r:
            print("Files only in {}:".format(a.name))
            for f in r:
                if ".txt" in f:
                    print("   %s (%d lines)" % (f, a.count_lines(f)))
                else:
                    print("   %s" % (f))

    # Report interesting differences based on the histograms.
    # Output Example:
    """
    # in PRE   # in POST       ∆   Feature
          10          20      10   [email protected]
           8          17       9   [email protected]
          11          16       5   [email protected]
    """
    b1_histograms = set(b1.histogram_files())
    b2_histograms = set(b2.histogram_files())
    common_histograms = b1_histograms.intersection(b2_histograms)
    if options.html:
        out.write("<ul>\n")
        for histogram_file in sorted(common_histograms):    # was the undefined name histogram_files
            out.write("<li><a href='#%s'>%s</a></li>\n" % (histogram_file, histogram_file))
        out.write("</ul>\n<hr/>\n")
    for histogram_file in sorted(common_histograms):
        diffcount = 0
        if options.html:
            out.write('<h2><a name="%s">%s</a></h2>\n' % (histogram_file, histogram_file))
        else:
            out.write('\n')
        t = ttable.ttable()
        t.set_col_alignment(0, t.RIGHT)
        t.set_col_alignment(1, t.RIGHT)
        t.set_col_alignment(2, t.RIGHT)
        t.set_col_alignment(3, t.LEFT)
        t.set_title(histogram_file)
        t.append_head(['# in PRE', '# in POST', '∆', 'Value'])
        b1.hist = b1.read_histogram(histogram_file)
        b2.hist = b2.read_histogram(histogram_file)
        b1.keys = set(b1.hist.keys())
        b2.keys = set(b2.hist.keys())
        # Create the output, then we will sort on col 1, 2 and 4
        data = []
        for feature in b1.keys.union(b2.keys):
            v1 = b1.hist.get(feature, 0)
            v2 = b2.hist.get(feature, 0)
            if v1 != v2:
                diffcount += 1
            if v2 > v1 or (v2 == v1 and options.same) or (v2 < v1 and options.smaller):
                data.append((v1, v2, v2 - v1, feature.decode('utf-8')))

        # Sort on the delta first, then value, then POST count, then PRE count
        def mysortkey(a):
            return (-a[2], a[3], a[1], a[0])

        if data:
            for row in sorted(data, key=mysortkey):
                t.append_data(row)
            out.write(t.typeset(mode=mode))
        if diffcount == 0:
            out.write("{}: No differences\n".format(histogram_file))

    if options.features:
        for feature_file in b1.feature_files():
            if feature_file not in b2.feature_files():
                continue
            print("Compare features", feature_file)
            for p in [1, 2]:
                if p == 1:
                    a = b1; b = b2
                else:
                    a = b2; b = b1    # was 'b = a', which compared a report to itself
                a_features = {}
                for line in a.open(feature_file):
                    r = bulk_extractor_reader.parse_feature_line(line)
                    if not r:
                        continue
                    a_features[r[0]] = r[1]
                for line in b.open(feature_file):
                    r = bulk_extractor_reader.parse_feature_line(line)
                    if not r:
                        continue
                    if r[0] not in a_features:
                        # the feature came from b and is missing from a, so it is only in b
                        print("{} {} is only in {}".format(r[0], r[1], b.name))
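# Worked example of the mysortkey() ordering above (illustration only): rows
# sort by descending delta, then by value, then by POST and PRE counts, so
#
#     sorted([(11,16,5,'c'), (10,20,10,'a'), (8,17,9,'b')], key=mysortkey)
#
# returns [(10,20,10,'a'), (8,17,9,'b'), (11,16,5,'c')]: the largest
# increases are listed first, matching the example table in the comment.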
def ingest(report_fn):
    import time
    c = conn.cursor()
    c.execute("select count(*) from drives where report_fn=?", (report_fn,))
    if c.fetchone()[0] > 0 and not args.reimport:
        print("{} already imported".format(report_fn))
        return
    try:
        br = bulk_extractor_reader.BulkReport(report_fn)
        image_filename = br.image_filename()
    except IndexError:
        print("No image filename in bulk_extractor report for {}; will not ingest".format(report_fn))
        return
    except (OSError, KeyError):
        print("Cannot open {}; will not ingest".format(report_fn))
        return
    if not args.reimport:
        driveid = get_driveid(image_filename, create=False)
        if driveid:
            print("{} already imported".format(image_filename))
            return
    driveid = get_driveid(image_filename, report_fn, create=True)
    print("Ingesting {} as driveid {}".format(br.image_filename(), driveid))
    t0 = time.time()

    if args.reimport:
        # Make sure that this driveid is not in the feature tables
        c.execute("DELETE FROM feature_drive_counts where driveid=?", (driveid,))

    # initial version: we ingest search terms, winpe executables, and email addresses
    for (search, count) in br.read_histogram_entries("url_searches.txt"):
        if search.startswith(b"cache:"):
            continue
        featureid = get_featureid(search)
        c.execute("INSERT INTO feature_drive_counts (driveid,feature_type,featureid,count) values (?,?,?,?);",
                  (driveid, SEARCH_TYPE, featureid, count))

    # Add counts for email addresses
    for (email, count) in br.read_histogram_entries("email_histogram.txt"):
        # print("Add email {} = {}".format(email,count))
        featureid = get_featureid(email)
        c.execute("INSERT INTO feature_drive_counts (driveid,feature_type,featureid,count) values (?,?,?,?);",
                  (driveid, EMAIL_TYPE, featureid, count))

    # Add hashes for Windows executables
    import collections
    pe_header_counts = collections.Counter()
    for r in br.read_features("winpe.txt"):
        try:
            (pos, feature, context) = r
            featureid = get_featureid(feature)
            pe_header_counts[featureid] += 1
        except ValueError:
            print("got {} values".format(len(r)))
    for (featureid, count) in pe_header_counts.items():
        c.execute("INSERT INTO feature_drive_counts (driveid,feature_type,featureid,count) values (?,?,?,?);",
                  (driveid, WINPE_TYPE, featureid, count))

    conn.commit()
    t1 = time.time()
    print("Driveid {} imported in {} seconds\n".format(driveid, t1 - t0))
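# The queries in ingest() above imply a schema roughly like the following
# (an assumption reconstructed from the SQL in this function, not actual DDL):
#
#   CREATE TABLE drives (driveid INTEGER PRIMARY KEY,
#                        image_filename TEXT, report_fn TEXT);
#   CREATE TABLE feature_drive_counts (driveid INTEGER, feature_type INTEGER,
#                                      featureid INTEGER, count INTEGER);
#
# SEARCH_TYPE, EMAIL_TYPE, and WINPE_TYPE are module-level constants that tag
# each row with the kind of feature being counted.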
def analyze_outdir(outdir):
    """Print statistics about an output directory"""
    print("Analyze {}".format(outdir))
    b = bulk_extractor_reader.BulkReport(outdir)
    print("bulk_extractor version: {}".format(b.version()))
    print("Image filename:         {}".format(b.image_filename()))

    # Print which scanners were run and how long they took
    analyze_reportxml(b.xmldoc)

    hfns = list(b.histogram_files())    # histogram files
    print("")
    print("Histogram Files:        {}".format(len(hfns)))

    def get_firstline(fn):
        """Returns the first line that is not a comment"""
        for line in b.open(fn, 'rb'):
            if bulk_extractor_reader.is_comment_line(line):
                continue
            return line[:-1]

    for fn in sorted(hfns):
        h = b.read_histogram(fn)
        firstline = get_firstline(fn)
        if isinstance(firstline, bytes):
            firstline = firstline.decode('utf-8')
        print("  {:>25} entries: {:>10,}  (top: {})".format(fn, len(h), firstline))

    fnpart = ".".join(b.image_filename().split('/')[-1].split('.')[:-1])
    ffns = sorted(list(b.feature_files()))
    if ffns:
        features = {}
        print("")
        print("Feature Files:        {}".format(len(ffns)))
        for fn in ffns:                 # feature files
            lines = 0
            for line in b.open(fn, 'r'):
                if not bulk_extractor_reader.is_comment_line(line):
                    lines += 1
            features[fn] = lines
            print("  {:>25} features: {:>12,}  {}".format(
                fn, lines, analyze_warning(fnpart, fn, lines)))

    # If there is a SQLite database, analyze that too!
    if args.featurefile and args.featuresql:
        import sqlite3
        conn = sqlite3.connect(os.path.join(outdir, "report.sqlite"))
        if conn:
            c = conn.cursor()
            c.execute("PRAGMA cache_size = 200000")
            print("Comparing SQLite3 database to feature files:")
            for fn in ffns:
                try:
                    table = "f_" + fn.lower().replace(".txt", "")
                    cmd = "select count(*) from " + table
                    print(cmd)
                    c.execute(cmd)
                    ct = c.fetchone()[0]
                    print("{}:  {}  {}".format(fn, features[fn], ct))
                    # Now check them all to make sure that they all match
                    count = 0
                    for line in b.open(fn, 'r'):
                        ary = bulk_extractor_reader.parse_feature_line(line)
                        if ary:
                            (path, feature) = ary[0:2]
                            path = path.decode('utf-8')
                            feature = feature.decode('utf-8')
                            c.execute("select count(*) from " + table + " where path=? and feature_eutf8=?",
                                      (path, feature))
                            ct = c.fetchone()[0]
                            if ct == 1:
                                # print("feature {} {} in table {} ({})".format(path,feature,table,ct))
                                pass
                            if ct == 0:
                                print("feature {} {} not in table {} ({})".format(path, feature, table, ct))
                                count += 1
                            if count > args.featuretest:
                                break
                except sqlite3.OperationalError as e:
                    print(e)
def process(self):
    ber = bulk_extractor_reader.BulkReport(self.fn, do_validate=False)
    for ff in ber.feature_files():
        if ff in ignored_features:
            continue
        print("Processing {} in {}".format(ff, self.fn))
        self.process_feature_file(ber, ff)
def process(out, dname1, dname2):
    mode = 'text'
    if options.html:
        mode = 'html'
    b1 = bulk_extractor_reader.BulkReport(dname1)
    b2 = bulk_extractor_reader.BulkReport(dname2)

    t = ttable.ttable()
    t.append_data(['bulk_diff.py Version:', bulk_diff_version])
    t.append_data(['PRE Image:', b1.image_filename()])
    t.append_data(['POST Image:', b2.image_filename()])
    out.write(t.typeset(mode=mode))

    if b1.files.difference(b2.files):
        print("Files only in %s:\n   %s" % (b1.name, " ".join(b1.files.difference(b2.files))))
    if b2.files.difference(b1.files):
        print("Files only in %s:\n   %s" % (b2.name, " ".join(b2.files.difference(b1.files))))

    # Report interesting differences based on the histograms.
    # Output Example:
    """
    # in PRE   # in POST       ∆   Feature
          10          20      10   [email protected]
           8          17       9   [email protected]
          11          16       5   [email protected]
    """
    common_files = b1.files.intersection(b2.files)
    # Materialize into a list; a filter object would be exhausted by the
    # HTML index loop below and the main loop would then see nothing.
    histogram_files = [a for a in common_files if "histogram" in a]
    if options.html:
        out.write("<ul>\n")
        for histogram_file in sorted(histogram_files):
            out.write("<li><a href='#%s'>%s</a></li>\n" % (histogram_file, histogram_file))
        out.write("</ul>\n<hr/>\n")
    for histogram_file in sorted(histogram_files):
        diffcount = 0    # reset per histogram file, so "No differences" is per-file
        if options.html:
            out.write('<h2><a name="%s">%s</a></h2>\n' % (histogram_file, histogram_file))
        else:
            out.write('\n')
        t = ttable.ttable()
        t.set_col_alignment(0, t.RIGHT)
        t.set_col_alignment(1, t.RIGHT)
        t.set_col_alignment(2, t.RIGHT)
        t.set_col_alignment(3, t.LEFT)
        t.set_title(histogram_file)
        t.append_head(['# in PRE', '# in POST', '∆', 'Value'])
        b1.hist = b1.read_histogram(histogram_file)
        b2.hist = b2.read_histogram(histogram_file)
        b1.keys = set(b1.hist.keys())
        b2.keys = set(b2.hist.keys())
        # Create the output, then we will sort on col 1, 2 and 4
        data = []
        for feature in b1.keys.union(b2.keys):
            v1 = b1.hist.get(feature, 0)
            v2 = b2.hist.get(feature, 0)
            if v1 != v2:
                diffcount += 1
            if v2 > v1 or (v2 == v1 and options.same) or (v2 < v1 and options.smaller):
                data.append((v1, v2, v2 - v1, feature))

        # Sort on the delta first, then value, then POST count, then PRE count
        def mysortkey(a):
            return (-a[2], a[3], a[1], a[0])

        if data:
            for row in sorted(data, key=mysortkey):
                t.append_data(row)
            out.write(t.typeset(mode=mode))
        if diffcount == 0:
            out.write("{}: No differences\n\n".format(histogram_file))
if args.path:
    read_filemap()
    print("Locating {}: ".format(args.path))
    res = rundb.search_path(args.path.encode('utf-8'))
    if res:
        print("Start: {}\nLength: {}\nFile Name: {}\nFile MD5: {}".format(
            res[0], res[1], res[2], res[3]))
    else:
        print("NOT FOUND")
    exit(0)

# Open the report
try:
    report = bulk_extractor_reader.BulkReport(args.bulk_extractor_report)
except UnicodeDecodeError:
    print("{}/report.xml file contains invalid XML. Cannot continue\n".format(
        args.bulk_extractor_report))
    exit(1)

if args.list:
    print("Feature files in {}:".format(args.bulk_extractor_report))
    for fn in report.feature_files():
        print(fn)
    exit(1)

# Make sure that the user has specified feature files
if not args.featurefiles and not args.all:
    raise RuntimeError("Please request a specific feature file or --all feature files")
# Create the correlators, one for each feature file
correlators = set()
for name in args.idfeatures.split(","):
    correlators.add(Correlator(name))

# Create the br readers, one for each report
br_readers = set()
for fname in args.reports:
    # On windows the '*' may not be expanded....
    if '*' in fname:
        fns = glob.glob(fname)
    else:
        fns = [fname]
    for fn in fns:
        try:
            br_readers.add(bulk_extractor_reader.BulkReport(fn))
        except IOError:
            print("{} is an invalid bulk_extractor report. Cannot continue. STOP.\n".format(fn))
            exit(1)

# Now read each feature file from each reader.
# Either ingest (in the case of cda) or create the context stop list (if making combined)
for c in correlators:
    context_stop_list = set()
    for br in br_readers:
        b = br.open(c.name + ".txt", mode='r')
        if args.makecombined:
            count = c.ingest_feature_file(b, context_stop_list)
        else:
            count = c.ingest_feature_file(b, None)

if args.makecombined: