def ingest_feature_file(self, f, context_stop_list):
    """Read the lines in a feature file; returns how many lines were processed."""
    drivename = None
    count = 0
    for line in f:
        if isinstance(line, bytes):
            line = line.decode('utf-8')
        m = bulk_extractor_reader.get_property_line(line)
        if m:
            if m[0] == 'Filename':
                drivename = m[1]
                self.drives.add(drivename)
                print("Scanning {} for {}".format(drivename, self.name))
        if bulk_extractor_reader.is_comment_line(line):
            continue
        count += 1
        if context_stop_list is not None:
            (offset, feature, context) = line.split("\t")
            context_stop_list.add((feature, context))
            continue
        feature = line.split("\t")[1]
        featuredict = self.features[feature]
        featuredict[drivename] = featuredict.get(drivename, 0) + 1
    print(" processed {} features".format(count))
    return count
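# A minimal usage sketch (an addition, not from the original source): drive
# ingest_feature_file with an in-memory feature file.  The Ingester class and
# the sample data are hypothetical; the tab-separated offset/feature/context
# line format and the attributes used (name, drives, features) are inferred
# from the function above, and bulk_extractor_reader must be importable.
import io
import collections

class Ingester:
    def __init__(self, name):
        self.name = name
        self.drives = set()
        self.features = collections.defaultdict(dict)

Ingester.ingest_feature_file = ingest_feature_file   # reuse the function above

demo = io.StringIO("# Filename: drive1.E01\n"
                   "12345\talice@example.com\t...context...\n")
Ingester("email").ingest_feature_file(demo, None)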
def validate_file(f, kind):
    kml = "kml" in f.name
    linenumber = 0
    print("Validate UTF-8 encoding in ", f.name)
    for lineb in f:
        linenumber += 1
        lineb = lineb[:-1]      # remove the \n
        try:
            line = lineb.decode('utf-8')
        except UnicodeDecodeError as e:
            print("{}:{} {} {}".format(f.name, linenumber, str(e), asbinary(lineb)))
            continue
        if bulk_extractor_reader.is_comment_line(line):
            continue            # don't test comments
        if bulk_extractor_reader.is_histogram_line(line):
            continue            # don't test
        if kind == FEATURE_FILE:
            fields = line.split("\t")
            r = invalid_feature_file_line(line, fields)
            if r:
                print("{}: {:8} {} Invalid feature file line: {}".format(
                    f.name, linenumber, r, line))
            if kml and fields[1].count("kml") != 2:
                print("{}: {:8} Invalid KML line: {}".format(
                    f.name, linenumber, line))
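# Hedged sketch (not the library's actual code): approximate behavior of the
# two bulk_extractor_reader predicates used throughout this file, inferred
# from how the callers treat their input.  Comment lines start with '#';
# histogram lines carry a tab-separated "n=COUNT" prefix (see the
# int(fields[0][2:]) parse in ingest_histogram_file below).
import re

def is_comment_line_sketch(line):
    """True for blank lines and lines beginning with '#'."""
    return (not line) or line.lstrip().startswith("#")

def is_histogram_line_sketch(line):
    """True for lines of the form 'n=COUNT<TAB>FEATURE...'."""
    return re.match(r"n=\d+\t", line) is not None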
def process_featurefile2(rundb, infile, outfile):
    """Reads features from infile, determines the file for each, writes results to outfile"""
    # Stats
    unallocated_count = 0
    feature_count = 0
    features_encoded = 0
    located_count = 0
    if args.terse:
        outfile.write(b"# Position\tFeature\tFilename\n")
    else:
        outfile.write(b"# Position\tFeature\tContext\tFilename\tFile MD5\n")
    t0 = time.time()
    for line in infile:
        if bulk_extractor_reader.is_comment_line(line):
            outfile.write(line)
            continue
        (offset, feature, context) = line[:-1].split(b'\t')
        feature_count += 1
        if b"-" in offset:
            ioffset = decode_path_offset(offset)
            features_encoded += 1
        else:
            ioffset = int(offset)
        tpl = rundb.search(ioffset)
        if tpl:
            located_count += 1
            fname = tpl[2].encode('utf-8')  # THIS MIGHT GENERATE A UNICODE ERROR
            if tpl[3]:
                md5val = tpl[3].encode('utf-8')
            else:
                md5val = b""
        else:
            unallocated_count += 1
            fname = b""
            md5val = b""
        outfile.write(offset)
        outfile.write(b'\t')
        outfile.write(feature)
        if not args.terse:
            outfile.write(b'\t')
            outfile.write(context)
        outfile.write(b'\t')
        outfile.write(fname)
        if not args.terse:
            outfile.write(b'\t')
            outfile.write(md5val)
        outfile.write(b'\n')
    t1 = time.time()
    for (title, value) in [["# Total features input: {}", feature_count],
                           ["# Total features located to files: {}", located_count],
                           ["# Total features in unallocated space: {}", unallocated_count],
                           ["# Total features in encoded regions: {}", features_encoded],
                           ["# Total processing time: {:.2f} seconds", t1 - t0]]:
        outfile.write((title + "\n").format(value).encode('utf-8'))
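# decode_path_offset() is called above but not shown here.  A minimal sketch
# under the assumption (suggested by the b"-" test) that encoded offsets are
# forensic paths such as b"12345-GZIP-678", whose leading numeric component
# is the byte offset of the encoded region in the image:
def decode_path_offset_sketch(offset):
    """Return the leading integer of a forensic path like b'12345-GZIP-678'."""
    return int(offset.split(b"-")[0])

assert decode_path_offset_sketch(b"12345-GZIP-678") == 12345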
def bulk_extractor_ips(beoutdir):
    """
    A generator for the data lines in Bulk Extractor's ip.txt file (the IP
    addresses extracted from binary structures, not from text).
    """
    if not (os.path.isdir(beoutdir) and os.path.exists(os.path.join(beoutdir, "report.xml"))):
        raise ValueError("Bulk Extractor input is not a BE output directory\n\tParameter: %r." % beoutdir)
    # This BE feature file should not be opened in binary mode
    with open(os.path.join(beoutdir, "ip.txt"), "r") as ip_file:
        for line in ip_file:
            if bulk_extractor_reader.is_comment_line(line):
                continue
            line_parts = line[:-1].split("\t")
            if len(line_parts) != 3:
                raise ValueError("Bulk Extractor ip.txt file has line in unexpected format; halting to await code revisions.\n\tLine with characters escaped: %r." % line)
            yield tuple(line_parts)
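# Usage sketch (the directory path is illustrative): per the code above each
# yielded tuple has three tab-separated fields -- offset, IP feature, context.
for (offset, ip, context) in bulk_extractor_ips("/cases/be_output"):
    print(offset, ip)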
def sample(outdir, fn):
    line_numbers = get_lines_array(report.open(fn, "r"))
    count = min(args.count, len(line_numbers))
    print("{} has {} lines".format(fn, len(line_numbers)))
    lines_to_sample = sorted(random.sample(line_numbers, count))
    line_number = 0
    with open(os.path.join(outdir, fn), "w") as out:
        out.write("# -*- mode:text; truncate-lines:t -*-\n")
        out.write("# Sampled {} out of {}\n".format(count, len(line_numbers)))
        out.write("# Place '=' or 'y' in front of correct classifications and '-' or 'x' in front of incorrect ones\n")
        with report.open(fn) as f:
            for line in f:
                line_number += 1
                if is_comment_line(line):
                    out.write(line.decode('utf-8'))
                if line_number in lines_to_sample:
                    out.write("{}:\t".format(line_number))
                    out.write(line.decode('utf-8'))
def ingest_histogram_file(self, f):
    drivename = None
    for line in f:
        if isinstance(line, bytes):
            line = line.decode('utf-8')
        m = bulk_extractor_reader.get_property_line(line)
        if m:
            if m[0] == 'Filename':
                drivename = m[1]
                self.drives.add(drivename)
                print("Scanning {} for {}".format(drivename, self.name))
            continue
        if bulk_extractor_reader.is_comment_line(line):
            continue
        fields = line.split("\t")
        count = int(fields[0][2:])      # histogram lines start "n=COUNT"
        feature = fields[1].strip()
        featuredict = self.features[feature]
        featuredict[drivename] = featuredict.get(drivename, 0) + count
def analyze_outdir(outdir):
    """Print statistics about an output directory"""
    print("Analyze {}".format(outdir))
    b = bulk_extractor_reader.BulkReport(outdir)
    print("bulk_extractor version: {}".format(b.version()))
    print("Filename: {}".format(b.imagefile()))

    # Print which scanners were run and how long they took
    analyze_reportxml(b.xmldoc)

    hfns = list(b.histograms())
    print("")
    print("Histogram Files: {}".format(len(hfns)))

    def get_firstline(fn):
        """Returns the first line that is not a comment"""
        for line in b.open(fn, 'rb'):
            if bulk_extractor_reader.is_comment_line(line):
                continue
            return line[:-1]

    for fn in sorted(hfns):
        h = b.read_histogram(fn)
        firstline = get_firstline(fn)
        if isinstance(firstline, bytes):
            firstline = firstline.decode('utf-8')
        print(" {:>25} entries: {:>10,} (top: {})".format(fn, len(h), firstline))

    fnpart = ".".join(b.imagefile().split('/')[-1].split('.')[:-1])
    ffns = list(b.feature_files())
    print("")
    print("Feature Files: {}".format(len(ffns)))
    for fn in sorted(ffns):
        lines = 0
        for line in b.open(fn, 'rb'):
            if not bulk_extractor_reader.is_comment_line(line):
                lines += 1
        print(" {:>25} features: {:>12,} {}".format(fn, lines, analyze_warning(fnpart, fn, lines)))
def analyze_outdir(outdir):
    """Print statistics about an output directory"""
    print("Analyze {}".format(outdir))
    b = bulk_extractor_reader.BulkReport(outdir)
    print("bulk_extractor version: {}".format(b.version()))
    print("Image filename: {}".format(b.image_filename()))

    # Print which scanners were run and how long they took
    analyze_reportxml(b.xmldoc)

    hfns = list(b.histogram_files())
    print("")
    print("Histogram Files: {}".format(len(hfns)))

    def get_firstline(fn):
        """Returns the first line that is not a comment"""
        for line in b.open(fn, 'rb'):
            if bulk_extractor_reader.is_comment_line(line):
                continue
            return line[:-1]

    for fn in sorted(hfns):
        h = b.read_histogram(fn)
        firstline = get_firstline(fn)
        if isinstance(firstline, bytes):
            firstline = firstline.decode('utf-8')
        print(" {:>25} entries: {:>10,} (top: {})".format(fn, len(h), firstline))

    fnpart = ".".join(b.image_filename().split('/')[-1].split('.')[:-1])
    ffns = list(b.feature_files())
    print("")
    print("Feature Files: {}".format(len(ffns)))
    for fn in sorted(ffns):
        lines = 0
        for line in b.open(fn, 'rb'):
            if not bulk_extractor_reader.is_comment_line(line):
                lines += 1
        print(" {:>25} features: {:>12,} {}".format(fn, lines, analyze_warning(fnpart, fn, lines)))
def get_lines_array(f):
    """Returns an array of integers corresponding to each line in the feature file"""
    line_number = 0
    line_numbers = []
    if args.pattern:
        pattern = args.pattern.encode('utf-8')
    else:
        pattern = None
    if args.xpattern:
        xpattern = args.xpattern.encode('utf-8')
    else:
        xpattern = None
    for line in f:
        line_number += 1
        if pattern and pattern not in line.split(b"\t")[0]:
            continue
        if xpattern and xpattern in line.split(b"\t")[0]:
            continue
        if is_comment_line(line):
            continue
        line_numbers.append(line_number)
    return line_numbers
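# Usage sketch with an in-memory stream and a stand-in for the module-level
# 'args' namespace the function reads (field names taken from the code);
# is_comment_line must be importable.  Note that both patterns are matched
# against the first tab-separated field of each line.
import io
import argparse

args = argparse.Namespace(pattern="alice", xpattern=None)
demo = io.BytesIO(b"# comment line\n"
                  b"alice@example.com\t...\n"
                  b"bob@example.com\t...\n")
print(get_lines_array(demo))    # -> [2]: only the line whose first field matches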
def validate_openfile(f):
    fn = f.name
    if fn.endswith('.xml') or fn.endswith('.dmp') or fn.endswith("_tags.txt") or "wordlist" in fn:
        is_feature_file = False
        return
    else:
        is_feature_file = True
    # now read
    linenumber = 0
    print("Validate ", fn)
    for lineb in f:
        linenumber += 1
        lineb = lineb[:-1]
        try:
            line = lineb.decode('utf-8')
        except UnicodeDecodeError as e:
            print("{}:{} {} {}".format(fn, linenumber, str(e), asbinary(lineb)))
            continue            # 'line' is unbound after a decode error
        if bulk_extractor_reader.is_comment_line(line):
            continue            # don't test
        if bulk_extractor_reader.is_histogram_line(line):
            continue            # don't test
        if is_feature_file and not valid_feature_file_line(line):
            print("{}: {:8} Invalid feature file line: {}".format(fn, linenumber, line))
def process_featurefile2(rundb, infile, outfile):
    """Reads features from infile, determines the file for each, writes results to outfile"""
    # Stats
    unallocated_count = 0
    feature_count = 0
    features_encoded = 0
    located_count = 0

    outfile.write(b"# Position\tFeature")
    if not args.terse:
        outfile.write(b"\tContext")
    outfile.write(b"\tFilename\tMD5")
    if args.mactimes:
        outfile.write(b"\tcrtime\tctime\tmtime\tatime")
    outfile.write(b"\n")
    outfile.write(b"# " + cmd_line() + b"\n")
    t0 = time.time()
    linenumber = 0
    for line in infile:
        linenumber += 1
        if bulk_extractor_reader.is_comment_line(line):
            outfile.write(line)
            continue
        try:
            (path, feature, context) = line[:-1].split(b'\t')
        except ValueError as e:
            print(e)
            print("Offending line {}:".format(linenumber), line[:-1])
            continue
        feature_count += 1
        # Increment counter if this feature was encoded
        if b"-" in path:
            features_encoded += 1
        # Search for feature in database
        tpl = rundb.search_path(path)
        # Output to annotated feature file
        outfile.write(path)
        outfile.write(b'\t')
        outfile.write(feature)
        if not args.terse:
            outfile.write(b'\t')
            outfile.write(context)
        # If we found the data, output that
        if tpl:
            located_count += 1
            outfile.write(b'\t')
            outfile.write(b'\t'.join(tpl[2]))   # just the file info
        else:
            unallocated_count += 1
        outfile.write(b'\n')
        if args.debug:
            print("path=", path, "tpl=", tpl, "located_count=", located_count)
    t1 = time.time()
    for (title, value) in [["# Total features input: {}", feature_count],
                           ["# Total features located to files: {}", located_count],
                           ["# Total features in unallocated space: {}", unallocated_count],
                           ["# Total features in encoded regions: {}", features_encoded],
                           ["# Total processing time: {:.2f} seconds", t1 - t0]]:
        outfile.write((title + "\n").format(value).encode('utf-8'))
    return (feature_count, located_count)
def process_featurefile(args, report, featurefile):
    # Counters for the summary report
    global file_count
    features = featuredb()
    unallocated_count = 0
    feature_count = 0
    features_compressed = 0
    located_count = 0
    unicode_encode_errors = 0
    unicode_decode_errors = 0
    file_count = 0
    t0 = time.time()        # start the timer used to calculate the total run time
    ofn = os.path.join(args.outdir, ("annotated_" + featurefile))
    if os.path.exists(ofn):
        raise RuntimeError(ofn + " exists")
    of = open(ofn, "wb")

    # First read the feature files
    print("Adding features from " + featurefile)
    try:
        linenumber = 0
        for line in report.open(featurefile, mode='rb'):
            # Read the file in binary and convert to unicode if possible
            linenumber += 1
            if bulk_extractor_reader.is_comment_line(line):
                continue
            try:
                fset = features.add_featurefile_line(line[0:-1])
                feature_count += 1
                if (b"ZIP" in fset[0]) or (b"HIBER" in fset[0]):
                    features_compressed += 1
                del fset
            except ValueError:
                raise RuntimeError("Line {} in feature file {} is invalid: {}".format(linenumber, featurefile, line))
    except IOError:
        print("Error: Failed to open feature file '%s'" % featurefile)
        exit(1)

    if args.debug:
        print('')
        features.print_debug()

    # feature2fi maps each feature to the file in which it was found
    feature2fi = {}

    ################################################################
    # If we got features in the featuredb, find out the file that each one came from
    # by scanning all of the files and, for each byte run, indicating the features
    # that are within the byte run
    if features.count() > 0:
        def process(fi):
            global file_count
            file_count += 1
            if args.verbose or args.debug:
                print("%d %s (%d fragments)" % (file_count, fi.filename(), fi.fragments()))
            for run in fi.byte_runs():
                for (offset, fset) in features.search(run):
                    if args.debug:
                        print(" run={} offset={} fset={} ".format(run, offset, fset))
                    feature2fi[findex(fset)] = fi   # for each of those features, note that it is in this file
            if file_count % 1000 == 0:
                print("Processed %d fileobjects in DFXML file" % file_count)

        xmlfile = None
        if args.xmlfile:
            xmlfile = args.xmlfile
        else:
            if args.imagefile:
                imagefile = args.imagefile
            else:
                imagefile = report.imagefile()
            # See if there is an xmlfile
            (root, ext) = os.path.splitext(imagefile)
            possible_xmlfile = root + ".xml"
            if os.path.exists(possible_xmlfile):
                xmlfile = possible_xmlfile
        if xmlfile:
            print("Using XML file " + xmlfile)
            fiwalk.fiwalk_using_sax(xmlfile=open(xmlfile, 'rb'), callback=process)
        else:
            print("Running fiwalk on " + imagefile)
            fiwalk.fiwalk_using_sax(imagefile=open(imagefile, 'rb'), callback=process)
    else:
        print("No features found; copying feature file")

    ################################################################
    print("Generating output...")

    # Now print all of the features
    if args.terse:
        of.write(b"# Position\tFeature\tFilename\n")
    else:
        of.write(b"# Position\tFeature\tContext\tFilename\tFile MD5\n")
    for (offset, fset) in features:
        try:
            of.write(fset[0])           # pos
            of.write(b"\t")
            of.write(fset[1])           # feature
            of.write(b"\t")
            try:
                if not args.terse:
                    of.write(fset[2])   # context
            except IndexError:
                pass                    # no context
            try:
                fi = feature2fi[findex(fset)]
                of.write(b"\t")
                if fi.filename():
                    of.write(fi.filename().encode('utf-8'))
                if args.debug:
                    print("pos=", offset, "feature=", fset[1], "fi=", fi, "fi.filename=", fi.filename())
                if not args.terse:
                    of.write(b"\t")
                    if fi.md5():
                        of.write(fi.md5().encode('utf-8'))
                located_count += 1
            except KeyError:
                unallocated_count += 1  # cannot locate
            of.write(b"\n")
        except UnicodeEncodeError:
            unicode_encode_errors += 1
            of.write(b"\n")
        except UnicodeDecodeError:
            unicode_decode_errors += 1
            of.write(b"\n")

    # stop the timer used to calculate the total run time
    t1 = time.time()

    # Summary report
    for (title, value) in [["# Total features input: {}", feature_count],
                           ["# Total features located to files: {}", located_count],
                           ["# Total features in unallocated space: {}", unallocated_count],
                           ["# Total features in compressed regions: {}", features_compressed],
                           ["# Unicode Encode Errors: {}", unicode_encode_errors],
                           ["# Unicode Decode Errors: {}", unicode_decode_errors],
                           ["# Total processing time: {:.2f} seconds", t1 - t0]]:
        of.write((title + "\n").format(value).encode('utf-8'))
def analyze_outdir(outdir):
    """Print statistics about an output directory"""
    print("Analyze {}".format(outdir))
    b = bulk_extractor_reader.BulkReport(outdir)
    print("bulk_extractor version: {}".format(b.version()))
    print("Image filename: {}".format(b.image_filename()))

    # Print which scanners were run and how long they took
    analyze_reportxml(b.xmldoc)

    hfns = list(b.histogram_files())    # histogram files
    print("")
    print("Histogram Files: {}".format(len(hfns)))

    def get_firstline(fn):
        """Returns the first line that is not a comment"""
        for line in b.open(fn, 'rb'):
            if bulk_extractor_reader.is_comment_line(line):
                continue
            return line[:-1]

    for fn in sorted(hfns):
        h = b.read_histogram(fn)
        firstline = get_firstline(fn)
        if isinstance(firstline, bytes):
            firstline = firstline.decode('utf-8')
        print(" {:>25} entries: {:>10,} (top: {})".format(fn, len(h), firstline))

    fnpart = ".".join(b.image_filename().split('/')[-1].split('.')[:-1])
    ffns = sorted(list(b.feature_files()))
    features = {}
    if ffns:
        print("")
        print("Feature Files: {}".format(len(ffns)))
        for fn in ffns:     # feature files
            lines = 0
            for line in b.open(fn, 'r'):
                if not bulk_extractor_reader.is_comment_line(line):
                    lines += 1
            features[fn] = lines
            print(" {:>25} features: {:>12,} {}".format(fn, lines, analyze_warning(fnpart, fn, lines)))

    # If there is a SQLite database, analyze that too!
    if args.featurefile and args.featuresql:
        import sqlite3
        conn = sqlite3.connect(os.path.join(outdir, "report.sqlite"))
        if conn:
            c = conn.cursor()
            c.execute("PRAGMA cache_size = 200000")
            print("Comparing SQLite3 database to feature files:")
            for fn in ffns:
                try:
                    table = "f_" + fn.lower().replace(".txt", "")
                    cmd = "select count(*) from " + table
                    print(cmd)
                    c.execute(cmd)
                    ct = c.fetchone()[0]
                    print("{}: {} {}".format(fn, features[fn], ct))
                    # Now check them all to make sure they all match
                    count = 0
                    for line in b.open(fn, 'r'):
                        ary = bulk_extractor_reader.parse_feature_line(line)
                        if ary:
                            (path, feature) = ary[0:2]
                            path = path.decode('utf-8')
                            feature = feature.decode('utf-8')
                            c.execute("select count(*) from " + table + " where path=? and feature_eutf8=?",
                                      (path, feature))
                            ct = c.fetchone()[0]
                            if ct == 1:
                                pass    # feature is present in the table
                            if ct == 0:
                                print("feature {} {} not in table {} ({})".format(path, feature, table, ct))
                            count += 1
                            if count > args.featuretest:
                                break
                except sqlite3.OperationalError as e:
                    print(e)
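# Sketch of the SQLite layout the comparison above assumes: one table per
# feature file, named "f_" + the file's basename, with at least 'path' and
# 'feature_eutf8' columns (names taken from the queries above; the actual
# report.sqlite schema may carry additional columns):
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE f_email (path TEXT, feature_eutf8 TEXT)")
conn.execute("INSERT INTO f_email VALUES ('12345', 'alice@example.com')")
ct = conn.execute("select count(*) from f_email where path=? and feature_eutf8=?",
                  ("12345", "alice@example.com")).fetchone()[0]
assert ct == 1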
def analyze_outdir(outdir):
    """Print statistics about an output directory"""
    print("Analyze {}".format(outdir))
    b = bulk_extractor_reader.BulkReport(outdir)
    print("bulk_extractor version: {}".format(b.version()))
    print("Filename: {}".format(b.imagefile()))

    # Determine if any pages were not analyzed
    proc = dict()
    for work_start in b.xmldoc.getElementsByTagName("debug:work_start"):
        threadid = work_start.getAttribute('threadid')
        pos0 = work_start.getAttribute('pos0')
        if pos0 in proc:
            print("*** error: pos0={} was started by threadid {} and threadid {}".format(pos0, proc[pos0], threadid))
        else:
            proc[pos0] = threadid
    for work_end in b.xmldoc.getElementsByTagName("debug:work_end"):
        threadid = work_end.getAttribute('threadid')
        pos0 = work_end.getAttribute('pos0')
        if pos0 not in proc:
            print("*** error: pos0={} was ended by threadid {} but never started!".format(pos0, threadid))
        elif threadid != proc[pos0]:
            print("*** error: pos0={} was started by threadid {} but ended by threadid {}".format(pos0, proc[pos0], threadid))
        else:
            del proc[pos0]
    for (pos0, threadid) in proc.items():
        print("*** error: pos0={} was started by threadid {} but never ended".format(pos0, threadid))

    # Print which scanners were run and how long they took
    scanner_times = []
    scanners = b.xmldoc.getElementsByTagName("scanner_times")[0]
    total = 0
    for path in scanners.getElementsByTagName("path"):
        name = path.getElementsByTagName("name")[0].firstChild.wholeText
        calls = int(path.getElementsByTagName("calls")[0].firstChild.wholeText)
        seconds = float(path.getElementsByTagName("seconds")[0].firstChild.wholeText)
        total += seconds
        scanner_times.append((name, calls, seconds))
    print("Scanner paths by time and calls")
    scanner_times.sort(key=lambda a: a[2], reverse=True)
    print(" {0:>25} {1:8} {2:12} {3:12} {4:5}".format("name", "calls", "sec", "sec/call", "% total"))
    for (name, calls, seconds) in scanner_times:
        print(" {:>25} {:8.0f} {:12.4f} {:12.4f} {:5.2f}%".format(
            name, calls, seconds, seconds / calls, 100.0 * seconds / total))

    hfns = list(b.histograms())
    print("")
    print("Histogram Files: {}".format(len(hfns)))

    def get_firstline(fn):
        """Returns the first line that is not a comment"""
        for line in b.open(fn, 'rb'):
            if bulk_extractor_reader.is_comment_line(line):
                continue
            return line[:-1]

    for fn in sorted(hfns):
        h = b.read_histogram(fn)
        firstline = get_firstline(fn)
        if isinstance(firstline, bytes):
            firstline = firstline.decode('utf-8')
        print(" {:>25} entries: {:>10,} (top: {})".format(fn, len(h), firstline))

    ffns = list(b.feature_files())
    print("")
    print("Feature Files: {}".format(len(ffns)))
    for fn in sorted(ffns):
        lines = 0
        for line in b.open(fn, 'rb'):
            if not bulk_extractor_reader.is_comment_line(line):
                lines += 1
        print(" {:>25} features: {:>10,}".format(fn, lines))
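# Self-contained sketch of the work_start/work_end pairing check above, run
# against a synthetic report.xml fragment.  Element and attribute names are
# taken from the code; the namespace URI and surrounding document structure
# are placeholders.
import xml.dom.minidom

demo = xml.dom.minidom.parseString(
    '<dfxml xmlns:debug="http://example.org/debug">'
    '<debug:work_start threadid="1" pos0="0"/>'
    '<debug:work_end threadid="1" pos0="0"/>'
    '</dfxml>')
starts = {e.getAttribute('pos0'): e.getAttribute('threadid')
          for e in demo.getElementsByTagName("debug:work_start")}
for e in demo.getElementsByTagName("debug:work_end"):
    assert starts.pop(e.getAttribute('pos0')) == e.getAttribute('threadid')
assert not starts   # every started page was ended by the same thread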