def analyze_reportxml(xmldoc):
    """Print work-unit consistency errors and scanner timings for a report.

    Two things are reported on standard output:

    1. Every ``debug:work_start`` element is matched against the
       ``debug:work_end`` elements by its ``pos0`` attribute.  A line
       starting with ``*** error`` is printed for each position that was
       started twice, ended without being started, ended by a different
       thread than the one that started it, or never ended.
    2. The ``path`` entries below the first ``scanner_times`` element are
       listed, sorted by descending seconds, with call count, seconds per
       call and share of the summed seconds.

    Args:
        xmldoc: A DOM document (or element) providing
            ``getElementsByTagName``, e.g. one parsed by
            ``xml.dom.minidom``.

    Returns:
        None.  All results are printed.
    """
    # Determine if any pages were not analyzed
    proc = dict()  # pos0 -> threadid that started work on it
    for work_start in xmldoc.getElementsByTagName("debug:work_start"):
        threadid = work_start.getAttribute('threadid')
        pos0 = work_start.getAttribute('pos0')
        if pos0 in proc:
            print(
                "*** error: pos0={} was started by threadid {} and threadid {}"
                .format(pos0, proc[pos0], threadid))
        else:
            proc[pos0] = threadid

    for work_end in xmldoc.getElementsByTagName("debug:work_end"):
        threadid = work_end.getAttribute('threadid')
        pos0 = work_end.getAttribute('pos0')
        if pos0 not in proc:
            print(
                "*** error: pos0={} was ended by threadid {} but never started!"
                .format(pos0, threadid))
        elif threadid != proc[pos0]:
            # proc[pos0] is the thread that *started* the work unit, so the
            # message must say "started by", not "ended by" twice.
            print(
                "*** error: pos0={} was started by threadid {} but ended by threadid {}"
                .format(pos0, proc[pos0], threadid))
        else:
            del proc[pos0]

    # Whatever is left was started (and not cleanly ended by its own thread).
    for (pos0, threadid) in proc.items():
        print("*** error: pos0={} was started by threadid {} but never ended".
              format(pos0, threadid))

    # Print which scanners were run and how long they took
    scanner_nodes = xmldoc.getElementsByTagName("scanner_times")
    if not scanner_nodes:
        # A report without timing data would otherwise raise IndexError and
        # hide the fact that the checks above completed.
        print("No scanner_times element found in report")
        return
    scanners = scanner_nodes[0]

    scanner_times = []
    total = 0
    for path in scanners.getElementsByTagName("path"):
        name = path.getElementsByTagName("name")[0].firstChild.wholeText
        calls = int(path.getElementsByTagName("calls")[0].firstChild.wholeText)
        seconds = float(
            path.getElementsByTagName("seconds")[0].firstChild.wholeText)
        total += seconds
        scanner_times.append((name, calls, seconds))

    print("Scanner paths by time and calls")
    scanner_times.sort(key=lambda a: a[2], reverse=True)
    print(" {0:>25} {1:8} {2:12} {3:12} {4:5}".format(
        "name", "calls", "sec", "sec/call", "% total"))
    for (name, calls, seconds) in scanner_times:
        # Guard both divisions: a path may report zero calls, and the summed
        # time may be zero when every path reports 0 seconds.
        per_call = seconds / calls if calls else 0.0
        percent = 100.0 * seconds / total if total else 0.0
        print(" {:>25} {:8.0f} {:12.4f} {:12.4f} {:5.2f}%".format(
            name, calls, seconds, per_call, percent))
def analyze_reportxml(xmldoc):
    """Report unmatched work units and per-scanner timing from a report DOM.

    First, ``debug:work_start`` and ``debug:work_end`` elements are paired
    by their ``pos0`` attribute and a ``*** error`` line is printed for
    each inconsistency (double start, end without start, end by a thread
    other than the starter, start without end).

    Then the ``path`` children of the first ``scanner_times`` element are
    printed as a table sorted by descending seconds, showing calls,
    seconds, seconds per call and percentage of the summed seconds.

    Args:
        xmldoc: DOM node exposing ``getElementsByTagName`` (for example an
            ``xml.dom.minidom`` document).

    Returns:
        None; output goes to standard output.
    """
    # Determine if any pages were not analyzed
    proc = dict()  # maps pos0 to the threadid that started it
    for work_start in xmldoc.getElementsByTagName("debug:work_start"):
        threadid = work_start.getAttribute('threadid')
        pos0 = work_start.getAttribute('pos0')
        if pos0 in proc:
            print("*** error: pos0={} was started by threadid {} and threadid {}"
                  .format(pos0, proc[pos0], threadid))
        else:
            proc[pos0] = threadid

    for work_end in xmldoc.getElementsByTagName("debug:work_end"):
        threadid = work_end.getAttribute('threadid')
        pos0 = work_end.getAttribute('pos0')
        if pos0 not in proc:
            print("*** error: pos0={} was ended by threadid {} but never started!"
                  .format(pos0, threadid))
        elif threadid != proc[pos0]:
            # First value printed is the starting thread, so the wording is
            # "started by ... but ended by ...".
            print("*** error: pos0={} was started by threadid {} but ended by threadid {}"
                  .format(pos0, proc[pos0], threadid))
        else:
            del proc[pos0]

    # Entries still present were never removed by a matching work_end.
    for (pos0, threadid) in proc.items():
        print("*** error: pos0={} was started by threadid {} but never ended"
              .format(pos0, threadid))

    # Print which scanners were run and how long they took
    found = xmldoc.getElementsByTagName("scanner_times")
    if len(found) == 0:
        # Avoid IndexError on reports that carry no timing section.
        print("No scanner_times element found in report")
        return

    scanner_times = []
    total = 0
    for path in found[0].getElementsByTagName("path"):
        name = path.getElementsByTagName("name")[0].firstChild.wholeText
        calls = int(path.getElementsByTagName("calls")[0].firstChild.wholeText)
        seconds = float(path.getElementsByTagName("seconds")[0].firstChild.wholeText)
        total += seconds
        scanner_times.append((name, calls, seconds))

    print("Scanner paths by time and calls")
    scanner_times.sort(key=lambda a: a[2], reverse=True)
    print(" {0:>25} {1:8} {2:12} {3:12} {4:5}".format(
        "name", "calls", "sec", "sec/call", "% total"))
    for (name, calls, seconds) in scanner_times:
        # A path with 0 calls, or a total of 0 seconds, would otherwise
        # raise ZeroDivisionError; show 0 for the derived columns instead.
        if calls:
            sec_per_call = seconds / calls
        else:
            sec_per_call = 0.0
        if total:
            share = 100.0 * seconds / total
        else:
            share = 0.0
        print(" {:>25} {:8.0f} {:12.4f} {:12.4f} {:5.2f}%".format(
            name, calls, seconds, sec_per_call, share))
def analyze_outdir(outdir):
    """Print statistics about an output directory.

    Opens ``outdir`` with ``bulk_extractor_reader.BulkReport`` and prints,
    in order: the bulk_extractor version and image filename, the
    work-unit checks and scanner timing table produced by
    ``analyze_reportxml`` for the report's XML document, one line per
    histogram file (entry count and first non-comment line), and one line
    per feature file (number of non-comment lines).

    Args:
        outdir: Value passed straight to ``bulk_extractor_reader.BulkReport``.

    Returns:
        None; everything is printed to standard output.
    """
    print("Analyze {}".format(outdir))

    b = bulk_extractor_reader.BulkReport(outdir)
    print("bulk_extractor version: {}".format(b.version()))
    print("Filename: {}".format(b.imagefile()))

    # The work_start/work_end checks and the scanner timing table used to be
    # repeated inline here; analyze_reportxml performs the same analysis on
    # an XML document, so delegate to it rather than keeping two copies.
    analyze_reportxml(b.xmldoc)

    def get_firstline(fn):
        """Returns the first line that is not a comment"""
        for line in b.open(fn, 'rb'):
            if bulk_extractor_reader.is_comment_line(line):
                continue
            # Drop the final character (presumably the newline).
            return line[:-1]

    hfns = list(b.histograms())
    print("")
    print("Histogram Files: {}".format(len(hfns)))
    for fn in sorted(hfns):
        h = b.read_histogram(fn)
        firstline = get_firstline(fn)
        # Decode only real byte strings.  The second test keeps the original
        # behaviour on interpreters where ``bytes`` is an alias of ``str``.
        if isinstance(firstline, bytes) and not isinstance(firstline, str):
            firstline = firstline.decode('utf-8')
        print(" {:>25} entries: {:>10,} (top: {})".format(fn, len(h), firstline))

    ffns = list(b.feature_files())
    print("")
    print("Feature Files: {}".format(len(ffns)))
    for fn in sorted(ffns):
        # Count every line that is not a comment.
        lines = sum(
            1 for line in b.open(fn, 'rb')
            if not bulk_extractor_reader.is_comment_line(line))
        print(" {:>25} features: {:>10,}".format(fn, lines))