Example #1
0
def analyze_reportxml(xmldoc):
    """Print work-unit consistency errors and a per-scanner timing table.

    Args:
        xmldoc: A DOM document exposing ``getElementsByTagName`` (for
            example one produced by ``xml.dom.minidom``). It is searched for
            ``debug:work_start`` / ``debug:work_end`` elements carrying
            ``threadid`` and ``pos0`` attributes, and for a ``scanner_times``
            element whose ``path`` children each hold ``name``, ``calls``
            and ``seconds`` elements.

    Returns:
        None. All results are written to standard output.
    """
    # Determine if any pages were not analyzed.
    # proc maps each started pos0 to the threadid that started it.
    proc = dict()
    for work_start in xmldoc.getElementsByTagName("debug:work_start"):
        threadid = work_start.getAttribute('threadid')
        pos0 = work_start.getAttribute('pos0')
        if pos0 in proc:
            print(
                "*** error: pos0={} was started by threadid {} and threadid {}"
                .format(pos0, proc[pos0], threadid))
        else:
            proc[pos0] = threadid
    for work_end in xmldoc.getElementsByTagName("debug:work_end"):
        threadid = work_end.getAttribute('threadid')
        pos0 = work_end.getAttribute('pos0')
        if pos0 not in proc:
            print(
                "*** error: pos0={} was ended by threadid {} but never started!"
                .format(pos0, threadid))
        elif threadid != proc[pos0]:
            # proc[pos0] is the thread that *started* the work unit. The
            # entry is left in proc, so this pos0 is also listed by the
            # "never ended" loop below.
            print(
                "*** error: pos0={} was started by threadid {} but ended by threadid {}"
                .format(pos0, proc[pos0], threadid))
        else:
            del proc[pos0]

    # Whatever is still in proc was started but has no matching end record.
    for (pos0, threadid) in proc.items():
        print("*** error: pos0={} was started by threadid {} but never ended".
              format(pos0, threadid))

    # Collect (name, calls, seconds) for every scanner path. A report without
    # a scanner_times element is treated as having no paths instead of
    # raising IndexError.
    scanner_times = []
    total = 0
    scanner_nodes = xmldoc.getElementsByTagName("scanner_times")
    paths = scanner_nodes[0].getElementsByTagName("path") if scanner_nodes else []
    for path in paths:
        name = path.getElementsByTagName("name")[0].firstChild.wholeText
        calls = int(path.getElementsByTagName("calls")[0].firstChild.wholeText)
        seconds = float(
            path.getElementsByTagName("seconds")[0].firstChild.wholeText)
        total += seconds
        scanner_times.append((name, calls, seconds))
    print("Scanner paths by time and calls")
    # Slowest scanners first.
    scanner_times.sort(key=lambda a: a[2], reverse=True)

    print("  {0:>25}  {1:8}  {2:12}  {3:12}  {4:5}".format(
        "name", "calls", "sec", "sec/call", "% total"))
    for (name, calls, seconds) in scanner_times:
        # Guard both ratios: a scanner may report zero calls, and the total
        # time may be zero when every scanner reports 0 seconds.
        per_call = seconds / calls if calls else 0.0
        percent = 100.0 * seconds / total if total else 0.0
        print("  {:>25}  {:8.0f}  {:12.4f}  {:12.4f}  {:5.2f}%".format(
            name, calls, seconds, per_call, percent))
Example #2
0
def analyze_reportxml(xmldoc):
    """Report unmatched work units and scanner timings found in `xmldoc`.

    `xmldoc` is a DOM document (anything providing getElementsByTagName).
    Every debug:work_start element is paired with a debug:work_end element
    by its pos0 attribute; duplicates, mismatched threads and unfinished
    work units are printed as "*** error" lines.  Afterwards the path
    entries below the first scanner_times element are printed as a table
    sorted by descending seconds.  Nothing is returned.
    """
    # Determine if any pages were not analyzed
    proc = dict()               # pos0 -> threadid that started it
    for work_start in xmldoc.getElementsByTagName("debug:work_start"):
        threadid = work_start.getAttribute('threadid')
        pos0     = work_start.getAttribute('pos0')
        if pos0 in proc:
            print("*** error: pos0={} was started by threadid {} and threadid {}".format(pos0,proc[pos0],threadid))
        else:
            proc[pos0] = threadid
    for work_end in xmldoc.getElementsByTagName("debug:work_end"):
        threadid = work_end.getAttribute('threadid')
        pos0     = work_end.getAttribute('pos0')
        if pos0 not in proc:
            print("*** error: pos0={} was ended by threadid {} but never started!".format(pos0,threadid))
        elif threadid!=proc[pos0]:
            # First value is the starting thread, second the ending thread.
            # The entry stays in proc and is therefore reported again below.
            print("*** error: pos0={} was started by threadid {} but ended by threadid {}".format(pos0,proc[pos0],threadid))
        else:
            del proc[pos0]

    # Entries left over have a start record but no end record
    for (pos0,threadid) in proc.items():
        print("*** error: pos0={} was started by threadid {} but never ended".format(pos0,threadid))

    scanner_times = []
    total = 0
    # Tolerate a document without a scanner_times element: no rows are printed
    scanner_nodes = xmldoc.getElementsByTagName("scanner_times")
    paths = scanner_nodes[0].getElementsByTagName("path") if scanner_nodes else []
    for path in paths:
        name    = path.getElementsByTagName("name")[0].firstChild.wholeText
        calls   = int(path.getElementsByTagName("calls")[0].firstChild.wholeText)
        seconds = float(path.getElementsByTagName("seconds")[0].firstChild.wholeText)
        total   += seconds
        scanner_times.append((name,calls,seconds))
    print("Scanner paths by time and calls")
    scanner_times.sort(key=lambda a:a[2],reverse=True)

    print("  {0:>25}  {1:8}  {2:12}  {3:12}  {4:5}".format("name","calls","sec","sec/call","% total"))
    for (name,calls,seconds) in scanner_times:
        # Avoid ZeroDivisionError for scanners with 0 calls or a 0 total time
        sec_per_call = seconds/calls if calls else 0.0
        pct_total    = 100.0*seconds/total if total else 0.0
        print("  {:>25}  {:8.0f}  {:12.4f}  {:12.4f}  {:5.2f}%".format(
                name,calls,seconds,sec_per_call,pct_total))
Example #3
0
def analyze_outdir(outdir):
    """Print statistics about an output directory.

    `outdir` is handed to bulk_extractor_reader.BulkReport.  The function
    prints, in order: the version and image file name reported by the
    BulkReport, consistency errors between the debug:work_start and
    debug:work_end records of its XML document, a table of scanner timings,
    the entry count and first non-comment line of every histogram file, and
    the number of non-comment lines in every feature file.

    Nothing is returned; all output goes to standard output.
    """
    print("Analyze {}".format(outdir))

    b = bulk_extractor_reader.BulkReport(outdir)
    print("bulk_extractor version: {}".format(b.version()))
    print("Filename:               {}".format(b.imagefile()))

    # Determine if any pages were not analyzed
    proc = dict()               # pos0 -> threadid that started it
    for work_start in b.xmldoc.getElementsByTagName("debug:work_start"):
        threadid = work_start.getAttribute('threadid')
        pos0     = work_start.getAttribute('pos0')
        if pos0 in proc:
            print("*** error: pos0={} was started by threadid {} and threadid {}".format(pos0,proc[pos0],threadid))
        else:
            proc[pos0] = threadid
    for work_end in b.xmldoc.getElementsByTagName("debug:work_end"):
        threadid = work_end.getAttribute('threadid')
        pos0     = work_end.getAttribute('pos0')
        if pos0 not in proc:
            print("*** error: pos0={} was ended by threadid {} but never started!".format(pos0,threadid))
        elif threadid!=proc[pos0]:
            # First value is the starting thread, second the ending thread.
            # The entry stays in proc and is therefore reported again below.
            print("*** error: pos0={} was started by threadid {} but ended by threadid {}".format(pos0,proc[pos0],threadid))
        else:
            del proc[pos0]

    # Entries left over have a start record but no end record
    for (pos0,threadid) in proc.items():
        print("*** error: pos0={} was started by threadid {} but never ended".format(pos0,threadid))

    # Print which scanners were run and how long they took
    scanner_times = []
    total = 0
    # Tolerate a report without a scanner_times element: no rows are printed
    scanner_nodes = b.xmldoc.getElementsByTagName("scanner_times")
    paths = scanner_nodes[0].getElementsByTagName("path") if scanner_nodes else []
    for path in paths:
        name    = path.getElementsByTagName("name")[0].firstChild.wholeText
        calls   = int(path.getElementsByTagName("calls")[0].firstChild.wholeText)
        seconds = float(path.getElementsByTagName("seconds")[0].firstChild.wholeText)
        total   += seconds
        scanner_times.append((name,calls,seconds))
    print("Scanner paths by time and calls")
    scanner_times.sort(key=lambda a:a[2],reverse=True)

    print("  {0:>25}  {1:8}  {2:12}  {3:12}  {4:5}".format("name","calls","sec","sec/call","% total"))
    for (name,calls,seconds) in scanner_times:
        # Avoid ZeroDivisionError for scanners with 0 calls or a 0 total time
        sec_per_call = seconds/calls if calls else 0.0
        pct_total    = 100.0*seconds/total if total else 0.0
        print("  {:>25}  {:8.0f}  {:12.4f}  {:12.4f}  {:5.2f}%".format(
                name,calls,seconds,sec_per_call,pct_total))


    hfns = list(b.histograms())
    print("")
    print("Histogram Files:        {}".format(len(hfns)))

    def get_firstline(fn):
        """Returns the first line that is not a comment, without its last
        character (the line terminator); None if every line is a comment."""
        for line in b.open(fn,'rb'):
            if bulk_extractor_reader.is_comment_line(line):
                continue
            return line[:-1]

    for fn in sorted(hfns):
        h = b.read_histogram(fn)
        firstline = get_firstline(fn)
        # The file is opened in binary mode; decode for display when the
        # reader actually handed back bytes.
        if isinstance(firstline, bytes):
            firstline = firstline.decode('utf-8')
        print("  {:>25} entries: {:>10,}  (top: {})".format(fn,len(h),firstline))

    ffns = list(b.feature_files())
    print("")
    print("Feature Files:        {}".format(len(ffns)))
    for fn in sorted(ffns):
        # Count every line that is not a comment
        lines = 0
        for line in b.open(fn,'rb'):
            if not bulk_extractor_reader.is_comment_line(line):
                lines += 1
        print("  {:>25} features: {:>10,}".format(fn,lines))