Example #1
 def ingest_feature_file(self,f,context_stop_list):
     """Read the lines in a feature file; returns how many lines were procesed"""
     drivename = None
     count = 0
     for line in f:
         if isinstance(line, bytes):
             line = line.decode('utf-8')
         m = bulk_extractor_reader.get_property_line(line)
         if m:
             if m[0]=='Filename':
                 drivename = m[1]
                 self.drives.add(drivename)
                 print("Scanning {} for {}".format(drivename,self.name))
         if bulk_extractor_reader.is_comment_line(line):
             continue
         count += 1
         if context_stop_list is not None:
             (offset,feature,context) = line.split("\t")
             context_stop_list.add((feature,context))
             continue
         feature = line.split("\t")[1]
         featuredict = self.features[feature]
         featuredict[drivename] = featuredict.get(drivename,0)+1
     print("   processed {} features".format(count))
     return count
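A minimal way to exercise this method (a sketch: the host class is reconstructed from the attributes the method touches, and the class name and file name are hypothetical):

import collections

class FeatureCollector:
    """Hypothetical host class inferred from the attributes used above."""
    def __init__(self, name):
        self.name = name
        self.drives = set()                            # drive names seen so far
        self.features = collections.defaultdict(dict)  # feature -> {drivename: count}

# attach the method shown above (assuming it is available at module scope)
FeatureCollector.ingest_feature_file = ingest_feature_file

collector = FeatureCollector("email")
with open("email.txt", "rb") as f:                     # hypothetical feature file
    collector.ingest_feature_file(f, None)             # pass a set() instead of None to build a stop list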
Example #3
def validate_file(f, kind):
    kml = "kml" in f.name
    linenumber = 0
    print("Validate UTF-8 encoding in ", f.name)
    for lineb in f:
        linenumber += 1
        lineb = lineb[:-1]  # remove the \n
        try:
            line = lineb.decode('utf-8')
        except UnicodeDecodeError as e:
            print("{}:{} {} {}".format(f.name, linenumber, str(e),
                                       asbinary(lineb)))
            continue
        if bulk_extractor_reader.is_comment_line(line):
            continue  # don't test comments
        if bulk_extractor_reader.is_histogram_line(line):
            continue  # don't test
        if kind == FEATURE_FILE:
            fields = line.split("\t")
            r = invalid_feature_file_line(line, fields)
            if r:
                print("{}: {:8} {} Invalid feature file line: {}".format(
                    f.name, linenumber, r, line))
            if kml and fields[1].count("kml") != 2:
                print("{}: {:8} Invalid KML line: {}".format(
                    f.name, linenumber, line))
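validate_file (like validate_openfile in Example #15) calls an asbinary() helper that is not shown on this page; a plausible stand-in, assuming it only renders undecodable bytes in a printable form, is:

def asbinary(data):
    """Render raw bytes as space-separated hex so undecodable lines print safely."""
    return " ".join("{:02X}".format(b) for b in data)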
def process_featurefile2(rundb,infile,outfile):
    """Returns features from infile, determines the file for each, writes results to outfile"""
    # Stats
    unallocated_count = 0
    feature_count = 0
    features_encoded = 0
    located_count = 0

    if args.terse:
        outfile.write(b"# Position\tFeature\tFilename\n")
    else:
        outfile.write(b"# Position\tFeature\tContext\tFilename\tFile MD5\n")
    t0 = time.time()
    for line in infile:
        if bulk_extractor_reader.is_comment_line(line):
            outfile.write(line)
            continue
        (offset,feature,context) = line[:-1].split(b'\t')
        feature_count += 1
        if b"-" in offset:
            ioffset = decode_path_offset(offset)
            features_encoded += 1
        else:
            ioffset = int(offset)
        tpl = rundb.search(ioffset)
        if tpl:
            located_count += 1
            fname = tpl[2].encode('utf-8') # THIS MIGHT GENERATE A UNICODE ERROR
            if tpl[3]:
                md5val = tpl[3].encode('utf-8')
            else:
                md5val = b""
        else:
            unallocated_count += 1
            fname = b""
            md5val = b""
        outfile.write(offset)
        outfile.write(b'\t')
        outfile.write(feature)
        if not args.terse:
            outfile.write(b'\t')
            outfile.write(context)
        outfile.write(b'\t')
        outfile.write(fname)
        if not args.terse:
            outfile.write(b'\t')
            outfile.write(md5val)
        outfile.write(b'\n')
    t1 = time.time()
    for (title,value) in [["# Total features input: {}",feature_count],
                          ["# Total features located to files: {}",located_count],
                          ["# Total features in unallocated space: {}",unallocated_count],
                          ["# Total features in encoded regions: {}",features_encoded],
                          ["# Total processing time: {:.2} seconds",t1-t0]]:
        outfile.write((title+"\n").format(value).encode('utf-8'))
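Both functions above assume the standard bulk_extractor feature-file layout: one record per line with three tab-separated fields (position, feature, context), and comment lines starting with '#'. The same parse as a standalone sketch (the library's own bulk_extractor_reader.parse_feature_line, used in Example #19, may behave differently):

def parse_feature_line_sketch(line):
    """Split one binary feature-file record into (position, feature, context)."""
    if line.startswith(b"#") or not line.strip():
        return None                                  # comment or blank line
    return tuple(line.rstrip(b"\n").split(b"\t"))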
Example #5
def bulk_extractor_ips(beoutdir):
    """
    A generator for the data lines in Bulk Extractor's ip.txt file (the IP addresses extracted from binary structures, not from text).
    """
    if not (os.path.isdir(beoutdir) and os.path.exists(os.path.join(beoutdir, "report.xml"))):
        raise ValueError("Bulk Extractor input is not a BE output directory\n\tParameter: %r." % beoutdir)
    #This BE feature file should not be opened in binary mode
    with open(os.path.join(beoutdir, "ip.txt"), "r") as ip_file:
        for line in ip_file:
            if bulk_extractor_reader.is_comment_line(line):
                continue
            line_parts = line[:-1].split("\t")
            if len(line_parts) != 3:
                raise ValueError("Bulk Extractor ip.txt file has line in unexpected format; halting to await code revisions.\n\tLine with characters escaped: %r." % line)
            yield tuple(line_parts)
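Since bulk_extractor_ips is a generator, consuming it is a plain loop over the yielded three-field tuples (a usage sketch; "be_output" is a hypothetical directory):

for offset, ip, context in bulk_extractor_ips("be_output"):
    print(offset, ip)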
def sample(outdir,fn):
    line_numbers = get_lines_array(report.open(fn,"r"))
    count = min(args.count,len(line_numbers))
    print("{} has {} lines".format(fn,len(line_numbers)))
    lines_to_sample = sorted(random.sample(line_numbers,count))
    line_number = 0
    with open(os.path.join(outdir,fn),"w") as out:
        out.write("# -*- mode:text; truncate-lines:t -*-\n")
        out.write("# Sampled {} out of {}\n".format(count,len(line_numbers)))
        out.write("# Place '=' or 'y' in front of correct classifications and '-' or 'x' in front of incorrect ones\n")
        with report.open(fn) as f:
            for line in f:
                line_number += 1
                if is_comment_line(line):
                    out.write(line.decode('utf-8'))
                if line_number in lines_to_sample:
                    out.write("{}:\t".format(line_number))
                    out.write(line.decode('utf-8'))
Example #7
 def ingest_histogram_file(self,f):
     drivename = None
     for line in f:
         if isinstance(line, bytes):
             line = line.decode('utf-8')
         m = bulk_extractor_reader.get_property_line(line)
         if m:
             if m[0]=='Filename':
                 drivename = m[1]
                 self.drives.add(drivename)
                 print("Scanning {} for {}".format(drivename,self.name))
             continue
         if bulk_extractor_reader.is_comment_line(line):
             continue
         fields = line.split("\t")
         count = int(fields[0][2:])
         feature = fields[1].strip()
         featuredict = self.features[feature]
         featuredict[drivename] = featuredict.get(drivename,0)+count
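ingest_histogram_file expects histogram lines of the form n=COUNT<TAB>FEATURE, which is why it strips the first two characters of the first field. The same parse as a standalone sketch:

def parse_histogram_line(line):
    """Parse an 'n=COUNT<TAB>FEATURE' histogram line into (feature, count)."""
    if line.startswith("#"):
        return None                 # comment line
    fields = line.split("\t")
    count = int(fields[0][2:])      # drop the leading 'n='
    return fields[1].strip(), count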
Example #10
def analyze_outdir(outdir):
    """Print statistics about an output directory"""
    print("Analyze {}".format(outdir))

    b = bulk_extractor_reader.BulkReport(outdir)
    print("bulk_extractor version: {}".format(b.version()))
    print("Filename:               {}".format(b.imagefile()))

    # Print which scanners were run and how long they took
    analyze_reportxml(b.xmldoc)

    hfns = list(b.histograms())
    print("")
    print("Histogram Files:        {}".format(len(hfns)))

    def get_firstline(fn):
        """Returns the first line that is not a comment"""
        for line in b.open(fn, 'rb'):
            if bulk_extractor_reader.is_comment_line(line):
                continue
            return line[:-1]

    for fn in sorted(hfns):
        h = b.read_histogram(fn)
        firstline = get_firstline(fn)
        if isinstance(firstline, bytes):
            firstline = firstline.decode('utf-8')
        print("  {:>25} entries: {:>10,}  (top: {})".format(
            fn, len(h), firstline))

    fnpart = ".".join(b.imagefile().split('/')[-1].split('.')[:-1])
    ffns = list(b.feature_files())
    print("")
    print("Feature Files:        {}".format(len(ffns)))
    for fn in sorted(ffns):
        lines = 0
        for line in b.open(fn, 'rb'):
            if not bulk_extractor_reader.is_comment_line(line):
                lines += 1
        print("  {:>25} features: {:>12,}  {}".format(
            fn, lines, analyze_warning(fnpart, fn, lines)))
Example #11
def analyze_outdir(outdir):
    """Print statistics about an output directory"""
    print("Analyze {}".format(outdir))

    b = bulk_extractor_reader.BulkReport(outdir)
    print("bulk_extractor version: {}".format(b.version()))
    print("Image filename:         {}".format(b.image_filename()))
    
    # Print which scanners were run and how long they took
    analyze_reportxml(b.xmldoc)
    
    hfns = list(b.histogram_files())
    print("")
    print("Histogram Files:        {}".format(len(hfns)))

    def get_firstline(fn):
        """Returns the first line that is not a comment"""
        for line in b.open(fn,'rb'):
            if bulk_extractor_reader.is_comment_line(line):
                continue
            return line[:-1]

    for fn in sorted(hfns):
        h = b.read_histogram(fn)
        firstline = get_firstline(fn)
        if isinstance(firstline, bytes):
            firstline = firstline.decode('utf-8')
        print("  {:>25} entries: {:>10,}  (top: {})".format(fn,len(h),firstline))

    fnpart = ".".join(b.image_filename().split('/')[-1].split('.')[:-1])
    ffns = list(b.feature_files())
    print("")
    print("Feature Files:        {}".format(len(ffns)))
    for fn in sorted(ffns):
        lines = 0
        for line in b.open(fn,'rb'):
            if not bulk_extractor_reader.is_comment_line(line):
                lines += 1
        print("  {:>25} features: {:>12,}  {}".format(fn,lines,analyze_warning(fnpart,fn,lines)))
def get_lines_array(f):
    """Returns an array of integers corresponding to each line in the feature file"""
    line_number = 0
    line_numbers = []
    if args.pattern:
        pattern = args.pattern.encode('utf-8')
    else:
        pattern = None
    if args.xpattern:
        xpattern = args.xpattern.encode('utf-8')
    else:
        xpattern = None
    for line in f:
        line_number += 1
        if pattern and not pattern in line.split(b"\t")[0]:
            continue
        if xpattern and xpattern in line.split(b"\t")[0]:
            continue
        if is_comment_line(line):
            continue
        line_numbers.append(line_number)
    return line_numbers
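get_lines_array reads the module-level args, so it only runs inside a script that has parsed the pattern options; a minimal harness (hedged: the flag names are inferred from the attribute names used above, and it assumes is_comment_line has been imported from bulk_extractor_reader):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--pattern", help="keep only lines whose first field contains this")
parser.add_argument("--xpattern", help="skip lines whose first field contains this")
args = parser.parse_args()

with open("email.txt", "rb") as f:      # hypothetical feature file, opened in binary mode
    print(get_lines_array(f))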
Example #15
def validate_openfile(f):
    fn = f.name
    if fn.endswith('.xml') or fn.endswith('.dmp') or fn.endswith("_tags.txt") or "wordlist" in fn:
        is_feature_file = False
        return
    else:
        is_feature_file = True

    # now read
    linenumber = 0
    print("Validate ",fn)
    for lineb in f:
        linenumber += 1
        lineb = lineb[:-1]
        try:
            line = lineb.decode('utf-8')
        except UnicodeDecodeError as e:
            print("{}:{} {} {}".format(fn,linenumber,str(e),asbinary(lineb)))
        if bulk_extractor_reader.is_comment_line(line):
            continue        # don't test
        if bulk_extractor_reader.is_histogram_line(line):
            continue        # don't test
        if is_feature_file and not valid_feature_file_line(line):
            print("{}: {:8} Invalid feature file line: {}".format(fn,linenumber,line))
def process_featurefile2(rundb,infile,outfile):
    """Returns features from infile, determines the file for each, writes results to outfile"""
    # Stats
    unallocated_count = 0
    feature_count = 0
    features_encoded = 0
    located_count = 0

    outfile.write(b"# Position\tFeature")
    if not args.terse:
        outfile.write(b"\tContext")
    outfile.write(b"\tFilename\tMD5")
    if args.mactimes:
        outfile.write(b"\tcrtime\tctime\tmtime\tatime")
    outfile.write(b"\n")
    outfile.write(b"# " + cmd_line() + b"\n")
    t0 = time.time()
    linenumber = 0
    for line in infile:
        linenumber += 1
        if bulk_extractor_reader.is_comment_line(line):
            outfile.write(line)
            continue
        try:
            (path,feature,context) = line[:-1].split(b'\t')
        except ValueError as e:
            print(e)
            print("Offending line {}:".format(linenumber),line[:-1])
            continue
        feature_count += 1

        # Increment counter if this feature was encoded
        if b"-" in path:
            features_encoded += 1
        
        # Search for feature in database
        tpl = rundb.search_path(path)

        # Output to annotated feature file
        outfile.write(path)
        outfile.write(b'\t')
        outfile.write(feature)
        if not args.terse:
            outfile.write(b'\t')
            outfile.write(context)

        # If we found the data, output that
        if tpl:
            located_count += 1
            outfile.write(b'\t')
            outfile.write(b'\t'.join(tpl[2])) # just the file info
        else:
            unallocated_count += 1
        outfile.write(b'\n')

        if args.debug:
            print("path=",path,"tpl=",tpl,"located_count=",located_count)

    t1 = time.time()
    for (title,value) in [["# Total features input: {}",feature_count],
                          ["# Total features located to files: {}",located_count],
                          ["# Total features in unallocated space: {}",unallocated_count],
                          ["# Total features in encoded regions: {}",features_encoded],
                          ["# Total processing time: {:.2} seconds",t1-t0]]:
        outfile.write((title+"\n").format(value).encode('utf-8'))
    return (feature_count,located_count)
def process_featurefile(args,report,featurefile):
    # Counters for the summary report
    global file_count
    features = featuredb()
    unallocated_count = 0
    feature_count = 0
    features_compressed = 0
    located_count = 0
    unicode_encode_errors = 0
    unicode_decode_errors = 0
    file_count = 0
    t0 = time.time()            # start the timer; t1 - t0 is reported in the summary below

    ofn = os.path.join(args.outdir,("annotated_" + featurefile ))
    if os.path.exists(ofn):
        raise RuntimeError(ofn+" exists")
    of = open(ofn,"wb")

    # First read the feature files
    print("Adding features from "+featurefile)
    try:
        linenumber = 0
        for line in report.open(featurefile,mode='rb'):
            # Read the file in binary and convert to unicode if possible
            linenumber += 1
            if bulk_extractor_reader.is_comment_line(line):
                continue
            try:
                fset = features.add_featurefile_line(line[0:-1])
                feature_count += 1
                if (b"ZIP" in fset[0]) or (b"HIBER" in fset[0]):
                    features_compressed += 1
                del fset
            except ValueError:
                raise RuntimeError("Line {} in feature file {} is invalid: {}".format(linenumber,featurefile,line))
    except IOError:
         print("Error: Failed to open feature file '%s'" % fn)
         exit(1)
    
    if args.debug:
        print('')
        features.print_debug()

    # feature2fi maps each feature to the file in which it was found
    feature2fi = {}

    ################################################################
    # If we got features in the featuredb, find out the file that each one came from
    # by scanning all of the files and, for each byte run, indicating the features
    # that are within the byte run
    if features.count()>0:
        global file_count
        def process(fi):
            global file_count
            file_count += 1
            if args.verbose or args.debug:
                print("%d %s (%d fragments)" % (file_count,fi.filename(),fi.fragments()))
            for run in fi.byte_runs():
                for (offset,fset) in features.search(run):
                    if args.debug:
                        print("  run={} offset={} fset={} ".format(run,offset,fset))
                    feature2fi[findex(fset)] = fi    # for each of those features, note that it is in this file
            if file_count%1000==0:
                print("Processed %d fileobjects in DFXML file" % file_count)

        xmlfile = None
        if args.xmlfile:
            xmlfile = args.xmlfile
        else:
            if args.imagefile:
                imagefile = args.imagefile
            else:
                imagefile = report.imagefile()
            # See if there is an xmlfile
            (root,ext) = os.path.splitext(imagefile)
            possible_xmlfile = root+".xml"
            if os.path.exists(possible_xmlfile):
                xmlfile = possible_xmlfile
        if xmlfile:
            print("Using XML file "+xmlfile)
            fiwalk.fiwalk_using_sax(xmlfile=open(xmlfile,'rb'),callback=process)
        else:
            print("Running fiwalk on " + imagefile)
            fiwalk.fiwalk_using_sax(imagefile=open(imagefile,'rb'),callback=process)
    else:
        print("No features found; copying feature file")
    ################################################################

    print("Generating output...")

    # Now print all of the features
    if args.terse:
        of.write(b"# Position\tFeature\tFilename\n")
    else:
        of.write(b"# Position\tFeature\tContext\tFilename\tFile MD5\n")
    for (offset,fset) in features:
        try:
            of.write(fset[0]) # pos
            of.write(b"\t")
            of.write(fset[1]) # feature
            of.write(b"\t")
            try:
                if not args.terse:
                    of.write(fset[2]) # context
            except IndexError:
                pass            # no context
            try:
                fi = feature2fi[findex(fset)]
                of.write(b"\t")
                if fi.filename(): of.write(fi.filename().encode('utf-8'))
                if args.debug:
                    print("pos=",offset,"feature=",fset[1],"fi=",fi,"fi.filename=",fi.filename())
                if not args.terse:
                    of.write(b"\t")
                    if fi.md5(): of.write(fi.md5().encode('utf-8'))
                located_count += 1
            except KeyError:
                unallocated_count += 1
                pass            # cannot locate
            of.write(b"\n")
        except UnicodeEncodeError:
            unicode_encode_errors += 1
            of.write(b"\n")
        except UnicodeDecodeError:
            unicode_decode_errors += 1
            of.write(b"\n")

    # stop the timer used to calculate the total run time
    t1 = time.time()

    # Summary report
    for (title,value) in [["# Total features input: {}",feature_count],
                          ["# Total features located to files: {}",located_count],
                          ["# Total features in unallocated space: {}",unallocated_count],
                          ["# Total features in compressed regions: {}",features_compressed],
                          ["# Unicode Encode Errors: {}",unicode_encode_errors],
                          ["# Unicode Decode Errors: {}",unicode_decode_errors],
                          ["# Total processing time: {:.2} seconds",t1-t0]]:
        of.write((title+"\n").format(value).encode('utf-8'))
Example #18
 def get_firstline(fn):
     """Returns the first line that is not a comment"""
     for line in b.open(fn,'rb'):
         if bulk_extractor_reader.is_comment_line(line):
             continue
         return line[:-1]
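Every example on this page treats a line as a comment when it is blank or starts with '#'; a minimal re-implementation consistent with that usage (an assumption: the real bulk_extractor_reader.is_comment_line may do more):

def is_comment_line(line):
    """True for blank lines and lines starting with '#', for bytes or str input."""
    prefix = b"#" if isinstance(line, bytes) else "#"
    return not line.strip() or line.lstrip().startswith(prefix)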
Example #19
def analyze_outdir(outdir):
    """Print statistics about an output directory"""
    print("Analyze {}".format(outdir))

    b = bulk_extractor_reader.BulkReport(outdir)
    print("bulk_extractor version: {}".format(b.version()))
    print("Image filename:         {}".format(b.image_filename()))
    
    # Print which scanners were run and how long they took
    analyze_reportxml(b.xmldoc)
    
    hfns = list(b.histogram_files()) # histogram files
    print("")
    print("Histogram Files:        {}".format(len(hfns)))

    def get_firstline(fn):
        """Returns the first line that is not a comment"""
        for line in b.open(fn,'rb'):
            if bulk_extractor_reader.is_comment_line(line):
                continue
            return line[:-1]

    for fn in sorted(hfns):
        h = b.read_histogram(fn)
        firstline = get_firstline(fn)
        if isinstance(firstline, bytes):
            firstline = firstline.decode('utf-8')
        print("  {:>25} entries: {:>10,}  (top: {})".format(fn,len(h),firstline))

    fnpart = ".".join(b.image_filename().split('/')[-1].split('.')[:-1])

    ffns = sorted(b.feature_files())   # hoisted out of the 'if' so the SQLite check below can use it
    if ffns:
        features = {}
        print("")
        print("Feature Files:        {}".format(len(ffns)))
        for fn in ffns:     # feature files
            lines = 0
            for line in b.open(fn,'r'):
                if not bulk_extractor_reader.is_comment_line(line):
                    lines += 1
                    features[fn] = lines
            print("  {:>25} features: {:>12,}  {}".format(fn,lines,analyze_warning(fnpart,fn,lines)))
                    
        # If there is a SQLite database, analyze that too!
    if args.featurefile and args.featuresql:
        import sqlite3
        conn = sqlite3.connect(os.path.join(outdir,"report.sqlite"))
        if conn:
            c = conn.cursor()
            c.execute("PRAGMA cache_size = 200000")
            print("Comparing SQLite3 database to feature files:")
            for fn in ffns:
                try:
                    table = "f_"+fn.lower().replace(".txt","")
                    cmd = "select count(*) from "+table
                    print(cmd)
                    c.execute(cmd)
                    ct = c.fetchone()[0]
                    print("{}:   {}  {}".format(fn,features[fn],ct))
                    # Now check them all to make sure that they all match
                    count = 0
                    for line in b.open(fn,'r'):
                        ary = bulk_extractor_reader.parse_feature_line(line)
                        if ary:
                            (path,feature) = ary[0:2]
                            path = path.decode('utf-8')
                            feature = feature.decode('utf-8')
                            c.execute("select count(*) from "+table+" where path=? and feature_eutf8=?",(path,feature))
                            ct = c.fetchone()[0]
                            if ct==1:
                                #print("feature {} {} in table {} ({})".format(path,feature,table,ct))
                                pass
                            if ct==0:
                                #pass
                                print("feature {} {} not in table {} ({})".format(path,feature,table,ct))
                            count += 1
                            if count>args.featuretest: 
                                break

                except sqlite3.OperationalError as e:
                    print(e)
Example #24
def analyze_outdir(outdir):
    """Print statistics about an output directory"""
    print("Analyze {}".format(outdir))

    b = bulk_extractor_reader.BulkReport(outdir)
    print("bulk_extractor version: {}".format(b.version()))
    print("Filename:               {}".format(b.imagefile()))
    
    # Determine if any pages were not analyzed
    proc = dict()
    for work_start in b.xmldoc.getElementsByTagName("debug:work_start"):
        threadid = work_start.getAttribute('threadid')
        pos0     = work_start.getAttribute('pos0')
        if pos0 in proc:
            print("*** error: pos0={} was started by threadid {} and threadid {}".format(pos0,proc[pos0],threadid))
        else:
            proc[pos0] = threadid
    for work_end in b.xmldoc.getElementsByTagName("debug:work_end"):
        threadid = work_end.getAttribute('threadid')
        pos0     = work_end.getAttribute('pos0')
        if pos0 not in proc:
            print("*** error: pos0={} was ended by threadid {} but never started!".format(pos0,threadid))
        elif threadid!=proc[pos0]:
            print("*** error: pos0={} was ended by threadid {} but ended by threadid {}".format(pos0,proc[pos0],threadid))
        else:
            del proc[pos0]
    
    for (pos0,threadid) in proc.items():
        print("*** error: pos0={} was started by threadid {} but never ended".format(pos0,threadid))
    
    # Print which scanners were run and how long they took
    scanner_times = []
    scanners = b.xmldoc.getElementsByTagName("scanner_times")[0]
    total = 0
    for path in scanners.getElementsByTagName("path"):
        name    = path.getElementsByTagName("name")[0].firstChild.wholeText
        calls   = int(path.getElementsByTagName("calls")[0].firstChild.wholeText)
        seconds = float(path.getElementsByTagName("seconds")[0].firstChild.wholeText)
        total   += seconds
        scanner_times.append((name,calls,seconds))
    print("Scanner paths by time and calls")
    scanner_times.sort(key=lambda a:a[2],reverse=True)

    print("  {0:>25}  {1:8}  {2:12}  {3:12}  {4:5}".format("name","calls","sec","sec/call","% total"))
    for (name,calls,seconds) in scanner_times:
        print("  {:>25}  {:8.0f}  {:12.4f}  {:12.4f}  {:5.2f}%".format(
                name,calls,seconds,seconds/calls,100.0*seconds/total))
    
    
    hfns = list(b.histograms())
    print("")
    print("Histogram Files:        {}".format(len(hfns)))

    def get_firstline(fn):
        """Returns the first line that is not a comment"""
        for line in b.open(fn,'rb'):
            if bulk_extractor_reader.is_comment_line(line):
                continue
            return line[:-1]

    for fn in sorted(hfns):
        h = b.read_histogram(fn)
        firstline = get_firstline(fn)
        if isinstance(firstline, bytes):
            firstline = firstline.decode('utf-8')
        print("  {:>25} entries: {:>10,}  (top: {})".format(fn,len(h),firstline))

    ffns = list(b.feature_files())
    print("")
    print("Feature Files:        {}".format(len(ffns)))
    for fn in sorted(ffns):
        lines = 0
        for line in b.open(fn,'rb'):
            if not bulk_extractor_reader.is_comment_line(line):
                lines += 1
        print("  {:>25} features: {:>10,}".format(fn,lines))