Example #1
 def __init__(self, dname1, dname2, *, out, both=False, mode='text'):
     self.b1 = bulk_extractor_reader.BulkReport(dname1)
     self.b2 = bulk_extractor_reader.BulkReport(dname2)
     self.out = out
     self.both = both
     self.mode = mode
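     # only_features: the union of feature-file names present in either report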
     self.only_features = set()
     self.only_features.update(self.b1.feature_files())
     self.only_features.update(self.b2.feature_files())
Example #2
def datacheck_checkreport(outdir):
    """Reports on whether the output in outdir matches the datacheck report"""
    print("opening ",outdir)
    b = bulk_extractor_reader.BulkReport(outdir)
    found_features = {}
    print("Feature files:",list(b.feature_files()))
    print("Histogram files:",list(b.histogram_files()))
    for fn in b.feature_files():
        print("Reading feature file {}".format(fn))
        for (pos,feature,context) in b.read_features(fn):
            found_features[pos] = feature
    print("Now reading features from data_check.txt")
    not_found = {}
    report_mismatches = False 
    found_count = 0
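    # data_check.txt holds the expected features; check each one against what was found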
    for line in open("data_check.txt","rb"):
        y = bulk_extractor_reader.parse_feature_line(line)
        if y:
            (pos,feature,context) = y
            if pos in found_features:
                found_count += 1
                #print("{} found".format(pos.decode('utf-8')))
                if found_features[pos]!=feature and report_mismatches:
                    if found_features[pos]!=b'<CACHED>' and feature!=b'<CACHED>':
                        print("   {} != {}".format(feature,found_features[pos]))
            else:
                not_found[pos] = feature
    for pos in sorted(not_found):
        print("{} not found {}".format(pos,not_found[pos]))
    print("Total features found: {}".format(found_count))
    print("Total features not found: {}".format(len(not_found)))
Example #3
def make_zip(dname):
    archive_name = dname+".zip"
    b = bulk_extractor_reader.BulkReport(dname)
    print("Creating ZIP archive {}".format(archive_name))
    # Use a context manager so the archive is closed (and its central directory written) even on error
    with zipfile.ZipFile(archive_name,compression=zipfile.ZIP_DEFLATED,mode="w") as z:
        for fname in b.all_files:
            print("  adding {} ...".format(fname))
            z.write(os.path.join(dname,fname),arcname=os.path.basename(fname))
Example #4
def validate_report(fn):
    """Make sure all of the lines in all of the files in the report are UTF-8 and that
    the feature files have 3 or more fields on each line.
    """
    import os.path
    print("\nValidate Report: ", fn)
    if os.path.isdir(fn) or fn.endswith(".zip"):
        b = bulk_extractor_reader.BulkReport(fn)
        for ffn in b.feature_files():  # renamed from fn so the function argument is not shadowed
            if os.path.basename(ffn) in str(args.ignore):
                print("** ignore {} **".format(ffn))
                continue
            validate_file(b.open(ffn, 'rb'), FEATURE_FILE)
    else:
        validate_file(open(fn, 'rb'))
Example #5
def process(report, fsc):
    b1 = bulk_extractor_reader.BulkReport(report, do_validate=False)
    print("Reading email.txt")
    try:
        for line in b1.open("email.txt"):
            fsc.write(line)
    except KeyError:
        pass

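    # The histogram file may be absent from the report; treat a missing file as no emails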
    try:
        h = b1.read_histogram("email_histogram.txt")
        for a in h:
            all_emails.add(a)
    except KeyError:
        pass
    print("Processed {}; now {} unique emails".format(report, len(all_emails)))
Example #6
 def read_filemap():
     if args.xmlfile:
         rundb.read_xmlfile(args.xmlfile)
         if len(rundb) == 0:
             raise RuntimeError(
                 "\nERROR: No files detected in XML file {}\n".format(
                     args.xmlfile))
         return
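     # Prefer an explicitly given image filename; otherwise take it from the bulk_extractor report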
     if args.image_filename:
         imagefile = args.image_filename
     else:
         imagefile = bulk_extractor_reader.BulkReport(
             args.bulk_extractor_report).image_filename()
     rundb.read_imagefile(imagefile)
     if len(rundb) == 0:
         raise RuntimeError(
             "\nERROR: No files detected in image file {}\n".format(
                 imagefile))
Example #7
def analyze_outdir(outdir):
    """Print statistics about an output directory"""
    print("Analyze {}".format(outdir))

    b = bulk_extractor_reader.BulkReport(outdir)
    print("bulk_extractor version: {}".format(b.version()))
    print("Filename:               {}".format(b.imagefile()))

    # Print which scanners were run and how long they took
    analyze_reportxml(b.xmldoc)

    hfns = list(b.histograms())
    print("")
    print("Histogram Files:        {}".format(len(hfns)))

    def get_firstline(fn):
        """Returns the first line that is not a comment"""
        for line in b.open(fn, 'rb'):
            if bulk_extractor_reader.is_comment_line(line):
                continue
            return line[:-1]

    for fn in sorted(hfns):
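        # read_histogram returns a mapping of feature value to count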
        h = b.read_histogram(fn)
        firstline = get_firstline(fn)
        if isinstance(firstline, bytes):
            firstline = firstline.decode('utf-8')
        print("  {:>25} entries: {:>10,}  (top: {})".format(
            fn, len(h), firstline))

    fnpart = ".".join(b.imagefile().split('/')[-1].split('.')[:-1])
    ffns = list(b.feature_files())
    print("")
    print("Feature Files:        {}".format(len(ffns)))
    for fn in sorted(ffns):
        lines = 0
        for line in b.open(fn, 'rb'):
            if not bulk_extractor_reader.is_comment_line(line):
                lines += 1
        print("  {:>25} features: {:>12,}  {}".format(
            fn, lines, analyze_warning(fnpart, fn, lines)))
Example #8
def process(out,dname1,dname2):
    mode = 'text'
    if options.html: mode='html'

    b1 = bulk_extractor_reader.BulkReport(dname1)
    b2 = bulk_extractor_reader.BulkReport(dname2)

    t = ttable.ttable()
    t.append_data(['bulk_diff.py Version:',bulk_diff_version])
    t.append_data(['PRE Image:',b1.image_filename()])
    t.append_data(['POST Image:',b2.image_filename()])
    out.write(t.typeset(mode=mode))

    for i in [1,2]:
        if i==1:
            a, b = b1, b2
        else:
            a, b = b2, b1
        r = a.files.difference(b.files)
        if r:
            print("Files only in {}:".format(a.name))
            for f in r:
                if ".txt" in f:
                    print("     %s (%d lines)" % (f,a.count_lines(f)))
                else:
                    print("     %s" % (f))

    # Report interesting differences based on the histograms.
    # Output Example:
    """
# in PRE     # in POST      ∆      Feature
10           20            10      [email protected]
 8           17             9      [email protected]
11           16             5      [email protected]
"""
    b1_histograms = set(b1.histogram_files())
    b2_histograms = set(b2.histogram_files())
    common_histograms = b1_histograms.intersection(b2_histograms)
    
    if options.html:
        out.write("<ul>\n")
        for histogram_file in sorted(common_histograms):
            out.write("<li><a href='#%s'>%s</a></li>\n" % (histogram_file,histogram_file))
        out.write("</ul>\n<hr/>\n")

    for histogram_file in sorted(common_histograms):
        diffcount = 0
        if options.html:
            out.write('<h2><a name="%s">%s</a></h2>\n' % (histogram_file,histogram_file))
        else:
            out.write('\n')
        t = ttable.ttable()
        t.set_col_alignment(0,t.RIGHT)
        t.set_col_alignment(1,t.RIGHT)
        t.set_col_alignment(2,t.RIGHT)
        t.set_col_alignment(3,t.LEFT)
        t.set_title(histogram_file)
        t.append_head(['# in PRE','# in POST','∆','Value'])

        b1.hist = b1.read_histogram(histogram_file)
        b2.hist = b2.read_histogram(histogram_file)
        b1.keys = set(b1.hist.keys())
        b2.keys = set(b2.hist.keys())

        # Create the output, then we will sort on col 1, 2 and 4
        data = []
        for feature in b1.keys.union(b2.keys):
            v1 = b1.hist.get(feature,0)
            v2 = b2.hist.get(feature,0)
            if v1!=v2: diffcount += 1
            if v2>v1 or (v2==v1 and options.same) or (v2<v1 and options.smaller):
                data.append((v1, v2, v2-v1, feature.decode('utf-8')))

        # Sort according the diff first, then v2 amount, then v1 amount, then alphabetically on value
        def mysortkey(a):
            return (-a[2],a[3],a[1],a[0])

        if data:
            for row in sorted(data,key=mysortkey):
                t.append_data(row)
            out.write(t.typeset(mode=mode))
        if diffcount==0:
            out.write("{}: No differences\n".format(histogram_file))

    if options.features:
        for feature_file in b1.feature_files():
            if feature_file not in b2.feature_files():
                continue
            print("Compare features",feature_file)
            for p in [1,2]:
                if p==1:
                    a = b1; b = b2
                else:
                    a = b2; b = b1  # bug fix: original read "b = a", which compared b2 against itself
                a_features = {}
                for line in a.open(feature_file):
                    r = bulk_extractor_reader.parse_feature_line(line)
                    if not r: continue
                    a_features[r[0]] = r[1]
                for line in b.open(feature_file):
                    r = bulk_extractor_reader.parse_feature_line(line)
                    if not r: continue
                    if r[0] not in a_features:
                        print("{} {} is only in {}".format(r[0],r[1],a.name))
Example #9
def ingest(report_fn):
    import time
    c = conn.cursor()
    c.execute("select count(*) from drives where report_fn=?",(report_fn,))
    if c.fetchone()[0]>0 and args.reimport==False:
        print("{} already imported".format(report_fn))
        return

    try:
        br = bulk_extractor_reader.BulkReport(report_fn)
        image_filename = br.image_filename()
    except IndexError:
        print("No image filename in bulk_extractor report for {}; will not ingest".format(report_fn))
        return
    except (OSError, KeyError):
        print("Cannot open {}; will not ingest".format(report_fn))
        return

    if args.reimport==False:
        driveid = get_driveid(image_filename,create=False)
        if driveid:
            print("{} already imported".format(image_filename))
            return

    driveid = get_driveid(image_filename,report_fn,create=True)
    print("Ingesting {} as driveid {}".format(br.image_filename(),driveid))
    t0 = time.time()
    
    if args.reimport:
        # Make sure that this driveid is not in the feature tables
        c.execute("DELETE FROM feature_drive_counts where driveid=?",(driveid,))

    # initial version we are ingesting search terms, winpe executables, and email addresses
    for (search,count) in br.read_histogram_entries("url_searches.txt"):
        if search.startswith(b"cache:"): continue  
        featureid = get_featureid(search);
        c.execute("INSERT INTO feature_drive_counts (driveid,feature_type,featureid,count) values (?,?,?,?);",
                  (driveid,SEARCH_TYPE,featureid,count))
        
    # Add counts for email addresses
    for (email,count) in br.read_histogram_entries("email_histogram.txt"):
        #print("Add email {} = {}".format(email,count))
        featureid = get_featureid(email);
        c.execute("INSERT INTO feature_drive_counts (driveid,feature_type,featureid,count) values (?,?,?,?);",
                  (driveid,EMAIL_TYPE,featureid,count))

    # Add hashes for Windows executables
    import collections
    pe_header_counts = collections.Counter()
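    # Tally occurrences of each Windows executable hash before writing counts to the database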
    for r in br.read_features("winpe.txt"):
        try:
            (pos,feature,context) = r
            featureid = get_featureid(feature)
            pe_header_counts[featureid] += 1
        except ValueError:
            print("got {} values".format(len(r)))
    for (featureid,count) in pe_header_counts.items():
        c.execute("INSERT INTO feature_drive_counts (driveid,feature_type,featureid,count) values (?,?,?,?);",
                  (driveid,WINPE_TYPE,featureid,count))
    conn.commit()
    t1 = time.time()
    print("Driveid {} imported in {} seconds\n".format(driveid,t1-t0))
Example #10
def analyze_outdir(outdir):
    """Print statistics about an output directory"""
    print("Analyze {}".format(outdir))

    b = bulk_extractor_reader.BulkReport(outdir)
    print("bulk_extractor version: {}".format(b.version()))
    print("Image filename:         {}".format(b.image_filename()))
    
    # Print which scanners were run and how long they took
    analyze_reportxml(b.xmldoc)
    
    hfns = list(b.histogram_files()) # histogram files
    print("")
    print("Histogram Files:        {}".format(len(hfns)))

    def get_firstline(fn):
        """Returns the first line that is not a comment"""
        for line in b.open(fn,'rb'):
            if bulk_extractor_reader.is_comment_line(line):
                continue
            return line[:-1]

    for fn in sorted(hfns):
        h = b.read_histogram(fn)
        firstline = get_firstline(fn)
        if isinstance(firstline, bytes):
            firstline = firstline.decode('utf-8')
        print("  {:>25} entries: {:>10,}  (top: {})".format(fn,len(h),firstline))

    fnpart = ".".join(b.image_filename().split('/')[-1].split('.')[:-1])

    ffns = sorted(list(b.feature_files())) 
    if ffns:
        features = {}
        print("")
        print("Feature Files:        {}".format(len(ffns)))
        for fn in ffns:     # feature files
            lines = 0
            for line in b.open(fn,'r'):
                if not bulk_extractor_reader.is_comment_line(line):
                    lines += 1
            features[fn] = lines   # record the count even when it is zero, so the lookup below cannot fail
            print("  {:>25} features: {:>12,}  {}".format(fn,lines,analyze_warning(fnpart,fn,lines)))

    # If there is a SQLite database, analyze that too!
    if args.featurefile and args.featuresql:
        import sqlite3
        conn = sqlite3.connect(os.path.join(outdir,"report.sqlite"))
        if conn:
            c = conn.cursor()
            c.execute("PRAGMA cache_size = 200000")
            print("Comparing SQLite3 database to feature files:")
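            # Feature file "foo.txt" corresponds to SQLite table "f_foo"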
            for fn in ffns:
                try:
                    table = "f_"+fn.lower().replace(".txt","")
                    cmd = "select count(*) from "+table
                    print(cmd)
                    c.execute(cmd)
                    ct = c.fetchone()[0]
                    print("{}:   {}  {}".format(fn,features[fn],ct))
                    # Now check them all to make sure that they all match
                    count = 0
                    for line in b.open(fn,'r'):
                        ary = bulk_extractor_reader.parse_feature_line(line)
                        if ary:
                            (path,feature) = ary[0:2]
                            path = path.decode('utf-8')
                            feature = feature.decode('utf-8')
                            c.execute("select count(*) from "+table+" where path=? and feature_eutf8=?",(path,feature))
                            ct = c.fetchone()[0]
                            if ct==1:
                                #print("feature {} {} in table {} ({})".format(path,feature,table,ct))
                                pass
                            if ct==0:
                                #pass
                                print("feature {} {} not in table {} ({})".format(path,feature,table,ct))
                            count += 1
                            if count>args.featuretest: 
                                break

                except sqlite3.OperationalError as e:
                    print(e)
Example #11
 def process(self):
     ber = bulk_extractor_reader.BulkReport(self.fn,do_validate=False)
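     # do_validate=False opens the report without validating it first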
     for ff in ber.feature_files():
         if ff in ignored_features: continue
         print("Processing {} in {}".format(ff,self.fn))
         self.process_feature_file(ber,ff)
Example #12
def process(out, dname1, dname2):
    mode = 'text'
    if options.html: mode = 'html'

    b1 = bulk_extractor_reader.BulkReport(dname1)
    b2 = bulk_extractor_reader.BulkReport(dname2)

    t = ttable.ttable()
    t.append_data(['bulk_diff.py Version:', bulk_diff_version])
    t.append_data(['PRE Image:', b1.image_filename()])
    t.append_data(['POST Image:', b2.image_filename()])
    out.write(t.typeset(mode=mode))

    if b1.files.difference(b2.files):
        print("Files only in %s:\n   %s" %
              (b1.name, " ".join(b1.files.difference(b2.files))))
    if b2.files.difference(b1.files):
        print("Files only in %s:\n   %s" %
              (b2.name, " ".join(b2.files.difference(b1.files))))

    # Report interesting differences based on the histograms.
    # Output Example:
    """
# in PRE     # in POST      ∆      Feature
10           20            10      [email protected]
 8           17             9      [email protected]
11           16             5      [email protected]
"""
    common_files = b1.files.intersection(b2.files)
    # Materialize as a list: in Python 3, filter() returns a one-shot iterator that
    # would be exhausted by the first sorted() call below, leaving the second loop empty.
    histogram_files = list(filter(lambda a: "histogram" in a, common_files))

    if options.html:
        out.write("<ul>\n")
        for histogram_file in sorted(histogram_files):
            out.write("<li><a href='#%s'>%s</a></li>\n" %
                      (histogram_file, histogram_file))
        out.write("</ul>\n<hr/>\n")

    for histogram_file in sorted(histogram_files):
        diffcount = 0  # reset per histogram file so the "No differences" check below is per-file
        if options.html:
            out.write('<h2><a name="%s">%s</a></h2>\n' %
                      (histogram_file, histogram_file))
        else:
            out.write('\n')
        t = ttable.ttable()
        t.set_col_alignment(0, t.RIGHT)
        t.set_col_alignment(1, t.RIGHT)
        t.set_col_alignment(2, t.RIGHT)
        t.set_col_alignment(3, t.LEFT)
        t.set_title(histogram_file)
        t.append_head(['# in PRE', '# in POST', '∆', 'Value'])

        b1.hist = b1.read_histogram(histogram_file)
        b2.hist = b2.read_histogram(histogram_file)
        b1.keys = set(b1.hist.keys())
        b2.keys = set(b2.hist.keys())

        # Create the output, then we will sort on col 1, 2 and 4
        data = []
        for feature in b1.keys.union(b2.keys):
            v1 = b1.hist.get(feature, 0)
            v2 = b2.hist.get(feature, 0)
            if v1 != v2: diffcount += 1
            if v2 > v1 or (v2 == v1 and options.same) or (v2 < v1 and options.smaller):
                data.append((v1, v2, v2 - v1, feature))

        # Sort according the diff first, then v2 amount, then v1 amount, then alphabetically on value
        def mysortkey(a):
            return (-a[2], a[3], a[1], a[0])

        if data:
            for row in sorted(data, key=mysortkey):
                t.append_data(row)
            out.write(t.typeset(mode=mode))
        if diffcount == 0:
            out.write("{}: No differences\n\n".format(histogram_file))
Example #13

    if args.path:
        read_filemap()
        print("Locating {}: ".format(args.path))
        res = rundb.search_path(args.path.encode('utf-8'))
        if res:
            print("Start:     {}\nLength:    {}\nFile Name: {}\nFile MD5:  {}".
                  format(res[0], res[1], res[2], res[3]))
        else:
            print("NOT FOUND")
        exit(0)

    # Open the report
    try:
        report = bulk_extractor_reader.BulkReport(args.bulk_extractor_report)
    except UnicodeDecodeError:
        print("{}/report.xml file contains invalid XML. Cannot continue\n".
              format(args.bulk_extractor_report))
        exit(1)

    if args.list:
        print("Feature files in {}:".format(args.bulk_extractor_report))
        for fn in report.feature_files():
            print(fn)
        exit(1)

    # Make sure that the user has specified feature files
    if not args.featurefiles and not args.all:
        raise RuntimeError(
            "Please request a specific feature file or --all feature files")
Example #14
    # Create the correlators, one for each feature file
    correlators = set()
    for name in args.idfeatures.split(","):
        correlators.add(Correlator(name))
        
    # Create the br readers, one for each report
    br_readers  = set()
    for fname in args.reports:
        # On windows the '*' may not be expanded....
        if '*' in fname:
            fns = glob.glob(fname)
        else:
            fns = [fname]
        for fn in fns:
            try:
                br_readers.add(bulk_extractor_reader.BulkReport(fn))
            except IOError:
                print("{} is an invalid bulk_extractor report. Cannot continue. STOP.\n".format(fn))
                exit(1)

    # Now read each feature file from each reader
    # Either ingest (in the case of cda) or create the context stop list (if making combined)
    for c in correlators:
        context_stop_list = set()
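        # Contexts seen while ingesting; presumably used to suppress features whose
        # context recurs across reports when building the combined output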
        for br in br_readers:
            b = br.open(c.name+".txt",mode='r')
            if args.makecombined:
                count = c.ingest_feature_file(b,context_stop_list)
            else:
                count = c.ingest_feature_file(b,None)
        if args.makecombined: