Example #1
 def compare_features(self):
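     """Compare the feature files common to self.b1 and self.b2: optionally report offsets found in both, then list features that appear in only one of the two reports."""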
     out = self.out
     for feature_file in self.b1.feature_files():
         if feature_file not in self.only_features:
             continue
         if feature_file not in self.b2.feature_files():
             continue
         print("Compare features",feature_file)
         if self.both:
             (a,b) = self.getab()
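              # collect the offsets seen in each report, skipping comment lines (35 == ord('#'))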
             a_offsets = set([bulk_extractor_reader.parse_feature_line(line)[0] for line in a.open(feature_file) if line[0]!=35])
             b_offsets = set([bulk_extractor_reader.parse_feature_line(line)[0] for line in b.open(feature_file) if line[0]!=35])
             common = a_offsets.intersection(b_offsets)
             for line in a.open(feature_file):
                 r = bulk_extractor_reader.parse_feature_line(line)
                 if r and r[0] in common:
                     print("{} {} IN BOTH".format(r[0].decode('utf-8'),r[1].decode('utf-8')), file=out)
         # differences
         for p in [1,2]:
             (a,b) = self.getab(p)
             a_features = {}
             for line in a.open(feature_file):
                 r = bulk_extractor_reader.parse_feature_line(line)
                 if not r: continue
                 a_features[r[0]] = r[1]
             for line in b.open(feature_file):
                 r = bulk_extractor_reader.parse_feature_line(line)
                 if not r: continue
                 if r[0] not in a_features:
                     print("{} {} is only in {}".format(r[0].decode('utf-8'),r[1].decode('utf-8'),b.name), file=out)
Example #2
import bulk_extractor_reader   # reader helpers that ship with bulk_extractor's python tools

def datacheck_checkreport(outdir):
    """Reports on whether the output in outdir matches the datacheck report"""
    print("opening ",outdir)
    b = bulk_extractor_reader.BulkReport(outdir)
    found_features = {}
    print("Feature files:",list(b.feature_files()))
    print("Histogram files:",list(b.histogram_files()))
    for fn in b.feature_files():
        print("Reading feature file {}".format(fn))
        for (pos,feature,context) in b.read_features(fn):
            found_features[pos] = feature
    print("Now reading features from data_check.txt")
    not_found = {}
    report_mismatches = False 
    found_count = 0
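    # parse each line of data_check.txt and check whether bulk_extractor reported the same offset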
    for line in open("data_check.txt","rb"):
        y = bulk_extractor_reader.parse_feature_line(line)
        if y:
            (pos,feature,context) = y
            if pos in found_features:
                found_count += 1
                #print("{} found".format(pos.decode('utf-8')))
                if found_features[pos]!=feature and report_mismatches:
                    if found_features[pos]!=b'<CACHED>' and feature!=b'<CACHED>':
                        print("   {} != {}".format(feature,found_features[pos]))
            else:
                not_found[pos] = feature
    for pos in sorted(not_found):
        print("{} not found {}".format(pos,not_found[pos]))
    print("Total features found: {}".format(found_count))
    print("Total features not found: {}".format(len(not_found)))
Example #3
import bulk_extractor_reader   # reader helpers that ship with bulk_extractor's python tools

def datacheckreport(outdir):
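    """Reports on whether the output in outdir matches the datacheck report"""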
    # Open the datadir
    print("opening ",outdir)
    b = bulk_extractor_reader.BulkReport(outdir)
    found_features = {}
    print("Feature files:",list(b.feature_files()))
    print("Histogram files:",list(b.histogram_files()))
    for fn in b.feature_files():
        print("Reading feature file {}".format(fn))
        for (pos,feature,context) in b.read_features(fn):
            found_features[pos] = feature
    print("Now reading features from data_check.txt")
    not_found = {}
    report_mismatches = False 
    found_count = 0
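    # parse each line of data_check.txt and check whether bulk_extractor reported the same offset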
    for line in open("data_check.txt","rb"):
        y = bulk_extractor_reader.parse_feature_line(line)
        if y:
            (pos,feature,context) = y
            if pos in found_features:
                found_count += 1
                #print("{} found".format(pos.decode('utf-8')))
                if found_features[pos]!=feature and report_mismatches:
                    if found_features[pos]!=b'<CACHED>' and feature!=b'<CACHED>':
                        print("   {} != {}".format(feature,found_features[pos]))
            else:
                not_found[pos] = feature
    for pos in sorted(not_found):
        print("{} not found {}".format(pos,not_found[pos]))
    print("Total features found: {}".format(found_count))
    print("Total features not found: {}".format(len(not_found)))
Example #4
def process(out,dname1,dname2):
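    """Compare two bulk_extractor output directories (PRE and POST) and write the report to out."""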
    mode = 'text'
    if options.html: mode='html'

    b1 = bulk_extractor_reader.BulkReport(dname1)
    b2 = bulk_extractor_reader.BulkReport(dname2)

    t = ttable.ttable()
    t.append_data(['bulk_diff.py Version:',bulk_diff_version])
    t.append_data(['PRE Image:',b1.image_filename()])
    t.append_data(['POST Image:',b2.image_filename()])
    out.write(t.typeset(mode=mode))

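    # Report files that appear in only one of the two output directories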
    for i in [1,2]:
        if i==1:
            a,b = b1,b2
        else:
            a,b = b2,b1
        r = a.files.difference(b.files)
        if r:
            print("Files only in {}:".format(a.name))
            for f in r:
                if ".txt" in f:
                    print("     %s (%d lines)" % (f,a.count_lines(f)))
                else:
                    print("     %s" % (f))

    # Report interesting differences based on the histograms.
    # Output Example:
    """
# in PRE     # in POST      ∆      Feature
10           20            10      [email protected]
 8           17             9      [email protected]
11           16             5      [email protected]
"""
    b1_histograms = set(b1.histogram_files())
    b2_histograms = set(b2.histogram_files())
    common_histograms = b1_histograms.intersection(b2_histograms)
    
    if options.html:
        out.write("<ul>\n")
        for histogram_file in sorted(common_histograms):
            out.write("<li><a href='#%s'>%s</a></li>\n" % (histogram_file,histogram_file))
        out.write("</ul>\n<hr/>\n")

    for histogram_file in sorted(common_histograms):
        diffcount = 0
        if options.html:
            out.write('<h2><a name="%s">%s</a></h2>\n' % (histogram_file,histogram_file))
        else:
            out.write('\n')
        t = ttable.ttable()
        t.set_col_alignment(0,t.RIGHT)
        t.set_col_alignment(1,t.RIGHT)
        t.set_col_alignment(2,t.RIGHT)
        t.set_col_alignment(3,t.LEFT)
        t.set_title(histogram_file)
        t.append_head(['# in PRE','# in POST','∆','Value'])

        b1.hist = b1.read_histogram(histogram_file)
        b2.hist = b2.read_histogram(histogram_file)
        b1.keys = set(b1.hist.keys())
        b2.keys = set(b2.hist.keys())

        # Create the output, then we will sort on col 1, 2 and 4
        data = []
        for feature in b1.keys.union(b2.keys):
            v1 = b1.hist.get(feature,0)
            v2 = b2.hist.get(feature,0)
            if v1!=v2: diffcount += 1
            if v2>v1 or (v2==v1 and options.same) or (v2<v1 and options.smaller):
                data.append((v1, v2, v2-v1, feature.decode('utf-8')))

        # Sort according the diff first, then v2 amount, then v1 amount, then alphabetically on value
        def mysortkey(a):
            return (-a[2],a[3],a[1],a[0])

        if data:
            for row in sorted(data,key=mysortkey):
                t.append_data(row)
            out.write(t.typeset(mode=mode))
        if diffcount==0:
            out.write("{}: No differences\n".format(histogram_file))

            
    if options.features:
        for feature_file in b1.feature_files():
            if feature_file not in b2.feature_files():
                continue
            print("Compare features",feature_file)
            for p in [1,2]:
                if p==1:
                    a = b1; b = b2
                else:
                    a = b2; b = b1
                a_features = {}
                for line in a.open(feature_file):
                    r = bulk_extractor_reader.parse_feature_line(line)
                    if not r: continue
                    a_features[r[0]] = r[1]
                for line in b.open(feature_file):
                    r = bulk_extractor_reader.parse_feature_line(line)
                    if not r: continue
                    if r[0] not in a_features:
                        print("{} {} is only in {}".format(r[0].decode('utf-8'),r[1].decode('utf-8'),b.name))
Example #5
def analyze_outdir(outdir):
    """Print statistics about an output directory"""
    print("Analyze {}".format(outdir))

    b = bulk_extractor_reader.BulkReport(outdir)
    print("bulk_extractor version: {}".format(b.version()))
    print("Image filename:         {}".format(b.image_filename()))
    
    # Print which scanners were run and how long they took
    analyze_reportxml(b.xmldoc)
    
    hfns = list(b.histogram_files()) # histogram files
    print("")
    print("Histogram Files:        {}".format(len(hfns)))

    def get_firstline(fn):
        """Returns the first line that is not a comment"""
        for line in b.open(fn,'rb'):
            if bulk_extractor_reader.is_comment_line(line):
                continue
            return line[:-1]

    for fn in sorted(hfns):
        h = b.read_histogram(fn)
        firstline = get_firstline(fn)
        if isinstance(firstline,bytes):
            firstline = firstline.decode('utf-8')
        print("  {:>25} entries: {:>10,}  (top: {})".format(fn,len(h),firstline))

    fnpart = ".".join(b.image_filename().split('/')[-1].split('.')[:-1])   # image basename without its extension

    ffns = sorted(list(b.feature_files()))
    if ffns:
        features = {}
        print("")
        print("Feature Files:        {}".format(len(ffns)))
        for fn in ffns:     # feature files
            lines = 0
            for line in b.open(fn,'r'):
                if not bulk_extractor_reader.is_comment_line(line):
                    lines += 1
            features[fn] = lines    # record the count once per file, even when it is zero
            print("  {:>25} features: {:>12,}  {}".format(fn,lines,analyze_warning(fnpart,fn,lines)))
                    
    # If there is a SQLite database, analyze that too!
    if args.featurefile and args.featuresql:
        import sqlite3
        conn = sqlite3.connect(os.path.join(outdir,"report.sqlite"))
        if conn:
            c = conn.cursor()
            c.execute("PRAGMA cache_size = 200000")
            print("Comparing SQLite3 database to feature files:")
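            # for each feature file, compare its line count to the row count of the matching f_* table, then spot-check individual features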
            for fn in ffns:
                try:
                    table = "f_"+fn.lower().replace(".txt","")
                    cmd = "select count(*) from "+table
                    print(cmd)
                    c.execute(cmd)
                    ct = c.fetchone()[0]
                    print("{}:   {}  {}".format(fn,features[fn],ct))
                    # Now check them all to make sure that they all match
                    count = 0
                    for line in b.open(fn,'r'):
                        ary = bulk_extractor_reader.parse_feature_line(line)
                        if ary:
                            (path,feature) = ary[0:2]
                            path = path.decode('utf-8')
                            feature = feature.decode('utf-8')
                            c.execute("select count(*) from "+table+" where path=? and feature_eutf8=?",(path,feature))
                            ct = c.fetchone()[0]
                            if ct==1:
                                #print("feature {} {} in table {} ({})".format(path,feature,table,ct))
                                pass
                            if ct==0:
                                #pass
                                print("feature {} {} not in table {} ({})".format(path,feature,table,ct))
                            count += 1
                            if count>args.featuretest: 
                                break

                except sqlite3.OperationalError as e:
                    print(e)
Example #6
def analyze_outdir(outdir):
    """Print statistics about an output directory"""
    print("Analyze {}".format(outdir))

    b = bulk_extractor_reader.BulkReport(outdir)
    print("bulk_extractor version: {}".format(b.version()))
    print("Image filename:         {}".format(b.image_filename()))
    
    # Print which scanners were run and how long they took
    analyze_reportxml(b.xmldoc)
    
    hfns = list(b.histogram_files()) # histogram files
    print("")
    print("Histogram Files:        {}".format(len(hfns)))

    def get_firstline(fn):
        """Returns the first line that is not a comment"""
        for line in b.open(fn,'rb'):
            if bulk_extractor_reader.is_comment_line(line):
                continue
            return line[:-1]

    for fn in sorted(hfns):
        h = b.read_histogram(fn)
        firstline = get_firstline(fn)
        if isinstance(firstline,bytes):
            firstline = firstline.decode('utf-8')
        print("  {:>25} entries: {:>10,}  (top: {})".format(fn,len(h),firstline))

    fnpart = ".".join(b.image_filename().split('/')[-1].split('.')[:-1])   # image basename without its extension

    ffns = sorted(list(b.feature_files())) 
    if ffns:
        features = {}
        print("")
        print("Feature Files:        {}".format(len(ffns)))
        for fn in ffns:     # feature files
            lines = 0
            for line in b.open(fn,'r'):
                if not bulk_extractor_reader.is_comment_line(line):
                    lines += 1
            features[fn] = lines    # record the count once per file, even when it is zero
            print("  {:>25} features: {:>12,}  {}".format(fn,lines,analyze_warning(fnpart,fn,lines)))
                    
    # If there is a SQLite database, analyze that too!
    if args.featurefile and args.featuresql:
        import sqlite3
        conn = sqlite3.connect(os.path.join(outdir,"report.sqlite"))
        if conn:
            c = conn.cursor()
            c.execute("PRAGMA cache_size = 200000")
            print("Comparing SQLite3 database to feature files:")
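            # for each feature file, compare its line count to the row count of the matching f_* table, then spot-check individual features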
            for fn in ffns:
                try:
                    table = "f_"+fn.lower().replace(".txt","")
                    cmd = "select count(*) from "+table
                    print(cmd)
                    c.execute(cmd)
                    ct = c.fetchone()[0]
                    print("{}:   {}  {}".format(fn,features[fn],ct))
                    # Now check them all to make sure that they all match
                    count = 0
                    for line in b.open(fn,'r'):
                        ary = bulk_extractor_reader.parse_feature_line(line)
                        if ary:
                            (path,feature) = ary[0:2]
                            path = path.decode('utf-8')
                            feature = feature.decode('utf-8')
                            c.execute("select count(*) from "+table+" where path=? and feature_eutf8=?",(path,feature))
                            ct = c.fetchone()[0]
                            if ct==1:
                                #print("feature {} {} in table {} ({})".format(path,feature,table,ct))
                                pass
                            if ct==0:
                                #pass
                                print("feature {} {} not in table {} ({})".format(path,feature,table,ct))
                            count += 1
                            if count>args.featuretest: 
                                break

                except sqlite3.OperationalError as e:
                    print(e)