Code example #1
    def compare_histograms(self):
        b1_histograms = set(self.b1.histogram_files())
        b2_histograms = set(self.b2.histogram_files())
        common_histograms = b1_histograms.intersection(b2_histograms)
        out = self.out

        if self.mode=='html':
            out.write("<ul>\n")
            for histogram_file in sorted(common_histograms):
                out.write("<li><a href='#%s'>%s</a></li>\n" % (histogram_file,histogram_file))
            out.write("</ul>\n<hr/>\n")

        for histogram_file in sorted(common_histograms):
            diffcount = 0
            if args.html:
                out.write('<h2><a name="%s">%s</a></h2>\n' % (histogram_file,histogram_file))
            t = ttable.ttable()
            t.set_col_alignment(0,t.RIGHT)
            t.set_col_alignment(1,t.RIGHT)
            t.set_col_alignment(2,t.RIGHT)
            t.set_col_alignment(3,t.LEFT)
            t.set_title(histogram_file)
            t.append_head(['# in PRE','# in POST','∆','Value'])

            (b1,b2) = self.getab()

            b1.hist = b1.read_histogram(histogram_file)
            b2.hist = b2.read_histogram(histogram_file)
            b1.keys = set(b1.hist.keys())
            b2.keys = set(b2.hist.keys())

            # Collect the rows first; they are sorted below with mysortkey
            data = []
            for feature in b1.keys.union(b2.keys):
                v1 = b1.hist.get(feature,0)
                v2 = b2.hist.get(feature,0)
                if v1!=v2: diffcount += 1
                if v2>v1 or (v2==v1 and args.same) or (v2<v1 and args.smaller):
                    data.append((v1, v2, v2-v1, feature.decode('utf-8')))

            # Sort by the diff first (largest increase first), then alphabetically on value, then by the POST and PRE counts
            def mysortkey(a):
                return (-a[2],a[3],a[1],a[0])

            if data:
                for row in sorted(data,key=mysortkey):
                    t.append_data(row)
                out.write(t.typeset(mode=self.mode))
            if diffcount==0 and args.both:
                out.write("{}: No differences\n".format(histogram_file))
Code example #2
    def summary(self):
        (a,b) = self.getab()
        t = ttable.ttable()
        t.append_head(['file',a.name,b.name])
        all_files = a.files.union(b.files)
        for fn in sorted(all_files):
            if fn in a.files:
                adata = len(list(a.read_features(fn)))
            else:
                adata = 'n/a'

            if fn in b.files:
                bdata = len(list(b.read_features(fn)))
            else:
                bdata = 'n/a'
            if adata != bdata:
                t.append_data([fn, adata, bdata])
        self.out.write(t.typeset(mode=self.mode))
Code example #3
File: sceadan_train.py Project: WeiliangLuo/sceadan
def print_data():
    print("Data directory:", datadir())
    t = ttable()
    t.append_head(["FTYPE", "Files", "min blks", "max blks", "avg blks", "total"])
    t.set_col_alignment(1, t.RIGHT)
    t.set_col_alignment(2, t.RIGHT)
    t.set_col_alignment(3, t.RIGHT)
    t.set_col_alignment(4, t.RIGHT)
    t.set_col_alignment(5, t.RIGHT)
    blocks_per_type = {}
    for ftype in filetypes():
        if sceadan_type_for_name(ftype) == -1:
            raise RuntimeError("file type {} is invalid".format(ftype))

        blocks = [os.path.getsize(fn) // args.train_blocksize for fn in ftype_files(ftype)]
        blocks = list(filter(lambda v: v > 0, blocks))
        t.append_data((ftype, len(blocks), min(blocks), max(blocks), sum(blocks) / len(blocks), sum(blocks)))
        blocks_per_type[ftype] = sum(blocks)
    print(t.typeset(mode="text"))
Code example #4
File: sceadan_train.py Project: Jude-Sancti/sceadan
def print_data():
    print("Data directory:",datadir())
    t = ttable()
    t.append_head(["FTYPE","Files","min blks","max blks","avg blks","total"])
    t.set_col_alignment(1,t.RIGHT)
    t.set_col_alignment(2,t.RIGHT)
    t.set_col_alignment(3,t.RIGHT)
    t.set_col_alignment(4,t.RIGHT)
    t.set_col_alignment(5,t.RIGHT)
    blocks_per_type = {}
    for ftype in filetypes():
        if sceadan_type_for_name(ftype) == -1:
            raise RuntimeError("file type {} is invalid".format(ftype))

        blocks = [os.path.getsize(fn)//args.train_blocksize for fn in ftype_files(ftype)]
        blocks = list(filter(lambda v:v>0,blocks))
        t.append_data((ftype,len(blocks),min(blocks),max(blocks),sum(blocks)/len(blocks),sum(blocks)))
        blocks_per_type[ftype] = sum(blocks)
    print(t.typeset(mode='text'))
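
Both copies of print_data above derive per-type block counts the same way: integer-divide each training file's size by the training block size, then drop files smaller than one block so they cannot distort the minimum. A standalone illustration (the block size and byte counts here are made up):

train_blocksize = 4096                            # assumed block size
sizes = [10240, 512, 8192]                        # hypothetical file sizes in bytes
blocks = [s // train_blocksize for s in sizes]    # [2, 0, 2]
blocks = [b for b in blocks if b > 0]             # [2, 2]; the 512-byte file is dropped
print(len(blocks), min(blocks), max(blocks), sum(blocks)/len(blocks), sum(blocks))
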
Code example #5
    def compare_files(self):
        out = self.out
        if self.both:
            t = ttable.ttable()
            t.append_data(['bulk_diff.py Version:',__version__])
            t.append_data(['PRE Image:',self.b1.image_filename()])
            t.append_data(['POST Image:',self.b2.image_filename()])
            out.write(t.typeset(mode=self.mode))

        for i in [1,2]:
            (a,b) = self.getab(i)
            r = a.files.difference(b.files)
            total_diff  = sum([a.count_lines(f) for f in r if ".txt" in f])
            total_other = sum(1 for f in r if ".txt" not in f)
            if total_diff>0 or total_other>0 or args.both:
                print("Files only in {}:".format(a.name), file=out)
                for f in r:
                    if ".txt" in f:
                        print("     %s (%d lines)" % (f,a.count_lines(f)), file=out)
                    else:
                        print("     %s" % (f), file=out)
Code example #6
File: sceadan_train.py Project: Jude-Sancti/sceadan
def compare():
    comp = shelve.open(compname("experiment"),writeback=False)
    if 'RESULTS' not in comp:
        print("Compare fail: %s did not store its scores in the database" %(args.compare))
        return
    print("\nAccuracy comparison between %s and %s" %(args.exp, args.compare))
    t = ttable()
    t.append_head(['type',args.compare,args.exp,' % change'])
    t.set_col_alignment(1,t.CENTER)
    t.set_col_alignment(2,t.CENTER)
    t.set_col_alignment(3,t.CENTER)

    expScore    = {}
    compScore   = {}

    for key, value in db['RESULTS'].items():
        temp = value.copy()
        expTally = temp['TALLY']
        count = sum(expTally.values())
        expScore[key] = int(expTally.get(key,0)*100/count)
      
    for key, value in comp['RESULTS'].items():
        temp = value.copy()
        compTally = temp['TALLY']
        count = sum(compTally.values())
        compScore[key] = int(compTally.get(key,0)*100/count)
    
    for key, value in sorted(expScore.items()):
        if key in compScore:
            data = [key, compScore[key], expScore[key]] + ["{:3.2f}%".format((expScore[key]-compScore[key])/float(compScore[key])*100)]
        else:
            data = [key, 'N/A', expScore[key], 'N/A']

        t.append_data(data)

    print(t.typeset(mode='text'))
Code example #7
                 1.05)  # make it a little bigger than needed
        plt.ylim(-5, 105)
        plt.legend(loc='lower center', fontsize=9)
        plt.xticks(rotation=20)
        plt.axes().get_xaxis().set_major_formatter(
            mp.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))
        plt.savefig(fname)


if __name__ == "__main__":
    #
    # Write the data to a LaTeX file
    #

    topcode = 100000
    tab = ttable()
    tab.append_head(['parental income', 'grade'])
    tab.append_data(ttable.HR)
    for row in zip(income, grades):
        tab.append_data(row)
    tab.save_table("toy_regression_data.tex", mode='latex')

    # First show the regression with the real data

    e = Stats(income, grades)
    e.plot_data()
    e.regress()
    e.savefig("toy_regression.pdf")
    true_fit = copy.deepcopy(e.fit)

    # Now show the 4 regressions with a point that explores the state space
Code example #8
def process(out,dname1,dname2):
    mode = 'text'
    if options.html: mode='html'

    b1 = bulk_extractor_reader.BulkReport(dname1)
    b2 = bulk_extractor_reader.BulkReport(dname2)

    t = ttable.ttable()
    t.append_data(['bulk_diff.py Version:',bulk_diff_version])
    t.append_data(['PRE Image:',b1.image_filename()])
    t.append_data(['POST Image:',b2.image_filename()])
    out.write(t.typeset(mode=mode))

    for i in [1,2]:
        if i==1:
            (a,b) = (b1,b2)
        else:
            (a,b) = (b2,b1)
        r = a.files.difference(b.files)
        if r:
            print("Files only in {}:".format(a.name))
            for f in r:
                if ".txt" in f:
                    print("     %s (%d lines)" % (f,a.count_lines(f)))
                else:
                    print("     %s" % (f))

    # Report interesting differences based on the histograms.
    # Output Example:
    """
# in PRE     # in POST      ∆      Feature
10           20            10      [email protected]
 8           17             9      [email protected]
11           16             5      [email protected]
"""
    b1_histograms = set(b1.histogram_files())
    b2_histograms = set(b2.histogram_files())
    common_histograms = b1_histograms.intersection(b2_histograms)
    
    if options.html:
        out.write("<ul>\n")
        for histogram_file in sorted(common_histograms):
            out.write("<li><a href='#%s'>%s</a></li>\n" % (histogram_file,histogram_file))
        out.write("</ul>\n<hr/>\n")

    for histogram_file in sorted(common_histograms):
        diffcount = 0
        if options.html:
            out.write('<h2><a name="%s">%s</a></h2>\n' % (histogram_file,histogram_file))
        else:
            out.write('\n')
        t = ttable.ttable()
        t.set_col_alignment(0,t.RIGHT)
        t.set_col_alignment(1,t.RIGHT)
        t.set_col_alignment(2,t.RIGHT)
        t.set_col_alignment(3,t.LEFT)
        t.set_title(histogram_file)
        t.append_head(['# in PRE','# in POST','∆','Value'])

        b1.hist = b1.read_histogram(histogram_file)
        b2.hist = b2.read_histogram(histogram_file)
        b1.keys = set(b1.hist.keys())
        b2.keys = set(b2.hist.keys())

        # Collect the rows first; they are sorted below with mysortkey
        data = []
        for feature in b1.keys.union(b2.keys):
            v1 = b1.hist.get(feature,0)
            v2 = b2.hist.get(feature,0)
            if v1!=v2: diffcount += 1
            if v2>v1 or (v2==v1 and options.same) or (v2<v1 and options.smaller):
                data.append((v1, v2, v2-v1, feature.decode('utf-8')))

        # Sort by the diff first (largest increase first), then alphabetically on value, then by the POST and PRE counts
        def mysortkey(a):
            return (-a[2],a[3],a[1],a[0])

        if data:
            for row in sorted(data,key=mysortkey):
                t.append_data(row)
            out.write(t.typeset(mode=mode))
        if diffcount==0:
            out.write("{}: No differences\n".format(histogram_file))

            
    if options.features:
        for feature_file in b1.feature_files():
            if feature_file not in b2.feature_files():
                continue
            print("Compare features",feature_file)
            for p in [1,2]:
                if p==1:
                    (a,b) = (b1,b2)
                else:
                    (a,b) = (b2,b1)
                a_features = {}
                for line in a.open(feature_file):
                    r = bulk_extractor_reader.parse_feature_line(line)
                    if not r: continue
                    a_features[r[0]] = r[1]
                for line in b.open(feature_file):
                    r = bulk_extractor_reader.parse_feature_line(line)
                    if not r: continue
                    if r[0] not in a_features:
                        print("{} {} is only in {}".format(r[0],r[1],a.name))
Code example #9
            files += glob.glob(fn)
        else:
            files += [fn]

    drive_encoding_counts = {}
    for fn in files:
        print("")
        d = Drive(fn)
        d.process()
        for ff in d.f_encoding_counts:
            if ff not in drive_encoding_counts: drive_encoding_counts[ff] = defaultdict(statbag)
            for encoding in d.f_encoding_counts[ff]:
                drive_encoding_counts[ff][encoding].addx(d.f_encoding_counts[ff][encoding])
                
    # Now that the data have been collected, typeset the big table
    t = ttable.ttable()
    t.latex_colspec = "lrrrrr"
    t.append_head(('',               'Drives with','Feature', 'avg',      'max',      ''))
    t.append_head(('Feature / Coding','coding' ,'Count','per drive','per drive','$\\sigma$'))
    t.set_col_alignment(1,t.LEFT)
    t.set_col_alignment(2,t.RIGHT)
    t.set_col_alignment(3,t.RIGHT)
    t.set_col_alignment(4,t.RIGHT)
    t.set_col_alignment(5,t.RIGHT)
    
    rep = []                    # report will be sorted by second column

    print("\n"*4)
    for ff in sorted(drive_encoding_counts.keys()):
        for enc in sorted(drive_encoding_counts[ff].keys()):
            k = ff + " / " + str(enc)
Code example #10
def process(out, dname1, dname2):
    mode = 'text'
    if options.html: mode = 'html'

    b1 = bulk_extractor_reader.BulkReport(dname1)
    b2 = bulk_extractor_reader.BulkReport(dname2)

    t = ttable.ttable()
    t.append_data(['bulk_diff.py Version:', bulk_diff_version])
    t.append_data(['PRE Image:', b1.image_filename()])
    t.append_data(['POST Image:', b2.image_filename()])
    out.write(t.typeset(mode=mode))

    if b1.files.difference(b2.files):
        print("Files only in %s:\n   %s" %
              (b1.name, " ".join(b1.files.difference(b2.files))))
    if b2.files.difference(b1.files):
        print("Files only in %s:\n   %s" %
              (b2.name, " ".join(b2.files.difference(b1.files))))

    # Report interesting differences based on the histograms.
    # Output Example:
    """
# in PRE     # in POST      ∆      Feature
10           20            10      [email protected]
 8           17             9      [email protected]
11           16             5      [email protected]
"""
    common_files = b1.files.intersection(b2.files)
    histogram_files = list(filter(lambda a: "histogram" in a, common_files))

    if options.html:
        out.write("<ul>\n")
        for histogram_file in sorted(histogram_files):
            out.write("<li><a href='#%s'>%s</a></li>\n" %
                      (histogram_file, histogram_file))
        out.write("</ul>\n<hr/>\n")

    for histogram_file in sorted(histogram_files):
        diffcount = 0
        if options.html:
            out.write('<h2><a name="%s">%s</a></h2>\n' %
                      (histogram_file, histogram_file))
        else:
            out.write('\n')
        t = ttable.ttable()
        t.set_col_alignment(0, t.RIGHT)
        t.set_col_alignment(1, t.RIGHT)
        t.set_col_alignment(2, t.RIGHT)
        t.set_col_alignment(3, t.LEFT)
        t.set_title(histogram_file)
        t.append_head(['# in PRE', '# in POST', '∆', 'Value'])

        b1.hist = b1.read_histogram(histogram_file)
        b2.hist = b2.read_histogram(histogram_file)
        b1.keys = set(b1.hist.keys())
        b2.keys = set(b2.hist.keys())

        # Collect the rows first; they are sorted below with mysortkey
        data = []
        for feature in b1.keys.union(b2.keys):
            v1 = b1.hist.get(feature, 0)
            v2 = b2.hist.get(feature, 0)
            if v1 != v2: diffcount += 1
            if v2 > v1 or (v2 == v1 and options.same) or (v2 < v1 and options.smaller):
                data.append((v1, v2, v2 - v1, feature))

        # Sort by the diff first (largest increase first), then alphabetically on value, then by the POST and PRE counts
        def mysortkey(a):
            return (-a[2], a[3], a[1], a[0])

        if data:
            for row in sorted(data, key=mysortkey):
                t.append_data(row)
            out.write(t.typeset(mode=mode))
        if diffcount == 0:
            out.write("{}: No differences\n\n".format(histogram_file))
Code example #11
File: sceadan_train.py Project: WeiliangLuo/sceadan
def generate_confusion():
    if os.path.exists(expname("confusion.txt")):
        print("Confusion matrix already exists")
        return

    if not os.path.exists(model_file()):
        raise RuntimeError("Cannot generate confusion matrix. Model file {} does not exist".format(model_file()))

    print("Generating confusion matrix")
    t = ttable()

    db["test_blocksize"] = args.test_blocksize

    # t.append_head(['File Type','Classifies as'])

    # Okay. Get these in a row with a threadpool
    if args.j > 1:
        import multiprocessing

        pool = multiprocessing.Pool(args.j)
    else:
        import multiprocessing.dummy

        pool = multiprocessing.dummy.Pool(1)
    sceadan_score_rows = {}
    files_per_type = {}
    blocks_per_type = {}
    classified_types = set()

    for (ftype, tally, file_count, block_count) in pool.imap_unordered(get_sceadan_score_for_filetype, filetypes()):
        sceadan_score_rows[ftype] = tally
        classified_types = classified_types.union(set(tally.keys()))
        files_per_type[ftype] = file_count
        blocks_per_type[ftype] = block_count

    classtypes = [t.upper() for t in sorted(classified_types)]

    # Get a list of all the classified types
    # And calculate the precision and recall

    t.append_head(["    ", "file ", "block"])
    t.append_head(["type", "count", "count"] + classtypes)
    t.set_col_alignment(2, t.RIGHT)
    t.set_col_alignment(3, t.RIGHT)
    total_events = 0
    total_correct = 0
    percent_correct_sum = 0
    rowcounter = 0
    for ftype in filetypes():
        FTYPE = ftype.upper()
        tally = sceadan_score_rows[ftype]
        count = sum(tally.values())
        total_events += count
        total_correct += tally.get(ftype.upper(), 0)
        percent_correct = tally.get(ftype.upper(), 0) * 100.0 / count
        percent_correct_sum += percent_correct

        # Generate each row of the output

        data = [FTYPE, files_per_type[ftype], blocks_per_type[ftype]] + [
            "{:3.0f}".format(tally.get(f, 0) * 100.0 / count) for f in classtypes
        ]
        t.append_data(data)
        rowcounter += 1
        if rowcounter % 5 == 0:
            t.append_data([""])
    f = openexp("confusion.txt", "w")
    f.write(t.typeset(mode="text"))
    print(t.typeset(mode="text"))
    db["overall_accuracy"] = (total_correct * 100.0) / total_events
    db["average_accuracy_per_class"] = percent_correct_sum / len(filetypes())

    def info(n):
        """Print a value from the database and print in confusion.txt"""
        try:
            v = str(db[n])[0:120]
            val = "{}: {}".format(n.replace("_", " "), v)
            f.write(val + "\n")
            print(val)
        except KeyError as e:
            f.write("key {} not found\n".format(n))

    print("Keys in database:", list(db.keys()))
    info("train_blocksize")
    info("test_blocksize")
    info("liblinear_train_command")
    info("overall_accuracy")
    info("average_accuracy_per_class")
Code example #12
File: bulk_diff.py Project: BruceMty/bulk_extractor
def process(out,dname1,dname2):
    mode = 'text'
    if options.html: mode='html'

    b1 = bulk_extractor_reader.BulkReport(dname1)
    b2 = bulk_extractor_reader.BulkReport(dname2)

    t = ttable.ttable()
    t.append_data(['bulk_diff.py Version:',bulk_diff_version])
    t.append_data(['PRE Image:',b1.imagefile()])
    t.append_data(['POST Image:',b2.imagefile()])
    out.write(t.typeset(mode=mode))

    if b1.files.difference(b2.files):
        print("Files only in %s:\n   %s" % (b1.name," ".join(b1.files.difference(b2.files))))
    if b2.files.difference(b1.files):
        print("Files only in %s:\n   %s" % (b2.name," ".join(b2.files.difference(b1.files))))

    # Report interesting differences based on the histograms.
    # Output Example:
    """
# in PRE     # in POST      ∆      Feature
10           20            10      [email protected]
 8           17             9      [email protected]
11           16             5      [email protected]
"""
    common_files = b1.files.intersection(b2.files)
    histogram_files = list(filter(lambda a:"histogram" in a,common_files))
    
    if options.html:
        out.write("<ul>\n")
        for histogram_file in sorted(histogram_files):
            out.write("<li><a href='#%s'>%s</a></li>\n" % (histogram_file,histogram_file))
        out.write("</ul>\n<hr/>\n")

    for histogram_file in sorted(histogram_files):
        diffcount = 0
        if options.html:
            out.write('<h2><a name="%s">%s</a></h2>\n' % (histogram_file,histogram_file))
        else:
            out.write('\n')
        t = ttable.ttable()
        t.set_col_alignment(0,t.RIGHT)
        t.set_col_alignment(1,t.RIGHT)
        t.set_col_alignment(2,t.RIGHT)
        t.set_col_alignment(3,t.LEFT)
        t.set_title(histogram_file)
        t.append_head(['# in PRE','# in POST','∆','Value'])

        b1.hist = b1.read_histogram(histogram_file)
        b2.hist = b2.read_histogram(histogram_file)
        b1.keys = set(b1.hist.keys())
        b2.keys = set(b2.hist.keys())

        # Collect the rows first; they are sorted below with mysortkey
        data = []
        for feature in b1.keys.union(b2.keys):
            v1 = b1.hist.get(feature,0)
            v2 = b2.hist.get(feature,0)
            if v1!=v2: diffcount += 1
            if v2<=v1 and not options.smaller: continue
            data.append((v1, v2, v2-v1, feature))

        # Sort by the diff first (largest increase first), then alphabetically on value, then by the POST and PRE counts
        def mysortkey(a):
            return (-a[2],a[3],a[1],a[0])

        if data:
            for row in sorted(data,key=mysortkey):
                t.append_data(row)
            out.write(t.typeset(mode=mode))
        if diffcount==0:
            out.write("{}: No differences\n\n".format(histogram_file))
Code example #13
File: ireport.py Project: grayed/dfxml
def process_files(fn):
    drive_files = {}                         # index of drives
    all_parts  = []
    all_files = []
    files_by_md5 = {}           # a dictionary of sets of fiobject, indexed by md5
    extension_len_histogram = histogram2d()
    extension_fragments_histogram = histogram2d()
    partition_histogram = histogram2d()

    def cb(fi):
        # add the md5 to the set
        if fi.is_file() and fi.filesize():
            files_by_md5.setdefault(fi.md5,set()).add(fi)
            ext = fi.ext()
            if not ext: print(fi.meta_type(),fi)
            extension_len_histogram.add(ext,fi.filesize())
            extension_fragments_histogram.add(ext,fi.fragments())
            partition_histogram.add(fi.partition(),fi.filesize())

    if fn.endswith('xml'):
        fiwalk.fiwalk_using_sax(xmlfile=open(fn),callback=cb)
    else:
        fiwalk.fiwalk_using_sax(imagefile=open(fn),callback=cb)


    #
    # Typeset the information
    #

    tab = ttable()
    tab.header     = "File extension popularity and average size (suppressing 0-len files)"
    tab.col_headings = [['Ext','Count','Average Size','Max','Std Dev']]
    tab.omit_row = [[0,'']]
    extension_len_histogram.statcol = ['iaverage','maxx','istddev']
    print(extension_len_histogram.typeset(tab=tab))

    #
    # Information about fragmentation patterns
    #
    tab = ttable()
    tab.header="Fragmentation pattern by file system and file type:"
    tab.col_headings = [['Ext','Count','Average Size','Max','Std Dev']]
    tab.omit_row = [[0,'']]
    extension_fragments_histogram.statcol = ['iaverage','maxx','istddev']
    print(extension_fragments_histogram.typeset(tab=tab))
    exit(0)

    for fstype in fstypes:
        for ftype in ['jpg','pdf','doc','txt']:
            len1stats = statbag()
            len2stats = statbag()
            delta_hist = histogram()
            delta_re = re.compile(r"(\d+)-?(\d+)? ?(\d+)-?(\d+)?")
            for i in filter((lambda f: f.ext()==ftype and f.fragments()==2),all_files):
                runs = False
                if(hasattr(i,'block_runs')): runs = i.block_runs
                if(hasattr(i,'sector_runs')): runs = i.sector_runs
                if not runs: continue
                m = delta_re.search(runs)
                r = []
                for j in range(1,5):
                    try:
                        r.append(int(m.group(j)))
                    except TypeError:
                        r.append(int(m.group(j-1)))

                len1 = r[1] - r[0] + 1
                len2 = r[3] - r[2] + 1
                delta = r[2]-r[1]

                len1stats.addx(len1)
                len2stats.addx(len2)
                delta_hist.add(delta)

            if len1stats.count()>0:
                print("\n\n")
                print("fstype:",fstype,"  ftype:",ftype)
                print("len1 average: %f stddev: %f" % (len1stats.average(),len1stats.stddev()))
                print("len2 average: %f stddev: %f" % (len2stats.average(),len2stats.stddev()))
                print("delta average: %f" % delta_hist.average())
                print("delta histogram:")
                delta_hist.print_top(10)


    exit(0)


    print("Partition histogram:")
    partition_histogram.print_top(n=100)
    print("Counts by extension:")
    extension_len_histogram.print_top(n=100)
    print("Fragments by extension:")
    extension_fragments_histogram.print_top(n=100)

    exit(0)
    for fstype in fstypes:
        if fstype=='(unrecognized)': continue
        print(fstype,"Partitions:")

        def isfstype(x): return x.fstype==fstype
        these_parts = filter(isfstype,all_parts)
        these_files = []
        for part in these_parts:
            these_files.extend(part.files)
        print(fragmentation_table(these_files))


    exit(0)

    sys.exit(0)


    #
    # Typeset information about file extensions
    #
    hist_exts = histogram2d()
    hist_exts.topn = 20
    for i in all_files:
        if i.size>0 and i.fragments>0: hist_exts.add(i.ext(),i.size)
    tab = ttable()
    tab.header     = "File extension popularity and average size (suppressing 0-len files)"
    tab.col_headings = ['Ext','Count','Average Size','Max','Std Dev']
    tab.omit_row = [[0,'']]
    hist_exts.statcol = ['iaverage','maxx','istddev']
    print(hist_exts.typeset(t=tab))

    hist_exts = histogram2d()
    hist_exts.topn = 20
    for i in all_files:
        if i.fragments>0: hist_exts.add(i.ext(),i.fragments)
    tab = ttable()
    tab.header     = "Fragmentation by file extension (suppressing files with 0 fragments)"
    tab.col_headings = ['Ext','Count','Avg Fragments','Max','Std Dev']
    tab.omit_row = [[0,'']]
    hist_exts.statcol = ['average','maxx','stddev']
    print(hist_exts.typeset(t=tab))

    print("===========================")


    #
    # Typeset the File Systems on Drives table
    #

    tab = ttable()
    tab.header     = "File Systems on Drives"
    tab.col_headings = ["FS Type","Drives","MBytes"]
    tab.col_totals = [1,2]
    fstypeh.statcol = 'sumx'
    print(fstypeh.typeset(t=tab))

    #
    # Typeset overall fragmentation stats
    #

    print(fragmentation_table(all_files))
Code example #14
File: sceadan_train.py Project: Jude-Sancti/sceadan
def generate_confusion():
    if os.path.exists(expname("confusion.txt")):
        print("Confusion matrix already exists")
        return

    if not os.path.exists(model_file()):
        raise RuntimeError("Cannot generate confusion matrix. Model file {} does not exist".format(model_file()))
        
    print("Generating confusion matrix")
    t = ttable()

    db['test_blocksize'] = args.test_blocksize

    sceadan_score_rows      = {}
    files_per_type          = {}
    blocks_per_type         = {}
    classified_types        = set()

    for key, value in db['RESULTS'].items():
        temp = value.copy()
        sceadan_score_rows[key] = temp['TALLY']
        classified_types        = classified_types.union(set(temp['TALLY'].keys()))
        files_per_type[key]     = temp['FILE_COUNT']
        blocks_per_type[key]    = temp['BLOCK_COUNT']
        
    classtypes = [t.upper() for t in sorted(classified_types)]

    t.append_head(['    ','file ', 'block'])
    t.append_head(['type','count', 'count'] + classtypes)
    t.set_col_alignment(2,t.RIGHT)
    t.set_col_alignment(3,t.RIGHT)
    total_events = 0
    total_correct = 0
    percent_correct_sum= 0
    rowcounter = 0
    for ftype in filetypes():
        FTYPE         = ftype.upper()
        tally         = sceadan_score_rows[ftype]
        count         = sum(tally.values())
        total_events  += count
        total_correct += tally.get(ftype.upper(),0)
        percent_correct = tally.get(ftype.upper(),0)*100.0 / count
        percent_correct_sum += percent_correct

        # Generate each row of the output
        
        data = [FTYPE, files_per_type.get(FTYPE,0), blocks_per_type.get(FTYPE,0)] + ["{:3.0f}".format(tally.get(f,0)*100.0/count) for f in classtypes]
        t.append_data(data)
        rowcounter += 1
        if rowcounter % 5 ==0:
            t.append_data([''])
    f = openexp("confusion.txt","w")
    f.write(t.typeset(mode='text'))
    print(t.typeset(mode='text'))
    db['overall_accuracy'] = (total_correct*100.0)/total_events
    db['average_accuracy_per_class'] = percent_correct_sum/len(filetypes())

    def info(n):
        """Print a value from the database and print in confusion.txt"""
        try:
            v = str(db[n])[0:120]
            val = "{}: {}".format(n.replace("_"," "),v)
            f.write(val+"\n")
            print(val)
        except KeyError as e:
            f.write("key {} not found\n".format(n))
    print("Keys in database:",list(db.keys()))
    info('train_blocksize')
    info('test_blocksize')
    info('liblinear_train_command')
    info('overall_accuracy')
    info('average_accuracy_per_class')
Code example #15
File: ireport.py Project: Acidburn0zzz/dfxml
def process_files(fn):
    drive_files = {}                         # index of drives
    all_parts  = []
    all_files = []
    files_by_md5 = {}           # a dictionary of sets of fiobject, indexed by md5
    extension_len_histogram = histogram2d()
    extension_fragments_histogram = histogram2d()
    partition_histogram = histogram2d()

    def cb(fi):
        # add the md5 to the set
        if fi.is_file() and fi.filesize():
            files_by_md5.setdefault(fi.md5,set()).add(fi)
            ext = fi.ext()
            if not ext: print fi.meta_type(),fi
            extension_len_histogram.add(ext,fi.filesize())
            extension_fragments_histogram.add(ext,fi.fragments())
            partition_histogram.add(fi.partition(),fi.filesize())

    if fn.endswith('xml'):
        fiwalk.fiwalk_using_sax(xmlfile=open(fn),callback=cb)
    else:
        fiwalk.fiwalk_using_sax(imagefile=open(fn),callback=cb)
    

    #
    # Typeset the information
    #

    tab = ttable()
    tab.header     = "File extension popularity and average size (suppressing 0-len files)"
    tab.col_headings = [['Ext','Count','Average Size','Max','Std Dev']]
    tab.omit_row = [[0,'']]
    extension_len_histogram.statcol = ['iaverage','maxx','istddev']
    print extension_len_histogram.typeset(tab=tab)

    #
    # Information about fragmentation patterns
    #
    tab = ttable()
    tab.header="Fragmentation pattern by file system and file type:"
    tab.col_headings = [['Ext','Count','Average Size','Max','Std Dev']]
    tab.omit_row = [[0,'']]
    extension_fragments_histogram.statcol = ['iaverage','maxx','istddev']
    print extension_fragments_histogram.typeset(tab=tab)
    exit(0)

    for fstype in fstypes:
        for ftype in ['jpg','pdf','doc','txt']:
            len1stats = statbag()
            len2stats = statbag()
            delta_hist = histogram()
            delta_re = re.compile(r"(\d+)-?(\d+)? ?(\d+)-?(\d+)?")
            for i in filter( (lambda f: f.ext()==ftype and f.fragments()==2),all_files):
                runs = False
                if(hasattr(i,'block_runs')): runs = i.block_runs
                if(hasattr(i,'sector_runs')): runs = i.sector_runs
                if not runs: continue
                m = delta_re.search(runs)
                r = []
                for j in range(1,5):
                    try:
                        r.append(int(m.group(j)))
                    except TypeError:
                        r.append(int(m.group(j-1)))

                len1 = r[1] - r[0] + 1
                len2 = r[3] - r[2] + 1
                delta = r[2]-r[1]
                
                len1stats.addx(len1)
                len2stats.addx(len2)
                delta_hist.add(delta)

            if len1stats.count()>0:
                print "\n\n"
                print "fstype:",fstype,"  ftype:",ftype
                print "len1 average: %f stddev: %f" % (len1stats.average(),len1stats.stddev())
                print "len2 average: %f stddev: %f" % (len2stats.average(),len2stats.stddev())
                print "delta average: %f" % delta_hist.average()
                print "delta histogram:"
                delta_hist.print_top(10)