def main():
    if len(sys.argv) < 2:
        print("Usage: {} <filename.xml>".format(sys.argv[0]))
        exit(1)
    dfxml.read_dfxml(xmlfile=open(sys.argv[1], "rb"), callback=process)
    timeline.sort()
    for record in timeline:
        print("\t".join(map(str, record)))
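This main() assumes a module-level timeline list and a process callback defined elsewhere in the script. A minimal sketch of what they might look like; the record layout and event labels are assumptions, not something this snippet confirms:

import sys
import dfxml

timeline = []

def process(fi):
    # Hypothetical callback: collect one [timestamp, filename, event]
    # record per timestamp present on the file object.
    for (timestamp, event) in [(fi.mtime(), "modified"),
                               (fi.crtime(), "created"),
                               (fi.ctime(), "changed"),
                               (fi.atime(), "accessed")]:
        if timestamp is not None:
            timeline.append([timestamp, fi.filename(), event])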
def main():
    filemetadata_out = open("filemetadata.sql", "w")
    md5_out = open("md5.sql", "w")

    (conn, cursor) = differ_library.db_conn_from_config_path(args.config)

    # Get slice hash
    cursor.execute("SELECT slicehash FROM diskprint.storage WHERE location = %s", (args.slice_path,))
    inrows = [row for row in cursor]
    if len(inrows) != 1:
        logging.error("Could not find diskprint from tarball path: %r." % args.slice_path)
        sys.exit(1)
    slicehash = inrows[0]["slicehash"]

    def process_fi(fi):
        """
        Produce SQL records for every allocated file.
        (This is an inline function so the value of 'slicehash' is in scope.)
        """
        # Only allocated, regular files
        if not fi.allocated():
            return
        if fi.name_type() != "r":
            return

        # Build SQL templates
        md5_insert_template = "insert into diskprint.MD5 values ('%(keyhash)s','%(keyhash_md5)s');\n"
        filemetadata_insert_template = "insert into diskprint.filemetadata (keyhash, slicehash, path, filename, extension, bytes, mtime, ctime) values ('%(keyhash)s','%(slicehash)s','%(path)s','%(filename)s','%(extension)s',%(bytes)d,'%(mtime)s','%(ctime)s');\n"

        # Build SQL values as substitution dictionary
        d = dict()
        d["keyhash"] = fi.sha1()
        d["keyhash_md5"] = fi.md5()
        d["slicehash"] = slicehash
        d["path"] = fi.filename()
        d["filename"] = os.path.basename(fi.filename())
        d["extension"] = os.path.splitext(fi.filename())[1]
        d["bytes"] = fi.filesize()
        d["mtime"] = fi.mtime()
        d["ctime"] = fi.crtime()  # TODO What does this table actually mean by ctime? Change, or create?

        # Output
        filemetadata_out.write(filemetadata_insert_template % d)
        md5_out.write(md5_insert_template % d)

    # Begin loop through XML
    dfxml.read_dfxml(xmlfile=open(args.fiwalk_xml, "rb"), callback=process_fi)
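Because the templates splice values straight into SQL text, a filename containing a quote character would break the generated script. If rows were inserted directly through the existing cursor instead of written to .sql files, a parameterized form avoids the quoting problem; insert_fi below is a hypothetical helper, sketched in the same placeholder style the script already uses for its SELECT:

def insert_fi(cursor, slicehash, fi):
    # Hypothetical variant: let the DB driver do the quoting rather
    # than building SQL text with % substitution.
    cursor.execute(
        "INSERT INTO diskprint.filemetadata "
        "(keyhash, slicehash, path, filename, extension, bytes, mtime, ctime) "
        "VALUES (%s, %s, %s, %s, %s, %s, %s, %s)",
        (fi.sha1(), slicehash, fi.filename(),
         os.path.basename(fi.filename()),
         os.path.splitext(fi.filename())[1],
         fi.filesize(), fi.mtime(), fi.crtime()))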
if __name__ == "__main__":
    from argparse import ArgumentParser
    from copy import deepcopy
    parser = ArgumentParser(description='Report information about a DFXML file')
    parser.add_argument('xmlfiles', help='XML files to process', nargs='+')
    parser.add_argument("--files", help="Report on file objects that the DFXML file contains", action='store_true')
    parser.add_argument("--imagefile", help="specifies imagefile to examine; automatically runs fiwalk", nargs='+')
    args = parser.parse_args()

    ds = DiskSet()
    if args.files:
        # Read every named DFXML file before reporting
        for fn in args.xmlfiles:
            dfxml.read_dfxml(xmlfile=open(fn, 'rb'), callback=ds.pass1)
        if ds.uniques() > 0:
            ds.print_dups_report()
        exit(0)

    for fn in args.xmlfiles:
        dfxml_info(fn)
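This driver relies on a DiskSet class providing pass1(), uniques(), and print_dups_report(). The pass1() body and the duplicates report appear further below; a minimal sketch of the remaining scaffolding, assumed rather than taken from this snippet:

class DiskSet:
    # Hypothetical scaffolding; the real class may track more state.
    def __init__(self):
        self.fi_by_md5 = dict()   # md5 hex digest -> list of fileobjects

    def uniques(self):
        # Number of distinct MD5 values seen so far.
        return len(self.fi_by_md5)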
parser.usage = '%prog [options] imagefile-or-xmlfile s1 [s2 s3 s4 ...]'
parser.add_option("--offset", help="values are byte offsets, not sectors", action="store_true")
parser.add_option("--blocksize", help="specify sector blocksize", type="int", default=512)
(options, args) = parser.parse_args()

if len(args) < 1:
    parser.print_help()
    sys.exit(1)

fn = args[0]
print(args)
print("Processing %s" % fn)
print("Searching for %s" % ", ".join(args[1:]))

divisor = 1
if options.offset:
    divisor = options.blocksize
sectors = set([int(s) // divisor for s in args[1:]])  # integer division: sector numbers must be ints

def process(fi):
    for s in sectors:
        if fi.has_sector(s):
            print("%d\t%s" % (s, fi.filename()))

if not fn.endswith(".xml"):
    print("iblkfind requires an XML file")
    exit(1)

dfxml.read_dfxml(xmlfile=open(fn, "rb"), callback=process)
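The --offset conversion is plain integer division: with the default 512-byte sectors, byte offset 1048576 maps to sector 2048.

# Hypothetical check of the byte-offset-to-sector arithmetic above.
blocksize = 512
assert 1048576 // blocksize == 2048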
if __name__ == "__main__":
    from argparse import ArgumentParser
    parser = ArgumentParser()
    parser.usage = \
"""
dfxml_xtract.py [args] dfxml imagefile

Using the metadata from a DFXML file, extract each file object from a
target image file.
"""
    parser.add_argument("dfxml", help="Specify the target DFXML file")
    parser.add_argument("image", help="Specify the target image file")
    args = parser.parse_args()

    # Set up output directory for extracted files
    dfxml_path = os.path.abspath(args.dfxml)
    image_path = os.path.abspath(args.image)
    dir_name = os.getcwd() + "/output"
    if os.path.isdir(dir_name):
        os.chdir(dir_name)
    else:
        os.mkdir('output')
        os.chdir('output')

    print("Working Dir :", os.getcwd())
    print("Target DFXML:", dfxml_path)
    print("Target IMAGE:", image_path)

    dfxml.read_dfxml(xmlfile=open(dfxml_path, 'rb'), callback=extract_file)
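The driver assumes an extract_file callback defined earlier in the script. One possible sketch reads each allocated regular file's byte runs straight out of the image; treat the whole body as an assumption about the original's logic, not a copy of it (image_path is the global the driver sets up above):

import os

def extract_file(fi):
    # Hypothetical callback: write each allocated regular file's bytes
    # into the current (output) directory.
    if not fi.allocated() or fi.name_type() != "r":
        return
    out_name = os.path.basename(fi.filename()) or "unnamed"
    with open(image_path, "rb") as img, open(out_name, "wb") as out:
        for br in fi.byte_runs():
            if getattr(br, "img_offset", None) is None:
                continue  # skip runs with no image mapping (e.g., sparse runs)
            img.seek(br.img_offset)
            out.write(img.read(br.len))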
#
# Demo program that shows how to calculate the average size of file objects in a DFXML file
#

import math
import sys
import os
import collections

sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
import dfxml

sums = collections.Counter()
sum_of_squares = collections.Counter()
count = collections.Counter()

def func(fi):
    ext = fi.ext()
    count[ext] += 1
    sums[ext] += fi.filesize()
    sum_of_squares[ext] += fi.filesize() ** 2  # accumulate; assigning here would keep only the last file's square

dfxml.read_dfxml(xmlfile=open(sys.argv[1], 'rb'), callback=func)

fmt = "{:8} {:8} {:8} {:8} {:8}"
print(fmt.format("Ext", "Count", "Total", "Average", "StdDev"))
for ext in sums.keys():
    print(fmt.format(ext, count[ext], sums[ext],
                     sums[ext] / count[ext],
                     math.sqrt(sum_of_squares[ext] / count[ext] - (sums[ext] / count[ext]) ** 2)))
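The StdDev column relies on the identity Var(X) = E[X^2] - (E[X])^2, which is why the script only needs running sums rather than a second pass. A quick self-contained check of that identity against the standard library, using hypothetical sizes:

import math
import statistics

data = [100, 200, 400]  # hypothetical file sizes for one extension
n = len(data)
mean = sum(data) / n
var = sum(x * x for x in data) / n - mean ** 2  # E[X^2] - (E[X])^2
assert math.isclose(math.sqrt(var), statistics.pstdev(data))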
        f_tmp_filename.close()
        self.hashdb = {}  # to clear the memory
        print("singletons {}, pairs {}, triples {}, others {}".format(
            self.singletons, self.pairs, self.triples, self.others))

if __name__ == "__main__":
    import argparse, os, sys
    sys.path.append(os.getenv("DOMEX_HOME") + "/src/lib/")            # add the library
    sys.path.append(os.getenv("DOMEX_HOME") + "/src/dfxml/python/")   # add the library
    import dfxml, subprocess
    parser = argparse.ArgumentParser(description="A program that takes a directory of files and computes sector-based statistics.")
    parser.add_argument("--dir", help="Directory of files that will be analyzed")
    parser.add_argument("--bs", help="Specifies a block size for piecewise hashing", type=int, default=512)
    parser.add_argument("--file", help="Specifies a file that contains output from md5deep", type=str, default=None)
    args = parser.parse_args()

    sc = SectorCorrelator()
    if args.file is None:
        p = subprocess.Popen(['md5deep', '-dp ' + str(args.bs), '-r', args.dir], stdout=subprocess.PIPE)
        dfxml.read_dfxml(xmlfile=p.stdout, callback=sc.process)
    else:
        dfxml.read_dfxml(xmlfile=open(args.file, 'rb'), callback=sc.process)
    sc.print_report()
    sc.gen_file_stats()
    sc.print_file_report()
    sc.gen_filetype_stats()
    sc.print_filetype_report()
    def process(self, fname):
        self.fname = fname
        dfxml.read_dfxml(xmlfile=open(fname, "rb"), callback=self.process_fi)
if __name__ == "__main__":
    from argparse import ArgumentParser
    global options
    parser = ArgumentParser()
    parser.add_argument("dfxml", type=str)
    parser.add_argument("--verbose", action="store_true")
    parser.add_argument("--prefix", type=str, help="Only output files with the given prefix")
    parser.add_argument("--distinct", action='store_true', help='Report the distinct files')
    parser.add_argument("--dups", action='store_true', help='Report the files that are dups, and give dup count')
    args = parser.parse_args()

    dobj = dedup()
    try:
        dfxml.read_dfxml(open(args.dfxml, 'rb'), callback=dobj.process)
    except xml.parsers.expat.ExpatError:
        pass

    print("Total files: {:,} total MD5s processed: {:,} Unique MD5s: {:,}".format(
        dobj.files, dobj.md5s, len(dobj.seen)))

    if args.distinct:
        def report_distinct(names):
            if args.prefix and not names[0].startswith(args.prefix):
                return
            print("distinct: ", names[0])
        dobj.report(lambda names: len(names) == 1, report_distinct)

    if args.dups:
        def report_dups(names):
            for name in names:
                if not args.prefix or name.startswith(args.prefix):
                    # assumed completion (source cut off here): print each
                    # dup with its copy count, mirroring the --distinct branch
                    print("dup ({} copies): {}".format(len(names), name))
        dobj.report(lambda names: len(names) > 1, report_dups)
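The driver expects a dedup class exposing files, md5s, seen, process(), and report(). A hypothetical minimal scaffolding consistent with how the driver uses those names; the real class may differ:

class dedup:
    def __init__(self):
        self.files = 0          # file objects seen
        self.md5s = 0           # file objects that carried an MD5
        self.seen = dict()      # md5 hex digest -> list of filenames

    def process(self, fi):
        self.files += 1
        if fi.md5():
            self.md5s += 1
            self.seen.setdefault(fi.md5(), []).append(fi.filename())

    def report(self, pred, func):
        # Invoke func on each filename list that satisfies pred.
        for names in self.seen.values():
            if pred(names):
                func(names)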
    def read(self, f):
        if type(f) == str:
            self.fname = f
            f = open(f, 'rb')
        dfxml.read_dfxml(xmlfile=f, callback=self.pass1)
class SectorCorrelator:
    def __init__(self):
        self.hashdb = collections.defaultdict(list)  # key is the MD5 code, value is a list of matches
        self.files = 0
        self.sectors = 0

    def process(self, fi):
        """Process the <fileobject> objects as they are read from the XML file"""
        self.files += 1
        print(fi.filename())
        for br in fi.byte_runs():
            self.sectors += 1
            self.hashdb[br.hashdigest['md5']].append((fi.filename(), br.file_offset))

    def print_report(self):
        print("Files processed: {}".format(self.files))
        print("Sectors processed: {}".format(self.sectors))
        print("")
        print("The following duplicates were found:")
        print("Hash                              Filename    Offset in file")
        for (hash, ents) in self.hashdb.items():
            if len(ents) > 1:
                print("{} -- {} copies found".format(hash, len(ents)))
                for e in sorted(ents):
                    print("    {} {:8,}".format(e[0], e[1]))
                print("")

sc = SectorCorrelator()
dfxml.read_dfxml(xmlfile=open(sys.argv[1], 'rb'), callback=sc.process)
sc.print_report()
    def pass1(self, fi):
        if fi.is_file():
            self.fi_by_md5.setdefault(fi.md5(), []).append(fi)

    def print_dups_report(self):
        print("Duplicates:")
        # First extract the dups, then sort them
        dups = filter(lambda a: len(a[1]) > 1, self.fi_by_md5.items())
        dup_bytes = 0
        for (md5hash, fis) in sorted(dups, key=lambda a: a[1][0].filesize(), reverse=True):
            for fi in fis:
                print("{:>16,} {:32} {}".format(fi.filesize(), fi.md5(), fi.filename()))
            print()
            dup_bytes += fis[0].filesize() * (len(fis) - 1)
        print("Total duplicate bytes: {:,}".format(dup_bytes))

if __name__ == "__main__":
    from argparse import ArgumentParser
    from copy import deepcopy
    parser = ArgumentParser(description='Report information about a DFXML file')
    parser.add_argument('xmlfiles', help='XML files to process', nargs='+')
    parser.add_argument("--imagefile", help="specifies imagefile to examine; automatically runs fiwalk", nargs='+')
    args = parser.parse_args()

    ds = DiskSet()
    for fn in args.xmlfiles:
        print("Processing {}".format(fn))
        dfxml.read_dfxml(xmlfile=open(fn, 'rb'), callback=ds.pass1)
    ds.print_dups_report()
hs_name = hs_name + '.csv'
print("\n>>> Writing HashSet to:", hs_name)
csv_out = open(hs_name, 'w', newline='')
writer = csv.writer(csv_out)

# Write out HashSet header
writer.writerow(["%%%% HASHDEEP-1.0"])
if args.sha1:
    writer.writerow(["%%%% size", "md5", "sha1", "filename"])
else:
    writer.writerow(["%%%% size", "md5", "filename"])
writer.writerow(["## HashSet created from " + args.dfxml])
writer.writerow(["## HashSet created using dfxml2hashdeep.py"])

# Set up lists for HashSet values
size = []
md5 = []
filename = []
if args.sha1:
    sha1 = []

# Read DFXML file for metadata values
dfxml.read_dfxml(xmlfile=open(args.dfxml, 'rb'), callback=process_file)

# Write out the lists to the HashSet
if args.sha1:
    for row in zip(size, md5, sha1, filename):
        writer.writerow(row)
else:
    for row in zip(size, md5, filename):
        writer.writerow(row)
csv_out.close()
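The read_dfxml() call expects a process_file callback defined elsewhere in the script. A hypothetical sketch consistent with the column lists above; it fills one value per list for each file object and relies on the size, md5, sha1, filename, and args globals the driver sets up:

def process_file(fi):
    # Hypothetical callback: one entry per HashSet column per file.
    size.append(fi.filesize())
    md5.append(fi.md5())
    if args.sha1:
        sha1.append(fi.sha1())
    filename.append(fi.filename())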
# This software was developed at the National Institute of Standards
# and Technology by employees of the Federal Government in the course
# of their official duties. Pursuant to title 17 Section 105 of the
# United States Code this software is not subject to copyright
# protection and is in the public domain. NIST assumes no
# responsibility whatsoever for its use by other parties, and makes
# no guarantees, expressed or implied, about its quality,
# reliability, or any other characteristic.
#
# We would appreciate acknowledgement if the software is used.

"""
This script confirms that the DFXML pip-managed packaging exposes the
dfxml package and the objects.py module.
"""

import sys

import dfxml
import dfxml.objects


def nop(x):
    pass


with open(sys.argv[1], "rb") as fh:
    dfxml.read_dfxml(fh, callback=nop)

for (event, obj) in dfxml.objects.iterparse(sys.argv[1]):
    pass
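A slightly fuller sketch of the same two entry points, printing each file's name instead of discarding it; the path "sample.dfxml" is a placeholder, and the isinstance filter assumes the FileObject class the objects module exposes:

import dfxml
import dfxml.objects

def print_name(fi):
    print(fi.filename())

with open("sample.dfxml", "rb") as fh:
    dfxml.read_dfxml(fh, callback=print_name)

for (event, obj) in dfxml.objects.iterparse("sample.dfxml"):
    if event == "end" and isinstance(obj, dfxml.objects.FileObject):
        print(obj.filename)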