def get_creation_date(_path): """ Simple function to retrieve the creation date from the file's metdata Args: _path the full path to the file. """ # Initialise result _creation_date = None # Using the hachoir metadata library retrieve file metadata hachoir_config.quiet = True try: parser = createParser(unicodeFilename(_path), _path) if parser: metadata = extractMetadata(parser) if metadata: _creation_date = metadata.get("creation_date") except Exception: pass # Validate and use ctime if not available if not _creation_date: _ctime = os.path.getctime(_path) _creation_date = datetime.datetime.fromtimestamp(_ctime) # Return result return _creation_date
def lnkparse(reflectPath, filename):
    """Return the target filename from a MS-Windows link (URL format).

    Returns a (status, path) tuple; path is URL-quoted and non-empty only
    for status 'OK'.
    """
    filename = unicodeFilename(filename)
    try:
        parser = createParser(filename)
        if parser is not None and isinstance(parser, LnkFile):
            # It is a "MS-Windows" link file
            try:
                for field in parser:
                    pass  # trigger parsing
                lnkpath = parser.getField('relative_path').value
                # mount the complete target path, analyse if inside BasePath
                if lnkpath.startswith('.\\'):
                    lnkpath = lnkpath[2:]
                lnkpath = lnkpath.replace('\\', '/')
                filenamePath = os.path.dirname(filename)
                allLnkpath = os.path.join(reflectPath, filenamePath, lnkpath)
                allLnkpath = os.path.abspath(allLnkpath)  # remove all ..\
                if allLnkpath.startswith(reflectPath):
                    lnkpath = quote(lnkpath.encode('utf-8'))
                    return 'OK', lnkpath
                else:
                    # Target escapes the reflected base path.
                    return 'ERROR_OUTREFLECTPATH', ''
            except MissingField:
                # example: link to a network file
                return 'ERROR_RELPATH', ''
        else:
            return 'NOT_LNKFILE', ''
    except InputStreamError:
        return 'NOT_PARSED', ''
def Downloadfile(url): infoMeta = [] file_name = url.split('/')[-1] infoMeta.append(file_name) u = urllib2.urlopen(url) meta = u.info() infoMeta.append(meta.headers) doc = u.read() f = open(file_name, 'wb') f.write(doc) with open(file_name, 'rb') as p: # Slurp the whole file and efficiently convert it to hex all at once hexdata = binascii.hexlify(p.read()) # use hachoir to add the standard metadata filename = './' + file_name print filename filename, realname = unicodeFilename(filename), filename parser = createParser(filename) try: metalist = metadata.extractMetadata(parser).exportPlaintext() infoMeta.append(metalist[1:4]) except Exception: infoMeta.append(["none", "none", "none"]) p.close() # print "Done", file_name, " Info is ", infoMeta return file_name, hexdata
def save_response_binaries(self, path, hash_value):
    """Carve embedded subfiles out of each flow response capture under *path*.

    For every detail of the Flow matching *hash_value*, reads the
    contents_<src>-<dst>_resp.dat dump and extracts any embedded files into
    <path>/html-files/<flow>/. Returns True on success, False on any error.
    """
    try:
        flow = Flow.objects.get(hash_value=hash_value)
        flow_details = flow.details
        for detail in flow_details:
            # create the orig file ex: contents_192.168.1.5:42825-62.212.84.227:80_resp.dat
            source_str = ":".join([detail.src_ip, str(detail.sport)])
            destination_str = ":".join([detail.dst_ip, str(detail.dport)])
            flow_str = "-".join([source_str, destination_str])
            resp_file = "_".join(["contents", flow_str, "resp.dat"])
            file_path = "/".join([path, resp_file])
            # hachoir wants a plain (non-unicode) path string
            file_path = str(file_path)
            try:
                stream = FileInputStream(unicodeFilename(file_path), real_filename=file_path)
            except NullStreamError:
                # Empty capture file: nothing to carve for this flow detail.
                continue
            subfile = SearchSubfile(stream, 0, None)
            subfile.loadParsers()
            root = "/".join([path, "html-files"])
            if not os.path.exists(root):
                os.makedirs(root)
            output = "/".join([root, flow_str])
            output = str(output)
            if not os.path.exists(output):
                os.mkdir(output)
            subfile.setOutput(output)
            ok = subfile.main()
            # save the files info at the db also
        return True
    except Exception, ex:
        return False
def get_file_metadata(path):
    """Collect hachoir metadata (plus file size) for *path* into a dict.

    Returns a dict with at least "size" for regular files (0 for empty
    files); metadata entries are added when hachoir can parse the file —
    single values bare, multi-values as lists. Returns {} for non-files.
    """
    rdata = {}
    # Bug fix: pre-bind parser so the finally clause cannot raise NameError
    # when createParser itself raises before assigning it.
    parser = None
    if os.path.isfile(path):
        try:
            parser = createParser(unicodeFilename(path), path)
            rdata["size"] = os.stat(path).st_size
            if parser:
                try:
                    metadata = extractMetadata(parser)
                    if metadata:
                        rdata.update(
                            (md.key,
                             md.values[0].value if len(md.values) == 1
                             else [value.value for value in md.values])
                            for md in metadata if md.values)
                except HachoirError as e:
                    logging.exception(e)
        except NullStreamError:
            # Empty file: hachoir cannot build an input stream.
            rdata["size"] = 0
        except BaseException as e:
            logging.exception(e)
        finally:
            # Close the underlying input so file handles are not leaked.
            if parser and parser.stream and parser.stream._input and not parser.stream._input.closed:
                parser.stream._input.close()
    return rdata
def subfile(self, filePath):
    """Run hachoir-subfile over *filePath* and return self.parse() of its output.

    hachoir-subfile is a tool based on hachoir-parser to find subfiles in any
    binary stream. Website: http://bitbucket.org/haypo/hachoir/wiki/hachoir-subfile
    """
    # bypass sys.stdout, sys.stderr so SearchSubfile's console chatter is captured
    oldStdOut = sys.stdout
    oldStdErr = sys.stderr
    outputStdErr = StringIO.StringIO()
    outputStdOut = StringIO.StringIO()
    sys.stdout = outputStdOut
    sys.stderr = outputStdErr
    try:
        stream = FileInputStream(unicodeFilename(filePath), real_filename=filePath)
        # Search for subfiles
        subfile = SearchSubfile(stream, 0, None)
        subfile.loadParsers(categories=None, parser_ids=None)
        subfile.main()
    finally:
        # Bug fix: always restore the real stdout/stderr, even when hachoir
        # raises, so the process is not left with hijacked streams.
        sys.stdout = oldStdOut
        sys.stderr = oldStdErr
    # parse stdout, stderr from SearchSubfile
    return self.parse(outputStdOut.getvalue(), outputStdErr.getvalue())
def _parallel_task(work_queue, progress, args, run_stats):
    """Worker loop: pull source files off *work_queue* and partition them.

    Runs until the queue stays empty for QUEUE_TIMEOUT_SEC, then returns so
    the worker thread can exit cleanly. Per-file failures are logged and
    counted; the progress bar is updated best-effort after each file.
    """
    while True:
        try:
            src_file = work_queue.get(True, QUEUE_TIMEOUT_SEC)
        except Queue.Empty:
            # Bug fix: return instead of falling through — the original kept
            # looping and reached task_done() without a matching get(),
            # which raises ValueError and killed the worker.
            LOG.error("No more files to process. Exiting.")
            return
        try:
            Partition.handle_file(src_file, args.src_dir, args.dest_dir,
                                  not args.no_dry_run,
                                  args.flatten_subdirectories, run_stats)
            run_stats.count_success()
        except:
            LOG.exception("Unexpected error processing file %s: %s",
                          src_file, sys.exc_info()[0])
            run_stats.count_failure()
        finally:
            work_queue.task_done()
        try:
            # TODO: I think this may break with Unicode filenames
            progress.set_postfix(file=unicodeFilename(os.path.basename(src_file)),
                                 refresh=False)
            progress.update(1)
        except:
            LOG.exception("Error updating progress bar for source file %s: %s",
                          src_file, sys.exc_info()[0])
def get_file_date(root, file): date = "" try: filename = "{}/{}".format(root,file) filename, realname = unicodeFilename(filename), filename parser = createParser(filename, realname) if not parser: print >>stderr, "Unable to parse file {}".format(filename) try: actualstderr = sys.stderr sys.stderr = open(os.devnull,'w') metadata = extractMetadata(parser) sys.stderr = actualstderr except HachoirError, err: print "Metadata extraction error: %s" % unicode(err) metadata = None if not metadata: print "Unable to extract metadata, {}".format(filename) text = metadata.exportPlaintext() date = "" # Tracer()() for line in text: if line[0:10] == "- Creation": match = re.search('(\d+-\d+-\d+ \d+:\d+:\d+)', line) if match: date = time.strptime(match.groups()[0], '%Y-%m-%d %H:%M:%S') return date
def classify(path, rootdir):
    """Derive classification metadata for *path* under *rootdir*.

    NOTE(review): only the metadata-parsing preamble is visible here; the
    unused defaults below suggest the function continues beyond this chunk —
    confirm against the full file.
    """
    # add an extra argument here to take the root dir :)
    print 'path given: ', path, ' RootDir: ', rootdir
    foo = path.rsplit('/', 1)
    fname = foo[1]
    # defaults audio, video:
    artist = album = genre = 'unknown'
    # defaults image:
    latitude = longitude = 0
    city = state = country = 'unknown'
    year = '1960'
    month = 'January'
    # here we go :
    filename = path
    filename, realname = unicodeFilename(filename), filename
    parser = createParser(filename, realname)
    if not parser:
        print >>stderr, "Unable to parse file"
        exit(1)
    try:
        metadata = extractMetadata(parser)
    except HachoirError, err:
        print "Metadata extraction error: %s" % unicode(err)
        metadata = None
def lnkparse(reflectPath, filename):
    """Return the target filename from a MS-Windows link (URL format).

    Returns a (status, path) tuple; path is URL-quoted and non-empty only
    for status 'OK'.
    """
    filename = unicodeFilename(filename)
    try:
        parser = createParser(filename)
        if parser is not None and isinstance(parser, LnkFile):
            # It is a "MS-Windows" link file
            try:
                for field in parser:
                    pass  # trigger parsing
                lnkpath = parser.getField('relative_path').value
                # mount the complete target path, analyse if inside BasePath
                if lnkpath.startswith('.\\'):
                    lnkpath = lnkpath[2:]
                lnkpath = lnkpath.replace('\\', '/')
                filenamePath = os.path.dirname(filename)
                allLnkpath = os.path.join(reflectPath, filenamePath, lnkpath)
                allLnkpath = os.path.abspath(allLnkpath)  # remove all ..\
                if allLnkpath.startswith(reflectPath):
                    lnkpath = quote(lnkpath.encode('utf-8'))
                    return 'OK', lnkpath
                else:
                    # Target escapes the reflected base path.
                    return 'ERROR_OUTREFLECTPATH', ''
            except MissingField:
                # example: link to a network file
                return 'ERROR_RELPATH', ''
        else:
            return 'NOT_LNKFILE', ''
    except InputStreamError:
        return 'NOT_PARSED', ''
def hachm(filename): # using this example http://archive.org/details/WorkToFishtestwmv try: filename, realname = unicodeFilename(filename), filename except TypeError: filename,realname=filename,filename parser = createParser(filename) # See what keys you can extract tmp = metadata.extractMetadata(parser) if tmp is None: return {} else: tmp = tmp._Metadata__data.iteritems() for k,v in tmp: if v.values: print v.key, v.values[0].value # Turn the tags into a defaultdict metalist = metadata.extractMetadata(parser).exportPlaintext() meta = defaultdict(defaultdict) if not metalist: return meta for item in metalist[1:]: item = [x.strip() for x in item.split('-') if x.strip()][0] item = [ x.strip().lower().replace(' ','_') for x in item.split(':') ] k,v = item.pop(0),':'.join(item) meta[k]=v return meta
def get_metadata(file_names):
    """Extract hachoir metadata for every supported file in *file_names*.

    Each supported file's plaintext metadata is appended to results.txt.
    Returns the plaintext metadata lines of the last supported file
    processed, or None when no supported file was seen.
    """
    print ("- Analyzing files metadata.." + "\n")
    file_extensions = [".3do", ".3ds", ".7z", ".a", ".ace", ".aif", ".aifc", ".aiff", ".ani", ".apm", ".asf", ".au", ".avi", ".bin", ".bmp", ".bz2", ".cab", ".cda", ".chm", ".class", ".cur", ".deb", ".der", ".dll", ".doc", ".dot", ".emf", ".exe", ".flv", ".gif", ".gz", ".ico", ".jar", ".jpeg", ".jpg", ".laf", ".lnk", ".m4a", ".m4b", ".m4p", ".m4v", ".mar", ".mid", ".midi", ".mka", ".mkv", ".mod", ".mov", ".mp1", ".mp2", ".mp3", ".mp4", ".mpa", ".mpe", ".mpeg", ".mpg", ".msi", ".nst", ".oct", ".ocx", ".odb", ".odc", ".odf", ".odg", ".odi", ".odm", ".odp", ".ods", ".odt", ".ogg", ".ogm", ".otg", ".otp", ".ots", ".ott", ".pcf", ".pcx", ".pdf", ".png", ".pot", ".pps", ".ppt", ".ppz", ".psd", ".ptm", ".pyo", ".qt", ".ra", ".rar", ".rm", ".rpm", ".s3m", ".sd0", ".snd", ".so", ".stc", ".std", ".sti", ".stw", ".swf", ".sxc", ".sxd", ".sxg", ".sxi", ".sxm", ".sxw", ".tar", ".tga", ".tif", ".tiff", ".torrent", ".ts", ".ttf", ".vob", ".wav", ".wma", ".wmf", ".wmv", ".wow", ".xcf", ".xla", ".xls", ".xm", ".zip", ".zs1", ".zs2", ".zs3", ".zs4", ".zs5", ".zs6", ".zs7", ".zs8", ".zs9", ".zst"]
    text = None
    # Bug fix: use a with-block so results.txt is always closed — the
    # original returned from inside the loop, leaving file_.close()
    # unreachable; the extension test was also split/garbled.
    with open('results.txt', 'w') as file_:
        for filename in file_names:
            print ("- Extracting file metadata: " + filename + "\n")
            extension = os.path.splitext(filename)
            if extension[1] not in file_extensions:
                print (" * File extension is unknown or not supported" + "\n" + "\n")
                continue
            print (" * The file extension is: " + extension[1] + "\n")
            filename, realname = unicodeFilename(filename), filename
            file_.write('Name: ')
            file_.write(filename)
            file_.write('\n')
            parser = createParser(filename, realname)
            if not parser:
                print >>stderr, "Error, parsing file"
                exit(1)
            try:
                metadata = extractMetadata(parser)
            except Exception as e:
                print ("Error extracting file metadata: " + str(e))
                metadata = None
            if not metadata:
                print ("Metadata can not be extracted")
                exit(1)
            text = metadata.exportPlaintext()
            for line in text:
                file_.write(line)
                file_.write('\n')
            print (" * Successfull metadata extraction" + "\n" + "\n")
    return text
def classify(path, rootdir):
    """Derive classification metadata for *path* under *rootdir*.

    NOTE(review): only the metadata-parsing preamble is visible here; the
    unused defaults below suggest the function continues beyond this chunk —
    confirm against the full file.
    """
    # add an extra argument here to take the root dir :)
    print 'path given: ', path, ' RootDir: ', rootdir
    foo = path.rsplit('/', 1)
    fname = foo[1]
    # defaults audio, video:
    artist = album = genre = 'unknown'
    # defaults image:
    latitude = longitude = 0
    city = state = country = 'unknown'
    year = '1960'
    month = 'January'
    # here we go :
    filename = path
    filename, realname = unicodeFilename(filename), filename
    parser = createParser(filename, realname)
    if not parser:
        print >> stderr, "Unable to parse file"
        exit(1)
    try:
        metadata = extractMetadata(parser)
    except HachoirError, err:
        print "Metadata extraction error: %s" % unicode(err)
        metadata = None
def get_meta(self, file_path):
    """Get the meta information.

    Returns a (mime_type, json_info) tuple. When hachoir cannot parse the
    file, falls back to a mime type guessed from the extension with 'null'
    info.

    NOTE(review): if the parser is None AND none of the extension fallbacks
    match, execution reaches extractMetadata(None) — confirm whether all
    inputs are covered by check_extension().
    """
    self.check_extension(file_path)
    filename, realname = unicodeFilename(file_path), file_path
    parser = createParser(filename, realname)
    if parser is None:
        # Extension-based fallbacks for formats hachoir cannot parse.
        if file_path.lower().endswith('.mov'):
            return 'video/quicktime', 'null'
        if file_path.lower().endswith('.mpg'):
            return 'video/mpeg', 'null'
        if file_path.lower().endswith('.jpg'):
            return 'image/jpeg', 'null'
        if file_path.lower().endswith('.bup'):
            return 'video/dvd', 'null'
        if file_path.lower().endswith('.vob'):
            return 'video/dvd', 'null'
        if file_path.lower().endswith('.ifo'):
            return 'video/dvd', 'null'
    metadata = extractMetadata(parser)
    mime_type = parser.mime_type
    info = {}
    # Collect every multi-valued metadata entry as a list of display texts.
    for data in sorted(metadata or ()):
        if not data.values:
            continue
        info[data.key] = [item.text for item in data.values]
    return mime_type, json.dumps(info)
def _guess_from_metadata(self):
    """Guess episode info from the comment/title metadata of self.files.

    NOTE(review): `for l in line` iterates the *characters* of each
    plaintext line, so the `'comment' in l or 'title' in l` filter can never
    match a single character and `entries` is always empty — confirm whether
    `line` was meant to be a list of lines.
    """
    parse = lambda s: s.split(":")
    guesses = []
    for filename in self.files:
        filename = get_filename(filename)
        if not isinstance(filename, unicode):
            filename, realname = unicodeFilename(filename), filename
        else:
            realname = filename
        parser = createParser(filename, realname)
        if parser:
            try:
                metadata = extractMetadata(parser)
            except HachoirError:
                # Unreadable metadata: skip this file.
                continue
            for line in metadata.exportPlaintext():
                entries = dict((parse(normalize(l)) for l in line if 'comment' in l or 'title' in l))
                entries = dict(((k, guessit.guess_episode_info(v)) for (k, v) in entries.items()))
                if 'title' in entries:
                    guesses.append(entries['title'])
                elif 'comment' in entries:
                    guesses.append(entries['comment'])
    return guesses
def googlesearch():
    """Google-dork the target domain for office documents and store metadata.

    For each filetype (doc/docx/ppt/xls) runs a Google search restricted to
    the module-level `domain`, downloads each result, extracts hachoir
    metadata, and inserts (sid, type, name, metadata) rows into the
    `metadata` table via the module-level `executor`/`connection`.
    """
    print "Searching google for files..."
    # set up browser
    browse = mechanize.Browser()
    cookiejar = cookielib.LWPCookieJar()
    browse.set_cookiejar(cookiejar)
    browse.set_handle_equiv(True)
    browse.set_handle_redirect(True)
    browse.set_handle_referer(True)
    browse.set_handle_robots(False)
    browse.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
    browse.addheaders = [
        (
            "User-agent",
            "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1",
        )
    ]
    # response = browse.open("https://www.google.com/#q=filetype: %s + %s" % (filetype, domain))
    for filetype in ["doc", "docx", "ppt", "xls"]:
        response = browse.open("https://www.google.com")
        browse.select_form(nr=0)
        browse.form["q"] = "filetype:%s site:%s" % (filetype, domain)
        browse.submit()
        results = browse.response().read()
        soup = BeautifulSoup(results, "lxml")
        sidlist = []
        namelist = []
        typelist = []
        metalist = []
        counter = 1
        # Google result links look like /url?q=<real-url>...
        for link in soup.find_all("a", href=re.compile("/url")):
            link = link.get("href")
            if link.startswith("/url?q="):
                link = link[len("/url?q=") :]
                link = link.split("." + filetype)[0]
                # print str(link + ".pdf")
                filename = "%s%s.%s" % (domain, counter, filetype)
                try:
                    downfile = browse.retrieve(str(link + "." + filetype), filename)[0]
                    filename = downfile
                    filename, realname = unicodeFilename(filename), filename
                    parser = createParser(filename, realname)
                    metadata = extractMetadata(parser)
                    text = metadata.exportPlaintext()
                    charset = getTerminalCharset()
                    sidlist.append(sid)
                    typelist.append(str(filetype))
                    namelist.append(str(filename))
                    metalist.append(str(text))
                    counter += 1
                except:
                    # Best-effort: skip results that fail to download or parse.
                    pass
        for meta in zip(sidlist, typelist, namelist, metalist):
            executor.execute("INSERT INTO metadata VALUES (?,?,?,?)", meta)
        # for line in text:
        #     print makePrintable(line, charset)
        connection.commit()
def extract_title(filename):
    """Return the 'title' metadata entry for *filename*, or None when absent.

    Bug fix: guard against an unparsable file and missing metadata instead
    of crashing with an AttributeError on None; the unused plaintext export
    was also dropped.
    """
    filename, realname = unicodeFilename(filename), filename
    parser = createParser(filename, realname)
    if not parser:
        return None
    metadata = extractMetadata(parser)
    if not metadata:
        return None
    return metadata.get('title')
def getData(self):
    """Parse self.filename with hachoir and extract its metadata.

    NOTE(review): no return statement is visible in this chunk — the method
    may continue beyond it; as shown, `metadata` is computed and discarded.
    """
    filename, realname = unicodeFilename(self.filename), self.filename
    parser = createParser(filename, realname)
    try:
        metadata = extractMetadata(parser)
    except HachoirError, err:
        print "Metadata extraction error: %s" % unicode(err)
        metadata = None
def getMetadata(filename): filename, realname = unicodeFilename(filename), filename parser = createParser(filename, realname) if not parser: print "Unable to parse file" exit(1) metadata = extractMetadata(parser) return metadata
def file_mimetype(filename):
    """Return the mimetype of *filename* via Cigma, or None when unmatched.

    NOTE(review): the hachoir fallback below appears unreachable for truthy
    filenames (it follows an unconditional return) and would receive a falsy
    filename otherwise — confirm intended control flow.
    """
    if filename and filename != "":
        result = Cigma().identify(filename=filename)
        return result["match"]["mimetype"] if result["match"] else None
    parser = createParser(unicodeFilename(filename), filename)
    return { "mimetype": str(parser.mime_type) } if parser else { "mimetype": "text/plain" }
def __get_hd_tag__(self, video):
    """Map the video's pixel width to an HD tag.

    Returns 404 for width 1280, 1604 for width 1920, and the default 104
    otherwise or when the file cannot be parsed.
    """
    result = 104
    parser = createParser(unicodeFilename(video))
    # Bug fix: an unparsable file made extractMetadata/.get() blow up on
    # None; treat it as "no HD information" and keep the default tag.
    file_metadata = extractMetadata(parser) if parser else None
    if file_metadata:
        if file_metadata.get('width') == 1280:
            result = 404
        elif file_metadata.get('width') == 1920:
            result = 1604
    return result
def get_hd_tag(self, video):
    """Map the video's pixel width to an HD tag: 0 unknown, 1 = 1280, 2 = 1920."""
    result = 0
    try:
        file_metadata = extractMetadata(createParser(unicodeFilename(video)))
        if file_metadata.get('width') == 1280:
            result = 1
        elif file_metadata.get('width') == 1920:
            result = 2
    finally:
        # `return` inside finally deliberately swallows any parsing
        # exception and yields the default/partial result.
        return result
def extract_metadata(self, file):
    """Extract hachoir metadata from an open *file* object.

    Raises:
        MetadataException: wraps any HachoirError/TypeError from parsing.
    """
    # Lift hachoir's string-length cap so long tag values are not truncated.
    config.MAX_STR_LENGTH = float("inf")
    try:
        filename = file.name
        if not isinstance(filename, unicode):
            filename = unicodeFilename(filename)
        # Wrap the already-open file object instead of re-opening by path.
        stream = InputIOStream(file, source="file:%s" % filename, tags=[], filename=filename)
        parser = guessParser(stream)
        return extractMetadata(parser)
    except (HachoirError, TypeError) as e:
        raise MetadataException(e)
def extractInicioEFimDoVideo(filename):
    """Extract hachoir metadata for a video file (exits when parsing fails).

    (Portuguese name: "extract start and end of the video".)
    NOTE(review): no return is visible in this chunk — the function may
    continue beyond it.
    """
    filename, realname = unicodeFilename(filename), filename
    parser = createParser(filename, realname)
    if not parser:
        print >> stderr, "Falha ao converter arquivo."
        exit(1)
    try:
        metadata = extractMetadata(parser)
    except HachoirError, err:
        print "Falha na extração de metadado do arquivo: %s" % unicode(err)
        metadata = None
def metadata_for(filename): filename, realname = unicodeFilename(filename), filename parser = createParser(filename, realname) if not parser: print "Unable to parse file" exit(1) try: metadata = extractMetadata(parser) except HachoirError, err: print "Metadata extraction error: %s" % unicode(err) metadata = None
def loadMetadata(self):
    """Load the metadata, either using Hachoir, ... either using mplayer"""
    # Only attempt extraction when a video file has been selected.
    if len(self.videoFile) != 0:
        filename = OP.join(self.videoPath, self.videoFile)
        filename, realname = unicodeFilename(filename), filename
        myParser = createParser(filename, realname)
        try:
            self.metadata = extractMetadata(myParser)
        except HachoirError, err:
            print "Metadata extraction error: %s" % unicode(err)
            self.metadata = None
def metadata_for_video(filename): filename, realname = unicodeFilename(filename), filename parser = createParser(filename, realname) if not parser: print "Unable to parse file" exit(1) try: metadata = extractMetadata(parser) except HachoirError, err: print "Metadata extraction error: %s" % unicode(err) metadata = None
def processFile(filename, quality=0.5):
    """Create a hachoir parser for *filename* (quality unused in this chunk).

    Returns False when the input stream cannot be opened.
    NOTE(review): no success-path return is visible here — the function
    presumably continues beyond this chunk.
    """
    charset = getTerminalCharset()
    # filename, real_filename = unicode(filename, charset), filename
    filename, real_filename = unicodeFilename(filename, charset), filename
    # Create parser
    try:
        tags = None
        parser = createParser(filename, real_filename=real_filename, tags=tags)
    except InputStreamError, err:
        error(unicode(err))
        return False
def getMetadata(filename):
    """Return hachoir metadata for *filename* as plaintext lines, or None."""
    filename, realname = unicodeFilename(filename), filename
    parser = createParser(filename, realname)
    try:
        metadata = extractMetadata(parser)
    except Exception:
        # Bug fix: narrowed from a bare `except:` so SystemExit and
        # KeyboardInterrupt are no longer swallowed.
        return None
    if metadata is not None:
        return metadata.exportPlaintext()
    return None
def parse(self):
    """Parse self.filename with hachoir and extract its metadata.

    NOTE(review): "/n" in the stderr message looks like a typo for "\\n" —
    left as-is because it is runtime output.
    """
    filename, realname = unicodeFilename(self.filename), self.filename
    parser = hachoir_parser.createParser(filename, realname)
    if not parser:
        sys.stderr.write("Unable to parse file %s/n" % self.filename)
        return
    try:
        ## TODO: this call emits a warning when there is no GPS data
        metadata = hachoir_metadata.extractMetadata(parser)
    except HachoirError, err:
        print "Metadata extraction error: %s" % unicode(err)
        metadata = None
def processFile(self, filename):
    """Parse one file and extract its metadata (None on failure).

    NOTE(review): no success-path return is visible in this chunk — the
    method presumably continues beyond it.
    """
    filename, realname = unicodeFilename(filename), filename
    print u"[%s] Process file %s..." % (self.total, filename)
    parser = createParser(filename, realname)
    if not parser:
        print >> stderr, "Unable to parse file"
        return None
    try:
        metadata = extractMetadata(parser)
    except HachoirError, err:
        print >> stderr, "Metadata extraction error: %s" % unicode(err)
        return None
def parse(self):
    """Parse self.filename with hachoir and extract its metadata.

    NOTE(review): "/n" in the stderr message looks like a typo for "\\n" —
    left as-is because it is runtime output.
    """
    filename, realname = unicodeFilename(self.filename), self.filename
    parser = hachoir_parser.createParser(filename, realname)
    if not parser:
        sys.stderr.write("Unable to parse file %s/n"%self.filename)
        return
    try:
        ## TODO: this call emits a warning when there is no GPS data
        metadata = hachoir_metadata.extractMetadata(parser)
    except HachoirError, err:
        print "Metadata extraction error: %s" % unicode(err)
        metadata = None
def get_movie_metadata(self, filename): filename, realname = unicodeFilename(filename), filename # parser = createParser(filename, realname) parser = createParser(filename, filename) if not parser: print >> stderr, "Unable to parse file" exit(1) try: metadata = extractMetadata(parser) except HachoirError, err: print "Metadata extraction error: %s" % unicode(err) metadata = None
def processFile(self, filename):
    """Parse one file and extract its metadata (None on failure).

    NOTE(review): no success-path return is visible in this chunk — the
    method presumably continues beyond it.
    """
    filename, realname = unicodeFilename(filename), filename
    print u"[%s] Process file %s..." % (self.total, filename)
    parser = createParser(filename, realname)
    if not parser:
        print >>stderr, "Unable to parse file"
        return None
    try:
        metadata = extractMetadata(parser)
    except HachoirError, err:
        print >>stderr, "Metadata extraction error: %s" % unicode(err)
        return None
def search(self, file_path, strings=None):
    """Search *file_path*'s raw bytes for any of *strings*.

    Returns False when the file is empty/unreadable, otherwise the
    PatternMatching search result over the whole file content.
    """
    try:
        self.stream = FileInputStream(unicodeFilename(file_path), real_filename=file_path)
    except NullStreamError:
        # Empty file: nothing to search.
        return False
    patterns = PatternMatching()
    for s in strings:
        patterns.addString(s)
    start = 0
    end = self.stream.size
    # hachoir stream sizes are in bits; readBytes takes a byte count.
    self.data = self.stream.readBytes(start, end//8)
    return patterns.search(self.data)
def main(filename="default_64.png"):
    """Debug helper: parse *filename*, dump its fields and metadata."""
    filename, realname = unicodeFilename(filename), filename
    p = hachoir_parser.createParser(filename, realname)
    # NOTE(review): _fields is a private hachoir attribute; debugging only.
    print "Fields: ", p._fields
    metadata = extractMetadata(p)
    print_metadata(metadata)
    fields = p._fields
    print "fields: ", fields
    for i in print_recursively(fields):
        print i
def metadata_for(filename):
    """Print a banner and extract hachoir metadata for *filename*.

    Exits the process when the file cannot be parsed.
    NOTE(review): no return is visible in this chunk — the function may
    continue beyond it.
    """
    print '\nprinting metadata...\n'
    filename, realname = unicodeFilename(filename), filename
    parser = createParser(filename, realname)
    if not parser:
        print("Unable to parse file")
        exit(1)
    try:
        metadata = extractMetadata(parser)
    except HachoirError, err:
        print("Metadata extraction error: %s" % unicode(err))
        metadata = None
def __init__(self, filename, source_directory, destination_directory):
    """Initialise a VideoFile; derive date_created from movie metadata if unset.

    When the superclass left date_created empty, the creation date is pulled
    from the movie's hachoir metadata, converted UTC -> America/Los_Angeles,
    and used to extend destination_directory with a YYYY-MM-DD/ folder.
    """
    # Call the Constructor of the super class
    super(VideoFile, self).__init__(filename, source_directory, destination_directory)
    if self.date_created == "":
        # Set the timezone data for processing the movie files
        from_zone = tz.gettz('UTC')
        to_zone = tz.gettz('America/Los_Angeles')
        filename, realname = unicodeFilename((self.source_directory + "/" + self.filename)), (self.source_directory + "/" + self.filename)
        parser = createParser(filename, realname)
        if not parser:
            print "ERROR... unable to parse file!"
        else:
            try:
                metadata = extractMetadata(parser)
            except (HachoirError, err):
                # NOTE(review): `except (HachoirError, err)` looks like a
                # mistranscribed py2 `except HachoirError, err` — `err` would
                # be undefined here; confirm against the original file.
                print "Metadata extraction error: %s" % unicode(err)
                metadata = None
            if not metadata:
                print "Unable to extract metadata"
            else:
                text = metadata.exportPlaintext()
                for line in text:
                    # print line
                    # Plaintext lines look like "- Creation date: YYYY-MM-DD HH:MM:SS".
                    current_line = str(line)[2:15]
                    movie_creation_date_and_time_utc = str(line)[17:len(line)]
                    if current_line == "Creation date":
                        # print "Current line: %s" % current_line
                        # print "Found match... %s" % movie_creation_date_and_time_utc
                        # Process the time extracted from the movie file by converting from
                        # UTC time (Greenwich Mean Time) to the Pacific time zone
                        utc = datetime.strptime(movie_creation_date_and_time_utc, '%Y-%m-%d %H:%M:%S')
                        utc = utc.replace(tzinfo=from_zone)
                        movie_creation_date_and_time_pacific = utc.astimezone(to_zone)
                        # print "Time/Date: %s" % movie_creation_date_and_time_pacific
                        # Extract the date from the processed Pacific time
                        movie_creation_date = str(movie_creation_date_and_time_pacific)[0:10]
                        self.date_created = movie_creation_date
                        self.destination_directory += self.date_created + '/'
                        # print "Video created on: %s" % self.date_created
                        # print "Video dest dir: %s" % self.destination_directory
    else:
        self.destination_directory += self.date_created + '/'
def get_metadata(path):
    """Parse *path* with hachoir and return its metadata.

    Raises:
        ValueError: when the file cannot be parsed, extraction raises a
            HachoirError, or no metadata is present.
    """
    parser = createParser(unicodeFilename(path), path)
    if not parser:
        raise ValueError('Unable to parse %r' % path)
    try:
        extracted = extractMetadata(parser)
    except HachoirError as e:
        raise ValueError('Metadata extraction error: %s' % e)
    if extracted:
        return extracted
    raise ValueError('Unable to extract metadata for %r' % path)
def get_hachoir_create_date(fname):
    """Get media create date using hachoir library.

    NOTE(review): only the parse/extract preamble is visible in this chunk —
    the creation-date lookup and final return presumably follow it.
    """
    global log
    retval = None
    filename, realname = unicodeFilename(fname), fname
    parser = createParser(filename, realname)
    if not parser:
        log.critical( 'Unable to parse file ' + fname)
        return retval
    try:
        metadata = extractMetadata(parser)
    except HachoirError, err:
        log.critical( 'Metadata extraction error for ' + fname + ' - '+ unicode(err))
        metadata = None
def get_meta(filename):
    """Return hachoir metadata for *filename*, or None when unparsable."""
    from hachoir_core.error import HachoirError
    from hachoir_core.cmd_line import unicodeFilename
    from hachoir_parser import createParser
    from hachoir_core.tools import makePrintable
    from hachoir_metadata import extractMetadata
    from hachoir_core.i18n import getTerminalCharset
    uni_name, real_name = unicodeFilename(filename), filename
    parser = createParser(uni_name, real_name)
    if parser:
        return extractMetadata(parser)
    print >>sys.stderr, "{}: Unable to parse file".format(uni_name)
    return None
def processFileReturn(filename, display_filename=False, priority=None, human=True, display=True): charset = getTerminalCharset() # filename, real_filename = unicode(filename, charset), filename if type(filename) == str: filename, real_filename = unicodeFilename(filename, charset), filename else: real_filename = filename.encode(getTerminalCharset()) try: parser = createParser(filename, real_filename=real_filename, tags=None) except InputStreamError, err: error(unicode(err)) try: del(parser) except: pass return False
def convert_gzip_files(self, path, hash_value):
    """Carve flow response captures and gunzip gzip-encoded HTTP bodies.

    For each detail of the Flow matching *hash_value*, extracts subfiles
    from the contents_<flow>_resp.dat capture into <path>/html-files/<flow>,
    then rewrites gzip-encoded response bodies as .html files. Returns True
    on success, False (with the exception printed) on any error.
    """
    try:
        flow = Flow.objects.get(hash_value=hash_value)
        flow_details = flow.details
        for detail in flow_details:
            # create the orig file ex: contents_192.168.1.5:42825-62.212.84.227:80_resp.dat
            source_str = ":".join([detail.src_ip, str(detail.sport)])
            destination_str = ":".join([detail.dst_ip, str(detail.dport)])
            flow_str = "-".join([source_str, destination_str])
            resp_file = "_".join(["contents", flow_str, "resp.dat"])
            file_path = "/".join([path, resp_file])
            # path is created as unicode, convert it a regular string for hachoir operation
            file_path = str(file_path)
            try:
                stream = FileInputStream(unicodeFilename(file_path), real_filename=file_path)
            except NullStreamError:
                # Empty capture: skip this flow detail.
                continue
            subfile = SearchSubfile(stream, 0, None)
            subfile.loadParsers()
            root = "/".join([path, "html-files"])
            if not os.path.exists(root):
                os.makedirs(root)
            output = "/".join([root, flow_str])
            output = str(output)
            subfile.setOutput(output)
            # Only HTTP responses belonging to this flow detail.
            http_details = filter(lambda x: x.flow_details.id == detail.id, HTTPDetails.objects.filter(http_type="response"))
            file_ext = ".txt"
            for http in http_details:
                if http.content_type:
                    filename = subfile.output.createFilename(file_ext)
                    if http.content_encoding == "gzip":
                        r = open("/".join([output, filename]), "r")
                        body = r.read()
                        r.close()
                        data = StringIO.StringIO(body)
                        gzipper = gzip.GzipFile(fileobj=data)
                        html = gzipper.read()
                        # Re-save the decompressed body under a .html name.
                        filename = filename.split(".")[0] + ".html"
                        w = open("/".join([output, filename]), "w")
                        w.write(html)
                        w.close()
        return True
    except Exception, ex:
        print ex
        return False
def file_metadata(filename):
    """Parse hachoir plaintext metadata for *filename* into a plain dict.

    Returns the "Metadata" section as a {tag: value} dict, or {} when no
    metadata could be extracted. Returns None (implicitly) for falsy names.

    NOTE(review): `k` is only bound by a header line (ending with ':'); a
    tag line arriving first would raise NameError — exportPlaintext appears
    to always start with a header, but confirm.
    """
    if filename and filename != "":
        parser = createParser(unicodeFilename(filename), filename)
        meta = metadata.extractMetadata(parser) if parser else None
        metalist = meta.exportPlaintext() if meta else []
        meta = collections.defaultdict(collections.defaultdict)
        for item in metalist:
            if item.endswith(":"):
                # Section header line, e.g. "Metadata:".
                k = item[:-1]
            else:
                # "- tag: value" line; strip the leading "- ".
                tag, value = item.split(": ", 1)
                tag = tag[2:]
                meta[k][tag] = value
        return unicode_to_string( default_to_regular(meta))["Metadata"] if meta else {}
def get_hachoir_create_date(fname):
    """Get media create date using hachoir library.

    NOTE(review): only the parse/extract preamble is visible in this chunk —
    the creation-date lookup and final return presumably follow it.
    """
    global log
    retval = None
    filename, realname = unicodeFilename(fname), fname
    parser = createParser(filename, realname)
    if not parser:
        log.critical('Unable to parse file ' + fname)
        return retval
    try:
        metadata = extractMetadata(parser)
    except HachoirError, err:
        log.critical('Metadata extraction error for ' + fname + ' - ' + unicode(err))
        metadata = None
def _read_exif_hachoir(file_name):
    """Return {'creation_date': str} from hachoir metadata, {} when absent.

    NOTE(review): the HachoirError path only logs and falls off the end,
    returning None rather than a dict — callers must tolerate that.
    """
    try:
        filename, realname = unicodeFilename(file_name), file_name
        parser = createParser(filename, realname)
        metadata = extractMetadata(parser)
        if metadata and metadata.has('creation_date'):
            exif = {}
            exif['creation_date'] = str(metadata.get('creation_date'))
            return exif
        else:
            LOG.warn('File %s did not have creation_date' % file_name)
            return {}
    except HachoirError, err:
        LOG.exception("Metadata extraction error: %s", unicode(err))
def processFile(values, filename, display_filename=False, priority=None, human=True, display=True):
    """Create a hachoir parser for *filename*, honouring values.force_parser.

    Returns False when the input stream cannot be opened.
    NOTE(review): the success path is not visible in this chunk — the
    function presumably continues past the error handler.
    """
    charset = getTerminalCharset()
    filename, real_filename = unicodeFilename(filename, charset), filename
    # Create parser
    try:
        if values.force_parser:
            # Pin the parser id instead of letting hachoir autodetect.
            tags = [("id", values.force_parser), None]
        else:
            tags = None
        parser = createParser(filename, real_filename=real_filename, tags=tags)
    except InputStreamError, err:
        error(unicode(err))
        return False
def downloadBINARY(url): ########################################################### # USE TO DOWNLOAD A BINARY FILE LIKE DOC OR XLS # # INPUT: the url of the file. # # OUTPUT: the hex of the file, and list of some metadata, # # from the server and from a hachoir_metadata scan # # SAVES FILE TO: downloaded_docs/doc, or xls/filename # ########################################################### infoMeta=[] file_name = url.split('/')[-1] file_type = file_name.split(".")[-1] base_dir = os.path.abspath("../../../downloaded_docs/") download_dir = os.path.join(base_dir, file_type) infoMeta.append(file_type) infoMeta.append(file_name) u = urllib2.urlopen(url) meta = u.info() infoMeta.append(meta.headers) doc= u.read() f = open(os.path.join(download_dir,file_name), 'wb') f.write(doc) with open(os.path.join(download_dir,file_name), 'rb') as p: # Slurp the whole file and convert it to hex all at once hexdata = binascii.hexlify(p.read()) # use hachoir to add the standard metadata filename = download_dir+ '/'+file_name print filename # filename = unicodeFilename(filename), filename filename, realname = unicodeFilename(filename), filename parser = createParser(filename) try: metalist = metadata.extractMetadata(parser).exportPlaintext() infoMeta.append(metalist[1:4]) except Exception: infoMeta.append(["none","none","none"]) p.close() print "Done", file_name, " Saved to: ", download_dir return hexdata, infoMeta
def processFileReturn(filename, display_filename=False, priority=None, human=True, display=True): charset = getTerminalCharset() # filename, real_filename = unicode(filename, charset), filename if type(filename) == str: filename, real_filename = unicodeFilename(filename, charset), filename else: real_filename = filename.encode(getTerminalCharset()) try: parser = createParser(filename, real_filename=real_filename, tags=None) except InputStreamError, err: error(unicode(err)) try: del (parser) except: pass return False
def getmeta(tempfile): try: filename = tempfile filename, realname = unicodeFilename(filename), filename parser = createParser(filename, realname) if not parser: print >> stderr, "Unable to parse file" return "error" try: metadata = extractMetadata(parser) except HachoirError, err: print "Metadata extraction error: %s" % unicode(err) metadata = None if not metadata: print "Unable to extract metadata" return "error" text = metadata.exportPlaintext() charset = getTerminalCharset() return text
def extract(self, file): ''' This function use to extract meta data from a file. We use hachoir3 library to extract them. (See more: http://hachoir3.readthedocs.org) :param : file - file for extract :return: meta data as dict for success, 0 if fail. ''' try: filename, realname = unicodeFilename(file), file parser = createParser(filename, realname) meta_data = extractMetadata(parser) meta_data_text = meta_data.exportPlaintext() meta_list = dict() for i in range(1, len(meta_data_text)): meta_split = meta_data_text[i].split(":") column = meta_split[0].replace('- ', '') value = meta_split[1].lstrip() meta_list.update({column: value}) return meta_list except: if self.debug: print "Something went wrong, meta data of", file, "could not extract." return None
def main(self):
    """Extract JPEG pictures and sounds from a (possibly compressed) SWF file.

    Usage: takes the .swf path as the single command-line argument.
    """
    if len(argv) != 2:
        print >>stderr, "usage: %s document.swf" % argv[0]
        exit(1)
    realname = argv[1]
    filename = unicodeFilename(realname)
    parser = createParser(filename, real_filename=realname)
    if parser["signature"].value == "CWS":
        # 'CWS' signature = zlib-compressed SWF; re-parse the inflated stream.
        deflate_swf = parser["compressed_data"].getSubIStream()
        parser = guessParser(deflate_swf)
    if "jpg_table/data" in parser:
        # JPEG pictures with common header
        jpeg_header = parser["jpg_table/data"].value[:-2]
        for field in parser.array("def_bits"):
            jpeg_content = field["image"].value[2:]
            if self.verbose:
                print "Extract JPEG from %s" % field.path
            self.storeJPEG(jpeg_header + jpeg_content)
    # JPEG in format 2/3
    for field in parser.array("def_bits_jpeg2"):
        self.extractFormat2(field)
    for field in parser.array("def_bits_jpeg3"):
        self.extractFormat2(field)
    # Extract sound
    #self.extractSound(parser)
    self.extractSound2(parser)
    # Does it extract anything?
    if self.jpg_index == 1:
        print "No JPEG picture found."
    if self.snd_index == 1:
        print "No sound found."