def processFileReturn(filename, display_filename=False, priority=None, human=True, display=True):
    """Create a parser for *filename*, reporting failures through error().

    filename may be a byte string (decoded with the terminal charset) or a
    unicode string (encoded with the terminal charset to get the on-disk
    name). Returns False when createParser() raises InputStreamError.
    The other parameters are not used by the visible part of the function.
    """
    charset = getTerminalCharset()
    if isinstance(filename, str):
        filename, real_filename = unicodeFilename(filename, charset), filename
    else:
        real_filename = filename.encode(charset)
    try:
        parser = createParser(filename, real_filename=real_filename, tags=None)
    except InputStreamError as err:
        # "parser" is never bound when createParser() raises, so there is
        # nothing to delete on this path.
        error(unicode(err))
        return False
def main(): if len(sys.argv) != 2: print >>sys.stderr, "usage: %s directory" % sys.argv[0] sys.exit(1) charset = getTerminalCharset() directory = unicode(sys.argv[1], charset) print "Download and check Hachoir testcase." print print "Use directory: %s" % directory ok = testFiles(directory, TESTCASE_URL) if not stringMD5("abc"): print for index in xrange(3): print "!!! Warning: Python module md5 is missing, unable to check MD5 hash" if ok: print totalsize = sum( item[1] for item in testcase_files ) print "Test case is ok (%s files, %s)" % (len(testcase_files), humanFilesize(totalsize)) sys.exit(0) else: print for index in xrange(3): print "!!! ERROR !!!" print sys.exit(1)
def unicodeFilename(filename, charset=None):
    """Decode *filename* to unicode.

    Uses *charset*, or the terminal charset when none is given. If the name
    cannot be decoded, the result of makePrintable(..., to_unicode=True)
    is returned instead.
    """
    charset = charset or getTerminalCharset()
    try:
        decoded = unicode(filename, charset)
    except UnicodeDecodeError:
        decoded = makePrintable(filename, charset, to_unicode=True)
    return decoded
def main(): setlocale(LC_ALL, "C") if len(sys.argv) != 2: print >>sys.stderr, "usage: %s testcase_directory" % sys.argv[0] sys.exit(1) charset = getTerminalCharset() directory = unicode(sys.argv[1], charset) print "Test hachoir-parser using random data." print if not testRandom(): print print "If you are really sure there is no error in your code," \ " increment the 'seed' parameter of testRandom." sys.exit(1) print "Result: ok" print print "Test hachoir-parser using testcase." print print "Testcase is in directory: %s" % directory if not testFiles(directory): print for index in xrange(3): print "!!! ERROR !!!" print sys.exit(1) print print "Result: ok for the %s files" % len(testcase_files)
def main(): setlocale(LC_ALL, "C") if len(sys.argv) != 2: print >> sys.stderr, "usage: %s testcase_directory" % sys.argv[0] sys.exit(1) charset = getTerminalCharset() directory = unicode(sys.argv[1], charset) print "Test hachoir-parser using random data." print if not testRandom(): print print "If you are really sure there is no error in your code," \ " increment the 'seed' parameter of testRandom." sys.exit(1) print "Result: ok" print print "Test hachoir-parser using testcase." print print "Testcase is in directory: %s" % directory if not testFiles(directory): print for index in xrange(3): print "!!! ERROR !!!" print sys.exit(1) print print "Result: ok for the %s files" % len(testcase_files)
def main(): if len(sys.argv) != 2: print >> sys.stderr, "usage: %s directory" % sys.argv[0] sys.exit(1) charset = getTerminalCharset() directory = unicode(sys.argv[1], charset) print "Download and check Hachoir testcase." print print "Use directory: %s" % directory ok = testFiles(directory, TESTCASE_URL) if not stringMD5("abc"): print for index in xrange(3): print "!!! Warning: Python module md5 is missing, unable to check MD5 hash" if ok: print totalsize = sum(item[1] for item in testcase_files) print "Test case is ok (%s files, %s)" % (len(testcase_files), humanFilesize(totalsize)) sys.exit(0) else: print for index in xrange(3): print "!!! ERROR !!!" print sys.exit(1)
def FileInputStream(filename, real_filename=None, **args):
    """
    Create an input stream of a file. filename must be unicode.

    real_filename is an optional argument used to specify the real filename,
    its type can be 'str' or 'unicode'. Use real_filename when you are
    not able to convert filename to real unicode string (ie. you have to
    use unicode(name, 'replace') or unicode(name, 'ignore')).

    Optional keyword arguments "offset" and "size" select a sub-stream; both
    are multiplied by 8 before being handed to InputSubStream. Without them
    a ("filename", filename) tag is added and an InputIOStream is returned.
    Remaining keyword arguments are forwarded to the stream constructors.

    Raises InputStreamError if the file cannot be opened.
    """
    assert isinstance(filename, unicode)
    if not real_filename:
        real_filename = filename
    try:
        inputio = FileOpener(real_filename, 'rb')
    except IOError as err:
        charset = getTerminalCharset()
        # "replace" keeps an undecodable OS message from raising
        # UnicodeDecodeError and hiding the real open failure.
        errmsg = unicode(str(err), charset, "replace")
        raise InputStreamError(
            _("Unable to open file %s: %s") % (filename, errmsg))
    source = "file:" + filename
    offset = args.pop("offset", 0)
    size = args.pop("size", None)
    if offset or size:
        if size:
            size = 8 * size
        stream = InputIOStream(inputio, source=source, **args)
        return InputSubStream(stream, 8 * offset, size, **args)
    # Work on a copy so the caller's tag list is not modified, and accept
    # an explicit tags=None.
    tags = list(args.get("tags") or [])
    tags.append(("filename", filename))
    args["tags"] = tags
    return InputIOStream(inputio, source=source, **args)
def googlesearch(): print "Searching google for files..." # set up browser browse = mechanize.Browser() cookiejar = cookielib.LWPCookieJar() browse.set_cookiejar(cookiejar) browse.set_handle_equiv(True) browse.set_handle_redirect(True) browse.set_handle_referer(True) browse.set_handle_robots(False) browse.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1) browse.addheaders = [ ( "User-agent", "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1", ) ] # response = browse.open("https://www.google.com/#q=filetype: %s + %s" % (filetype, domain)) for filetype in ["doc", "docx", "ppt", "xls"]: response = browse.open("https://www.google.com") browse.select_form(nr=0) browse.form["q"] = "filetype:%s site:%s" % (filetype, domain) browse.submit() results = browse.response().read() soup = BeautifulSoup(results, "lxml") sidlist = [] namelist = [] typelist = [] metalist = [] counter = 1 for link in soup.find_all("a", href=re.compile("/url")): link = link.get("href") if link.startswith("/url?q="): link = link[len("/url?q=") :] link = link.split("." + filetype)[0] # print str(link + ".pdf") filename = "%s%s.%s" % (domain, counter, filetype) try: downfile = browse.retrieve(str(link + "." + filetype), filename)[0] filename = downfile filename, realname = unicodeFilename(filename), filename parser = createParser(filename, realname) metadata = extractMetadata(parser) text = metadata.exportPlaintext() charset = getTerminalCharset() sidlist.append(sid) typelist.append(str(filetype)) namelist.append(str(filename)) metalist.append(str(text)) counter += 1 except: pass for meta in zip(sidlist, typelist, namelist, metalist): executor.execute("INSERT INTO metadata VALUES (?,?,?,?)", meta) # for line in text: # print makePrintable(line, charset) connection.commit()
def print_metadata(metadata): text = metadata.exportPlaintext() charset = getTerminalCharset() for line in text: pass # print makePrintable(line, charset) # from http://stackoverflow.com/questions/14546533/hachoir-retrieving-data-from-a-group # See what keys you can extract for k, v in metadata._Metadata__data.iteritems(): if v.values: print v.key, v.values[0].value
def processFileReturn(filename, display_filename=False, priority=None, human=True, display=True):
    """Create a parser for *filename*, reporting failures through error().

    filename may be a byte string (decoded with the terminal charset) or a
    unicode string (encoded with the terminal charset to get the on-disk
    name). Returns False when createParser() raises InputStreamError.
    The other parameters are not used by the visible part of the function.
    """
    charset = getTerminalCharset()
    if isinstance(filename, str):
        filename, real_filename = unicodeFilename(filename, charset), filename
    else:
        real_filename = filename.encode(charset)
    try:
        parser = createParser(filename, real_filename=real_filename, tags=None)
    except InputStreamError as err:
        # "parser" is never bound when createParser() raises, so there is
        # nothing to delete on this path.
        error(unicode(err))
        return False
def processFile(filename, quality=0.5):
    """Create a parser for *filename*.

    Returns False, after reporting through error(), when createParser()
    raises InputStreamError.
    """
    charset = getTerminalCharset()
    real_filename = filename
    filename = unicodeFilename(real_filename, charset)

    # Create parser
    try:
        tags = None
        parser = createParser(filename, real_filename=real_filename, tags=tags)
    except InputStreamError as err:
        error(unicode(err))
        return False
class metaMs2k: def __init__(self,filename): self.filename=filename self.users=[] self.paths=[] self.software=[] self.modification=[] self.creationDate=[] self.lastPrinted=[] self.raw="" def getData(self): filename, realname = unicodeFilename(self.filename), self.filename try: parser = createParser(filename, realname) except: return "error" try: metadata = extractMetadata(parser) except HachoirError, err: print "Metadata extraction error: %s" % unicode(err) metadata = None if not metadata: print "Unable to extract metadata on file: " + self.filename else: text = metadata.exportPlaintext() charset = getTerminalCharset() for line in text: res=line.split(":") if res[0]=="- Author": self.users.append(res[1]) elif res[1]==" Author:": self.users.append(res[2]) elif res[0]=="- Producer": self.software.append(res[1]) elif res[0]=="- Creation date": self.creationDate.append(res[1]) elif res[0]=="- Last modification": self.modification.append(res[1]) elif res[1]==" Template": xres= line.replace("- Comment: Template:","") self.paths.append(xres) elif res[1]==" LastSavedBy": # print res[1] + res[2] self.users.append(res[2]) elif res[1]==" LastPrinted": self.lastPrinted.append(res[2]) elif res[0]=="- Revision history": #self.paths.append(res[2]) res2=line.split(",") self.paths.append(res2[1].split("file ")[1]) self.raw=text return "ok"
def run(self):
    """Scan self.adir: start a Walk per sub-directory, refresh songs per file.

    For each regular file accepted by is_song(), the matching Song is
    fetched (or created), skipped when has_changed() is false, otherwise
    its metadata is extracted and attributes listed in COLS are set before
    the song is saved.
    """
    # NOTE(review): the block nesting below was reconstructed from a
    # whitespace-collapsed source; confirm it against the original file.
    for filename in os.listdir(self.adir):
        filename = os.path.join(self.adir, filename)
        if os.path.isdir(filename):
            # Retries forever: any exception from starting the Walk
            # triggers another attempt.
            while 1:
                try:
                    Walk(filename).start()
                    break
                except:
                    continue
        elif os.path.isfile(filename) and is_song(filename):
            # Keep the original name for createParser(); use the unicode
            # form everywhere else.
            filename, realname = unicodeFilename(filename), filename
            try:
                song = Song.objects.get(filename = filename)
            except:
                # Any lookup failure creates a new Song named after the
                # file's base name without extension.
                song = Song(filename = filename, name = os.path.splitext(os.path.basename(filename))[0])
            if not has_changed(song):
                continue
            song.stat = stat(filename)
            try:
                parser = createParser(filename, realname)
            except:
                parser = None
            if not parser:
                print >>stderr, "Unable to parse file %s"%filename
                continue
            try:
                metadata = extractMetadata(parser)
            except HachoirError, err:
                print >>stderr, "Metadata extraction error: %s" % unicode(err)
                continue
            if not metadata:
                print >>stderr, "Unable to extract metadata"
                continue
            else:
                text = metadata.exportPlaintext()
                charset = getTerminalCharset()
                # The first line is skipped; each remaining line is split
                # into a normalised key and the text after it.
                for line in text[1:]:
                    line = makePrintable(line, charset)
                    key = line[2:].split(': ')[0].replace(' ','_').replace('/','_').lower()
                    if key in COLS:
                        # Skips the 2-character prefix, the key and the
                        # 2-character separator.
                        setattr(song,key,line[len(key)+4:])
                # Retries forever until save() stops raising.
                while 1:
                    try:
                        song.save()
                        break
                    except:
                        continue
def processFile(values, filename, display_filename=False, priority=None, human=True, display=True):
    """Create a parser for *filename*, optionally forcing a parser id.

    When values.force_parser is set, createParser() receives the tags
    [("id", values.force_parser), None]; otherwise tags is None.
    Returns False, after reporting through error(), when createParser()
    raises InputStreamError.
    """
    charset = getTerminalCharset()
    real_filename = filename
    filename = unicodeFilename(real_filename, charset)

    # Create parser
    try:
        forced = values.force_parser
        tags = [ ("id", values.force_parser), None ] if forced else None
        parser = createParser(filename, real_filename=real_filename, tags=tags)
    except InputStreamError as err:
        error(unicode(err))
        return False
def __init__(self, input, size=None, **args):
    """Prepare *input* and determine its size when not given.

    Inputs without a seek attribute are wrapped in InputPipe. For seekable
    inputs with no explicit size, the size is measured by seeking to the
    end (tell() * 8). Raises InputStreamError when that fails with an
    IOError other than ESPIPE.
    """
    # NOTE(review): the snippet ends here; the code that uses "input" and
    # "size" afterwards is not visible.
    if not hasattr(input, "seek"):
        if size is None:
            # Unknown size: the pipe gets self._setSize as second argument.
            input = InputPipe(input, self._setSize)
        else:
            input = InputPipe(input)
    elif size is None:
        try:
            input.seek(0, 2)
            size = input.tell() * 8
        except IOError, err:
            if err.errno == ESPIPE:
                # seek() is not supported on this object: treat it as a pipe.
                input = InputPipe(input, self._setSize)
            else:
                charset = getTerminalCharset()
                errmsg = unicode(str(err), charset)
                source = args.get("source", "<inputio:%r>" % input)
                raise InputStreamError(_("Unable to get size of %s: %s") % (source, errmsg))
def FileInputStream(filename, real_filename=None, **args):
    """
    Create an input stream of a file. filename must be unicode.

    real_filename optionally gives the real filename ('str' or 'unicode').
    Pass it when filename could not be converted to a real unicode string
    (ie. you had to use unicode(name, 'replace') or
    unicode(name, 'ignore')).

    Raises InputStreamError when the file cannot be opened.
    """
    assert isinstance(filename, unicode)
    real_filename = real_filename or filename
    try:
        inputio = FileOpener(real_filename, 'rb')
    except IOError as err:
        errmsg = unicode(str(err), getTerminalCharset())
        message = _("Unable to open file %s: %s") % (filename, errmsg)
        raise InputStreamError(message)
def processFile(values, filename, display_filename=False, priority=None, human=True, display=True):
    """Create a parser for *filename*, optionally forcing a parser id.

    When values.force_parser is set, createParser() receives the tags
    [("id", values.force_parser), None]; otherwise tags is None.
    Returns False, after reporting through error(), when createParser()
    raises InputStreamError.
    """
    charset = getTerminalCharset()
    real_filename = filename
    filename = unicodeFilename(real_filename, charset)

    # Create parser
    try:
        forced = values.force_parser
        tags = [("id", values.force_parser), None] if forced else None
        parser = createParser(filename, real_filename=real_filename, tags=tags)
    except InputStreamError as err:
        error(unicode(err))
        return False
def main(): if len(argv) != 2: print >>stderr, "usage: %s video.flv" % argv[0] exit(1) # Open input video inputname = unicode(argv[1], getTerminalCharset()) parser = createParser(inputname) if parser["audio[0]/codec"].value != AUDIO_CODEC_MP3: print >>stderr, "Unknown audio codec: %s" % parser["audio[0]/codec"].display # Extract audio print "Extractor audio from: %s" % inputname outputname = inputname + ".mp3" output = FileOutputStream(outputname) for chunk in parser.array("audio"): data = chunk["music_data"] output.copyBitsFrom(data.parent.stream, data.absolute_address, data.size, data.parent.endian) print "Write audio into: %s" % outputname
def FileInputStream(filename, real_filename=None, **args):
    """
    Create an input stream of a file. filename must be unicode.

    real_filename optionally gives the real filename ('str' or 'unicode').
    Pass it when filename could not be converted to a real unicode string
    (ie. you had to use unicode(name, 'replace') or
    unicode(name, 'ignore')).

    Raises InputStreamError when the file cannot be opened.
    """
    assert isinstance(filename, unicode)
    real_filename = real_filename or filename
    try:
        inputio = FileOpener(real_filename, 'rb')
    except IOError as err:
        errmsg = unicode(str(err), getTerminalCharset())
        message = _("Unable to open file %s: %s") % (filename, errmsg)
        raise InputStreamError(message)
def getmeta(tempfile):
    """Return the plaintext metadata of *tempfile*, or "error".

    "error" is returned when no parser can be created or when no metadata
    can be extracted.
    """
    # NOTE(review): the handler of this outer "try" is not visible; the
    # snippet ends before it.
    try:
        filename = tempfile
        # createParser() gets both the unicode name and the original name.
        filename, realname = unicodeFilename(filename), filename
        parser = createParser(filename, realname)
        if not parser:
            print >>stderr, "Unable to parse file"
            return "error"
        try:
            metadata = extractMetadata(parser)
        except HachoirError, err:
            print "Metadata extraction error: %s" % unicode(err)
            metadata = None
        if not metadata:
            print "Unable to extract metadata"
            return "error"
        text = metadata.exportPlaintext()
        # charset is assigned but not used in the visible code.
        charset = getTerminalCharset()
        return text
def getmeta(tempfile):
    """Return the plaintext metadata of *tempfile*, or "error".

    "error" is returned when no parser can be created or when no metadata
    can be extracted.
    """
    # NOTE(review): the handler of this outer "try" is not visible; the
    # snippet ends before it.
    try:
        filename = tempfile
        # createParser() gets both the unicode name and the original name.
        filename, realname = unicodeFilename(filename), filename
        parser = createParser(filename, realname)
        if not parser:
            print >> stderr, "Unable to parse file"
            return "error"
        try:
            metadata = extractMetadata(parser)
        except HachoirError, err:
            print "Metadata extraction error: %s" % unicode(err)
            metadata = None
        if not metadata:
            print "Unable to extract metadata"
            return "error"
        text = metadata.exportPlaintext()
        # charset is assigned but not used in the visible code.
        charset = getTerminalCharset()
        return text
def main(): setlocale(LC_ALL, "C") if len(sys.argv) != 2: print >> sys.stderr, "usage: %s testcase_directory" % sys.argv[0] sys.exit(1) charset = getTerminalCharset() directory = unicode(sys.argv[1], charset) print "Test hachoir-metadata using testcase." print print "Testcase is in directory: %s" % directory ok = testFiles(directory) if ok: print print "Result: ok for the %s files" % len(testcase_files) sys.exit(0) else: print for index in xrange(3): print "!!! ERROR !!!" print sys.exit(1)
def main(): setlocale(LC_ALL, "C") if len(sys.argv) != 2: print >>sys.stderr, "usage: %s testcase_directory" % sys.argv[0] sys.exit(1) charset = getTerminalCharset() directory = unicode(sys.argv[1], charset) print "Test hachoir-metadata using testcase." print print "Testcase is in directory: %s" % directory ok = testFiles(directory) if ok: print print "Result: ok for the %s files" % len(testcase_files) sys.exit(0) else: print for index in xrange(3): print "!!! ERROR !!!" print sys.exit(1)
# (c) Raif Sarcich 2011 GPLv3

# Utility functions for paths

import os, sys, os.path, time, string, re
import urllib2, urlparse

from pieberry.pieconfig.config import PIE_CONFIG
from pieberry.pieconfig.paths import *
from pieberry.pieconfig.schemas import FEXTENSIONS
from pieberry.pieutility.decoding import *

from hachoir_core.i18n import getTerminalCharset
charset = getTerminalCharset()


def get_session(source=None):
    '''return a session code for dealing with grouped objects'''
    # "d_" marks desktop sessions, "w_" everything else; the suffix is the
    # current time in whole seconds.
    prefix = 'd' if source == 'desktop' else 'w'
    return '%s_%s' % (prefix, str(int(time.time())))


def auto_increment_fn(fn):
    '''Return fn, or the first non-existing "<name>_<n><ext>" variant.

    A counter starting at 1 is appended to the base name until the path
    does not exist. Raises RuntimeError once the counter reaches 1000.
    '''
    counter = 0
    dn = os.path.dirname(fn)
    bn, ext = os.path.splitext(os.path.basename(fn))
    while os.path.exists(fn):
        counter += 1
        fn = os.path.join(dn, '%s_%d%s' % (bn, counter, ext))
        if counter == 1000:
            # String exceptions are rejected with a TypeError by
            # Python 2.6+, so raise a real exception.
            raise RuntimeError('auto_increment_fn: Too many files - giving up')
    return fn
def exploreFieldSet(field_set, args, options={}):
    """Build the urwid curses interface used to explore *field_set*.

    Sets up the screen palette, a log handler that collects hachoir log
    messages, a tabbed body holding a TreeBox for field_set, and a footer
    made of a separator and a resizable log box. The nested run() function
    is the event loop.
    """
    # NOTE(review): "options={}" is a shared mutable default; it is only
    # passed on to TreeBox in the visible code.
    # NOTE(review): nesting was reconstructed from a whitespace-collapsed
    # source, and the snippet ends before run() is invoked.
    charset = getTerminalCharset()
    ui = urwid.curses_display.Screen()
    ui.register_palette((
        ('focus', 'white', 'dark blue'),
        ('sep', 'white', 'dark red'),
        ('input', 'black', 'light gray'),
    ))

    # msgs[0]: pending (level, prefix, text) messages; msgs[1]: Text
    # widgets shown in the log; msgs[2]: set to len(msgs[1]) after a draw.
    msgs = [[],[],0]
    hachoir_log.use_print = False

    def logger(level, prefix, text, ctxt):
        # Prefix the message with the names known for its context, then
        # queue it for display.
        if ctxt is not None:
            c = []
            if hasattr(ctxt, "_logger"):
                c[:0] = [ ctxt._logger() ]
            if issubclass(ctxt.__class__, Field):
                ctxt = ctxt["/"]
            name = logger.objects.get(ctxt)
            if name:
                c[:0] = [ name ]
            if c:
                text = "[%s] %s" % ('|'.join(c), text)
        if not isinstance(text, unicode):
            text = unicode(text, charset)
        msgs[0].append((level, prefix, text))
    # Maps field sets, streams and parsers to the tab name used in logs.
    logger.objects = WeakKeyDictionary()
    hachoir_log.on_new_message = logger

    preload_fields = 1 + max(0, args.preload)

    # One counter per log level, shown in the separator.
    log_count = [ 0, 0, 0 ]
    sep = Separator("log: %%u/%%u/%%u | %s " % _("F1: help"))
    sep.set_info(*tuple(log_count))
    body = Tabbed(sep)
    help = ('help', ListBox([ Text(getHelpMessage()) ]))
    logger.objects[field_set] = logger.objects[field_set.stream] = name = u'root'
    body.append((name, TreeBox(charset, Node(field_set, None), preload_fields, args.path, options)))

    log = BoxAdapter(ListBox(msgs[1]), 0)
    log.selectable = lambda: False
    wrapped_sep = AttrWrap(sep, 'sep')
    footer = Pile([ ('flow', wrapped_sep), log ])

    # awful way to allow the user to hide the log widget
    log.render = lambda size, focus=False: BoxAdapter.render(log, size[:1], focus)
    footer.render = lambda (maxcol,), focus=False: Pile.render(footer, (maxcol, sep.rows((maxcol,))+log.height), focus)

    top = Frame(body, None, footer)

    def input_enter(w):
        # Replace the separator by the input widget and focus the footer.
        footer.widget_list[0] = w
        footer.set_focus(0)
        top.set_focus('footer')

    def input_leave():
        # Restore the separator and give the focus back to the body.
        footer.widget_list[0] = wrapped_sep
        footer.set_focus(0)
        top.set_focus('body')
    input = Input(input_enter, input_leave)

    def run():
        # Event loop: handle events, flush queued log messages, apply
        # pending log resizes, draw, then wait for the next input.
        msg = _resize = retry = 0
        events = ( "window resize", )
        profile_display = args.profile_display
        while True:
            for e in events:
                try:
                    if e == "window resize":
                        size = ui.get_cols_rows()
                        resize = log.height
                    else:
                        e = top.keypress(size, e)
                        if e is None:
                            pass
                        elif e in ('f1', '?'):
                            # Select the help tab, adding it if missing.
                            try:
                                body.select(body.tabs.index(help))
                            except ValueError:
                                body.append(help)
                                resize = log.height
                        elif e in ('esc', 'ctrl w'):
                            # Close the current tab; stop when none is left.
                            body.close()
                            if body.box_widget is None:
                                return
                            resize = log.height
                        elif e == '+':
                            if log.height:
                                resize = log.height - 1
                        elif e == '-':
                            resize = log.height + 1
                        elif e == 'q':
                            return
                #except AssertionError:
                #    hachoir_log.error(getBacktrace())
                except NewTab_Stream, e:
                    # Open the field's sub-stream in a new tab when a
                    # parser is found for it.
                    stream = e.field.getSubIStream()
                    logger.objects[stream] = e = "%u/%s" % (body.active, e.field.absolute_address)
                    parser = guessParser(stream)
                    if not parser:
                        hachoir_log.error(_("No parser found for %s") % stream.source)
                    else:
                        logger.objects[parser] = e
                        body.append((e, TreeBox(charset, Node(parser, None), preload_fields, None, options)))
                        resize = log.height
                except NeedInput, e:
                    input.do(*e.args)
                if profile_display:
                    events = events[1:]
                    break
            while True:
                if msgs[0]:
                    # Turn queued messages into Text widgets and grow the
                    # log so that they are visible.
                    for level, prefix, text in msgs[0]:
                        log_count[level] += 1
                        txt = Text("[%u]%s %s" % (msg, prefix, text))
                        msg += 1
                        msgs[1].append(txt)
                        _resize += txt.rows(size[:1])
                    if log.height < _resize and (resize is None or resize < _resize):
                        resize = _resize
                    log.set_focus(len(msgs[1])-1)
                    sep.set_info(*tuple(log_count))
                    msgs[0] = []
                if resize is not None:
                    # The body keeps at least one row; the log gets the rest.
                    body.height = size[1] - sep.rows(size[:1]) - resize
                    if body.height <= 0:
                        resize += body.height - 1
                        body.height = 1
                    log.height = resize
                    resize = None
                canvas = top.render(size, focus=True)
                # Rendering may itself log; render again once if it did.
                if not msgs[0]:
                    _resize = retry = 0
                    break
                assert not retry
                retry += 1
            ui.draw_screen(size, canvas)
            msgs[2] = len(msgs[1])
            if profile_display and events:
                continue
            # Wait until get_input() returns at least one event.
            while True:
                events = ui.get_input()
                if events:
                    break
def separate_files(self): self.form2.update_progress("Getting file types", 10) #Feature Extraction from images self.form2.update_message( "Extracting features from images. Please wait") i = 0 j = 0 k = 0 l = 0 for file in self.file_list: #file=file.replace("\ "," ") file_type = self.magic.file(file).split(";")[0] if 'image' in file_type: i = i + 1 #print file+":"+'image' self.form2.update_value(1) subprocess.call([ 'python', '/home/nsk/Project/caffe/examples/caffe_example.py', file ]) if 'audio' in file_type: j = j + 1 #update while processing self.form2.update_value(1) #self.tb_metadata.append("**** Metadata for: "+file+" ****") filename = file filename, realname = unicodeFilename(filename), filename parser = createParser(filename, realname) if not parser: print >> stderr, "Unable to parse file" continue #exit(1) try: metadata = extractMetadata(parser) except Exception as err: print "Metadata extraction error: %s" % unicode(err) metadata = None if not metadata: print "Unable to extract metadata" continue #exit(1) text = metadata.exportPlaintext() charset = getTerminalCharset() #create corresponding text file value_string = file key_string = "/home/nsk/Desktop/Start/Begin1/Files/" + value_string.split( "/")[-1].split(".")[0] + "." 
+ "txt" #write metadata to the file fp = open(key_string, "wb+") for line in text: fp.write(line) fp.close() #write the key, value to the database key_string_1 = key_string.replace(".", ";") value_string_1 = value_string.replace(".", ";") a = Database("MuDaM") a.set_collection("Files") a.add_entry(key_string_1, value_string_1) a.close() if 'text' in file_type: k = k + 1 path = "/home/nsk/Desktop/Start/Begin1/Files/" path2 = file.split("/")[-1] dest_path = path + path2 src_path = file shutil.copy2(src_path, dest_path) key_string = dest_path.replace(".", ";") value_string = src_path.replace(".", ";") a = Database("MuDaM") a.set_collection("Files") a.add_entry(key_string, value_string) a.close() if 'video' in file_type: l = l + 1 self.form2.update_message("Analysed " + str(i) + " images") self.form2.update_message("Analysed " + str(j) + " audio files") self.form2.update_message("Detected " + str(k) + " text files") self.form2.update_message("Analysed " + str(l) + " video files") self.form2.update_progress("Creating Database entries", 10) self.form2.update_progress("Running clustering alogrithm", 30) subprocess.call(['Rscript', '/home/nsk/Desktop/Start/Begin1/Final.r']) self.form2.hide() self.form3.test() self.form3.show()
from hachoir_parser import createParser def metadata_for(filename): filename, realname = unicodeFilename(filename), filename parser = createParser(filename, realname) if not parser: print "Unable to parse file" exit(1) try: metadata = extractMetadata(parser) except HachoirError, err: print "Metadata extraction error: %s" % unicode(err) metadata = None if not metadata: print "Unable to extract metadata" exit(1) text = metadata.exportPlaintext() charset = getTerminalCharset() for line in text: print makePrintable(line, charset) return metadata def extract_data(metadata): for data in sorted(metadata): if len(data.values) > 0: print data.key, data.values[0].value
def _parse(self, filename):
    """Create a parser for *filename* and pass it to self._fields_parse().

    The name is decoded with the terminal charset before it is given to
    createParser(); _fields_parse() is called with 0 as first argument.
    """
    decoded_name = unicode(filename, getTerminalCharset())
    parser = createParser(decoded_name)
    self._fields_parse(0, parser)
def _parse(self, filename):
    """Create a parser for *filename* and pass it to self._fields_parse().

    The name is decoded with the terminal charset before it is given to
    createParser(); _fields_parse() is called with 0 as first argument.
    """
    decoded_name = unicode(filename, getTerminalCharset())
    parser = createParser(decoded_name)
    self._fields_parse(0, parser)