def mangleData(self, data, index):
    """Mangle *data*, then rewrite the two checksums stored inside it.

    The byte array is first mangled by ``MangleFile.mangleData``.  When
    ``USE_HACHOIR`` is enabled, the mangled bytes are handed to hachoir and,
    if a parser is recognized, passed to ``self.useHachoirParser``.

    Afterwards two checksums are recomputed and written back in place:

    * a 16-byte MD5 digest of everything from ``header_offset`` to the end
      of the data, stored at ``md5_offset``;
    * a 40-character SHA hex digest of the bytes between ``header_offset``
      and ``filedata_offset``, stored at ``sha1_offset``.

    Returns the mangled array with both checksums patched.
    """
    # Default layout of the file; published as instance attributes before
    # the hachoir step runs.
    # NOTE(review): presumably useHachoirParser() may refine these offsets
    # from the parsed fields -- confirm against its implementation.
    self.sha1_offset = 208
    self.md5_offset = 256
    self.header_offset = 360
    self.filedata_offset = 3170

    mangled = MangleFile.mangleData(self, data, index)

    if USE_HACHOIR:
        # Debug aid: mangled.tofile(open('/tmp/oops', 'wb'))
        hachoir_config.quiet = True
        raw_bytes = mangled.tostring()
        parser = guessParser(StringInputStream(raw_bytes))
        if parser:
            self.useHachoirParser(parser)

    # MD5 over header + payload (header_offset up to the end of the data).
    md5_digest = md5(mangled[self.header_offset:].tostring()).digest()
    md5_end = self.md5_offset + 16
    mangled[self.md5_offset:md5_end] = array('B', md5_digest)

    # SHA over the header only, stored as 40 hexadecimal characters.  This
    # is computed after the MD5 patch, exactly as in the original order.
    header_bytes = mangled[self.header_offset:self.filedata_offset].tostring()
    sha_hexdigest = sha(header_bytes).hexdigest()
    sha1_end = self.sha1_offset + 40
    mangled[self.sha1_offset:sha1_end] = array('B', sha_hexdigest)

    return mangled
def __init__(self, project, *args, **kw):
    """Initialize the mangler and pick an XML mangler from the extension.

    Positional arguments:
        project: forwarded to ``MangleFile.__init__``.
        args[0]: forwarded to ``MangleFile.__init__`` as its second
            argument; an ``IndexError`` is raised if it is missing.

    Keyword arguments:
        nb_file: required; converted with ``int()`` and forwarded to
            ``MangleFile.__init__``.  A missing key raises ``KeyError``.
        ext: optional file extension.  ``.xml``, ``.svg`` and ``.rdf``
            select ``MangleXML`` with ``xmltype=MangleXML.SVG11``;
            ``.html`` selects ``xmltype=MangleXML.XHTML1``.
        nofile: optional flag forwarded to ``MangleXML`` (see below).

    ``self.xml`` is always defined after construction: it is ``None``
    whenever no XML mangler applies, so code that tests ``self.xml`` never
    hits an ``AttributeError``.
    """
    MangleFile.__init__(self, project, args[0], int(kw['nb_file']))
    self.hard_max_op = 10000
    self.hard_min_op = 0
    self.aggressivity = None
    self.fixed_size_factor = 1.0

    # Default first: previously the attribute was left unset on some paths
    # (e.g. no 'ext' keyword, or an extension that is not handled here).
    self.xml = None

    ext = kw.get('ext')
    if ext in ('.xml', '.svg', '.rdf'):
        # For SVG-like input, 'nofile' is only forwarded when it is truthy,
        # and then always as the literal True.
        if kw.get('nofile'):
            self.xml = MangleXML(nofile=True, xmltype=MangleXML.SVG11)
        else:
            self.xml = MangleXML(xmltype=MangleXML.SVG11)
    elif ext == ".html":
        # For HTML, 'nofile' is forwarded verbatim whenever the caller
        # supplied it, even if its value is falsy.
        if 'nofile' in kw:
            self.xml = MangleXML(nofile=kw['nofile'],
                                 xmltype=MangleXML.XHTML1)
        else:
            self.xml = MangleXML(xmltype=MangleXML.XHTML1)
def setupProject(project):
    """Wire up the fuzzing project: timing watch, mangler, process, probes.

    The function creates, in order:

    1. a ``ProcessTimeWatch`` with the slow/fast thresholds and scores
       given below;
    2. a file mangler (``AutoMangle`` or ``MangleFile``, depending on
       ``AUTO_MANGLE``) for the input obtained from
       ``getInputFilename("PDF document")``, limited to 1000 operations;
    3. a ``PopplerProcess`` running ``pdftotext`` with a 5 second timeout;
    4. a ``WatchProcess`` on that process and, while ``USE_STDOUT`` is
       true, a ``WatchStdout`` whose lines are stripped of their
       ``Error: `` / ``Error (N): `` prefix.

    Nothing is returned; the objects are only constructed with *project*
    (or the process) as their first argument.
    """
    # Local switch: set to False to send the process output to 'null'
    # and skip the stdout watcher entirely.
    USE_STDOUT = True

    time_watch = ProcessTimeWatch(
        project,
        too_slow=3.0,
        too_slow_score=0.10,
        too_fast=0.100,
        too_fast_score=-0.80,
    )

    application = project.application()
    orig_filename = application.getInputFilename("PDF document")
    if not AUTO_MANGLE:
        mangle = MangleFile(project, orig_filename)
        mangle.config.max_op = 1000
    else:
        mangle = AutoMangle(project, orig_filename)
        mangle.hard_max_op = 1000

    options = {'timeout': 5.0}
    if not USE_STDOUT:
        options['stdout'] = 'null'
    process = PopplerProcess(project, ['pdftotext'], **options)
    WatchProcess(process, exitcode_score=-0.10)

    if not USE_STDOUT:
        return

    def cleanupLine(line):
        # Keep only the message part of "Error: msg" / "Error (123): msg";
        # any other line is passed through untouched.
        match = re.match(r"Error(?: \([0-9]+\))?: (.*)", line)
        return match.group(1) if match else line

    stdout = WatchStdout(process)
    stdout.cleanup_func = cleanupLine
    del stdout.words['unknown']

    # Disabled settings, kept for reference:
    # stdout.show_not_matching = True
    # stdout.ignoreRegex(r"Unknown operator 'allocate'$")
    # stdout.ignoreRegex(r" operator is wrong type \(error\)$")
    # stdout.ignoreRegex(r'^No current point in lineto$')
    # stdout.ignoreRegex(r'^No current point in lineto')
    # stdout.ignoreRegex(r'^Unknown operator ')
    # stdout.ignoreRegex(r"^Couldn't open 'nameToUnicode' file ")
    # stdout.ignoreRegex(r"^Illegal character ")
    # stdout.ignoreRegex(r"^No font in show$")
    # stdout.ignoreRegex(r"^Element of show/space array must be number or string$")
    # stdout.ignoreRegex(r"^No current point in curveto$")
    # stdout.ignoreRegex(r"^Badly formatted number$")
    # stdout.ignoreRegex(r"^Dictionary key must be a name object$")
    # stdout.ignoreRegex(r"^End of file inside array$")
    # stdout.ignoreRegex(r"^Too few \([0-9]+\) args to .* operator$")
    # stdout.ignoreRegex(r"Too many args in content stream")

    stdout.max_nb_line = (100, 0.20)
def mangleData(self, data, file_index):
    """Mangle *data*, through the XML mangler when one is configured.

    ``self.setupConf(data)`` is always called first.  If ``self.xml`` is
    falsy the work is delegated to ``MangleFile.mangleData``; otherwise the
    bytes are passed to ``self.xml.mangleData`` and the result is wrapped
    in a new unsigned-byte array.
    """
    self.setupConf(data)

    xml_mangler = self.xml
    if not xml_mangler:
        # No XML mangler for this file: use the generic mangler.
        return MangleFile.mangleData(self, data, file_index)

    # The XML mangler receives the serialized bytes; callers get an
    # array('B') built from whatever it returns.
    mangled_xml = xml_mangler.mangleData(data.tostring())
    return array('B', mangled_xml)
def __init__(self, project, *args, **kw):
    """Initialize the base mangler, then set this class's defaults.

    All positional and keyword arguments are forwarded unchanged to
    ``MangleFile.__init__``.
    """
    MangleFile.__init__(self, project, *args, **kw)

    # Hard bounds on the number of operations.
    self.hard_min_op, self.hard_max_op = 0, 10000

    # No aggressivity value is set at construction time.
    self.aggressivity = None
    self.fixed_size_factor = 1.0
def mangleData(self, data, file_index):
    """Run ``self.setupConf(data)``, then mangle with the base class.

    Returns whatever ``MangleFile.mangleData`` returns for *data* and
    *file_index*.
    """
    # The configuration step is given the data before it is mangled.
    self.setupConf(data)
    mangled = MangleFile.mangleData(self, data, file_index)
    return mangled