def parse(self, html, cols):
    """Run the configured external HTML parser over *html*.

    The document is encoded as UTF-8 and written to a temporary file.
    The command line is built by joining ``self.commandline_list`` after
    replacing its ``['cols']`` marker with ``str(cols)`` and its
    ``['filename']`` marker with the temporary file's name.  The
    command's standard output is decoded from UTF-8 and returned.

    Parameters:
        html -- the document to parse; must be a ``unicode`` object.
        cols -- value substituted for the ``['cols']`` marker.

    Returns:
        The parser's standard output as a ``unicode`` object.

    Raises:
        UnicodeInputRequired -- if *html* is not a ``unicode`` object.
        HTMLParsingFailed -- if closing the output pipe reports a status
            other than None, or the output is not valid UTF-8.
    """
    if not isinstance(html, unicode):
        raise UnicodeInputRequired

    utf8html = html.encode('utf8')
    tf_name = _write_to_and_return_tempfile_name(utf8html)
    try:
        def fill_cols(token):
            # Replace cols marker.
            if token == ['cols']:
                return str(cols)
            return token

        def fill_filename(token):
            # Replace filename marker.
            if token == ['filename']:
                return tf_name
            return token

        # NOTE(review): _mapmany presumably applies each function in turn
        # to every element of the list -- confirm against its definition.
        commandline_list = _mapmany([fill_cols, fill_filename],
                                    self.commandline_list)
        commandline = ''.join(commandline_list)

        # Run the process using popen3; possibly dodgy on Windows!
        # We need popen3 rather than another popen function because we
        # want to grab stderr and hide it from the client's console.
        (stdin, stdout, stderr) = os.popen3(commandline, 'r')
        try:
            utf8output = stdout.read()
            # Close stdout first so exit_status is obtained exactly as
            # before; the remaining pipes are closed afterwards.
            exit_status = stdout.close()
        finally:
            # Closing an already-closed file object is a no-op, so this
            # is safe whether or not the read above succeeded.
            stdout.close()
            stdin.close()
            stderr.close()
    finally:
        # Always remove the temporary file, even if the command could not
        # be started or its output could not be read.
        _remove_tempfile(tf_name)

    # Check the return code:
    if exit_status is not None:
        raise HTMLParsingFailed

    # Convert back to a unicode object and return; guard the decode just
    # in case the parser outputs bogus utf8:
    try:
        return unicode(utf8output, 'utf8')
    except (LookupError, UnicodeError):
        raise HTMLParsingFailed
def parse(self, html, cols):
    """Run the configured external HTML parser over *html*.

    The document is encoded as UTF-8 and written to a temporary file.
    The command line is built by joining ``self.commandline_list`` after
    replacing its ``['cols']`` marker with ``str(cols)`` and its
    ``['filename']`` marker with the temporary file's name.  The
    command's standard output is decoded from UTF-8 and returned.

    Parameters:
        html -- the document to parse; must be a ``unicode`` object.
        cols -- value substituted for the ``['cols']`` marker.

    Returns:
        The parser's standard output as a ``unicode`` object.

    Raises:
        UnicodeInputRequired -- if *html* is not a ``unicode`` object.
        HTMLParsingFailed -- if closing the output pipe reports a status
            other than None, or the output is not valid UTF-8.
    """
    if not isinstance(html, unicode):
        raise UnicodeInputRequired

    utf8html = html.encode('utf8')
    tf_name = _write_to_and_return_tempfile_name(utf8html)
    try:
        def fill_cols(token):
            # Replace cols marker.
            if token == ['cols']:
                return str(cols)
            return token

        def fill_filename(token):
            # Replace filename marker.
            if token == ['filename']:
                return tf_name
            return token

        # NOTE(review): _mapmany presumably applies each function in turn
        # to every element of the list -- confirm against its definition.
        commandline_list = _mapmany([fill_cols, fill_filename],
                                    self.commandline_list)
        commandline = ''.join(commandline_list)

        # Run the process using popen3; possibly dodgy on Windows!
        # We need popen3 rather than another popen function because we
        # want to grab stderr and hide it from the client's console.
        (stdin, stdout, stderr) = os.popen3(commandline, 'r')
        try:
            utf8output = stdout.read()
            # Close stdout first so exit_status is obtained exactly as
            # before; the remaining pipes are closed afterwards.
            exit_status = stdout.close()
        finally:
            # Closing an already-closed file object is a no-op, so this
            # is safe whether or not the read above succeeded.
            stdout.close()
            stdin.close()
            stderr.close()
    finally:
        # Always remove the temporary file, even if the command could not
        # be started or its output could not be read.
        _remove_tempfile(tf_name)

    # Check the return code:
    if exit_status is not None:
        raise HTMLParsingFailed

    # Convert back to a unicode object and return; guard the decode just
    # in case the parser outputs bogus utf8:
    try:
        return unicode(utf8output, 'utf8')
    except (LookupError, UnicodeError):
        raise HTMLParsingFailed
def calculate_filename_ext_libmagic(filename=None, file=None):
    """Guess a filename extension for some content using libmagic.

    See comments in magic/magic.ext for details of the format of the
    data file.  All file extensions, if recognized by a magic test, are
    reported in the form "file_ext:{xyz}"; this lets us detect the
    "file_ext:{}" marker and know we have a successful guess at the
    correct extension.  The marker is needed because libmagic has many
    tests whose return value is not governed through the magic data file,
    so we need some way of being sure a file extension has been returned,
    e.g.:

        >>> magician.file('/etc/init.d')
        "symbolic link to `rc.d/init.d'"

    Parameters:
        filename -- path of an existing file to inspect.  When given, it
            takes precedence over *file*.
        file -- the content to inspect as a binary string.  It is only
            used when *filename* is None, in which case it is written to
            a temporary file because the magic module expects to be
            handed the name of a real file.

    Returns:
        The guessed extension without a leading dot.  For gzip and bzip2
        content that can be decompressed, the extension guessed for the
        decompressed stream is prefixed (giving e.g. "tar.gz").  'tar'
        or 'txt' is returned when libmagic's description mentions a tar
        archive or text, and 'dat' when nothing can be guessed.

    Raises:
        ValueError -- if neither *filename* nor *file* is given.
        ImportError -- if the magic module is unavailable, or if bzip2
            content is found and the bz2 module is unavailable.
        _MagicDataError -- if the magic data file fails to load.
        Whatever ``os.stat`` raises when *filename* cannot be stat'ed.
    """
    if filename is None and file is None:
        raise ValueError('at least one of filename or file must be specified')
    if not _got_magic:
        raise ImportError('magic module did not import successfully')

    magician = magic.open(magic.MAGIC_NONE)
    ret_load = magician.load()
    # Throw private error if the magic data file is corrupt, or
    # doesn't exist.
    if ret_load != 0:
        raise _MagicDataError()

    delete_file = filename is None
    if delete_file:
        # We have only been given file as a binary string, so a
        # temporary file is needed to hand a real name to libmagic.
        tf, tf_name = _open_tempfile(mode='wb')
    else:
        os.stat(filename)  # Make sure we can stat the file.
        tf_name = filename

    try:
        if delete_file:
            try:
                tf.write(file)
            finally:
                tf.close()

        ext_info = magician.file(tf_name)

        # Now process ext_info to see if we can find a file extension
        # contained in it.
        file_ext_re = re.compile(r'file_ext:{(.+?)}')
        file_ext_match = file_ext_re.search(ext_info)
        ext_info_lower = ext_info.lower()

        if file_ext_match:
            name_ext = file_ext_match.group(1)

            # See if we have a compressed file type we can deal with.
            # If so, uncompress it and call ourself to get more info.
            # Note that we could use the magic.MAGIC_COMPRESS flag to get
            # libmagic to do the decompression for us but:
            #   1. It only supports gzip
            #   2. The implementation has a nasty bug which has only
            #      been fixed in very recent releases of libmagic.
            decomp_file = None
            if name_ext == 'gz':
                try:
                    # Decompress the stream:
                    gz_stream = gzip.open(tf_name)
                    try:
                        decomp_file = gz_stream.read()
                    finally:
                        gz_stream.close()
                except (zlib.error, IOError, EOFError):
                    # Couldn't decompress successfully, so just stick
                    # with the extension we have.
                    decomp_file = None
            elif name_ext == 'bz2':
                if not _got_bz2:
                    raise ImportError('Failed to import bz2 module.')
                try:
                    # Decompress the file:
                    bz2_stream = bz2.BZ2File(tf_name)
                    try:
                        decomp_file = bz2_stream.read()
                    finally:
                        bz2_stream.close()
                except (IOError, EOFError):
                    # Couldn't decompress successfully, so just stick
                    # with the extension we have.
                    decomp_file = None

            if decomp_file is not None:
                # Guess an extension of the decompressed stream and tack
                # the current compression extension on the end.  This is
                # done outside the handlers above so that errors from the
                # recursive call still propagate.
                name_ext = calculate_filename_ext_libmagic(
                    file=decomp_file) + '.' + name_ext

        # Otherwise, look for special results from libmagic's
        # 'internal tests' that we recognize:
        elif 'tar archive' in ext_info_lower:
            name_ext = 'tar'
        elif 'text' in ext_info_lower:
            name_ext = 'txt'
        # Can't guess a filetype so use generic extension .dat
        else:
            name_ext = 'dat'
    finally:
        # Identification done (or failed), so get rid of the temp file,
        # assuming we created it.
        if delete_file:
            _remove_tempfile(tf_name)

    return name_ext
def calculate_filename_ext_libmagic(filename=None, file=None):
    """Guess a filename extension for some content using libmagic.

    See comments in magic/magic.ext for details of the format of the
    data file.  All file extensions, if recognized by a magic test, are
    reported in the form "file_ext:{xyz}"; this lets us detect the
    "file_ext:{}" marker and know we have a successful guess at the
    correct extension.  The marker is needed because libmagic has many
    tests whose return value is not governed through the magic data file,
    so we need some way of being sure a file extension has been returned,
    e.g.:

        >>> magician.file('/etc/init.d')
        "symbolic link to `rc.d/init.d'"

    Parameters:
        filename -- path of an existing file to inspect.  When given, it
            takes precedence over *file*.
        file -- the content to inspect as a binary string.  It is only
            used when *filename* is None, in which case it is written to
            a temporary file because the magic module expects to be
            handed the name of a real file.

    Returns:
        The guessed extension without a leading dot.  For gzip and bzip2
        content that can be decompressed, the extension guessed for the
        decompressed stream is prefixed (giving e.g. "tar.gz").  'tar'
        or 'txt' is returned when libmagic's description mentions a tar
        archive or text, and 'dat' when nothing can be guessed.

    Raises:
        ValueError -- if neither *filename* nor *file* is given.
        ImportError -- if the magic module is unavailable, or if bzip2
            content is found and the bz2 module is unavailable.
        _MagicDataError -- if the magic data file fails to load.
        Whatever ``os.stat`` raises when *filename* cannot be stat'ed.
    """
    if filename is None and file is None:
        raise ValueError('at least one of filename or file must be specified')
    if not _got_magic:
        raise ImportError('magic module did not import successfully')

    magician = magic.open(magic.MAGIC_NONE)
    ret_load = magician.load()
    # Throw private error if the magic data file is corrupt, or
    # doesn't exist.
    if ret_load != 0:
        raise _MagicDataError()

    delete_file = filename is None
    if delete_file:
        # We have only been given file as a binary string, so a
        # temporary file is needed to hand a real name to libmagic.
        tf, tf_name = _open_tempfile(mode='wb')
    else:
        os.stat(filename)  # Make sure we can stat the file.
        tf_name = filename

    try:
        if delete_file:
            try:
                tf.write(file)
            finally:
                tf.close()

        ext_info = magician.file(tf_name)

        # Now process ext_info to see if we can find a file extension
        # contained in it.
        file_ext_re = re.compile(r'file_ext:{(.+?)}')
        file_ext_match = file_ext_re.search(ext_info)
        ext_info_lower = ext_info.lower()

        if file_ext_match:
            name_ext = file_ext_match.group(1)

            # See if we have a compressed file type we can deal with.
            # If so, uncompress it and call ourself to get more info.
            # Note that we could use the magic.MAGIC_COMPRESS flag to get
            # libmagic to do the decompression for us but:
            #   1. It only supports gzip
            #   2. The implementation has a nasty bug which has only
            #      been fixed in very recent releases of libmagic.
            decomp_file = None
            if name_ext == 'gz':
                try:
                    # Decompress the stream:
                    gz_stream = gzip.open(tf_name)
                    try:
                        decomp_file = gz_stream.read()
                    finally:
                        gz_stream.close()
                except (zlib.error, IOError, EOFError):
                    # Couldn't decompress successfully, so just stick
                    # with the extension we have.
                    decomp_file = None
            elif name_ext == 'bz2':
                if not _got_bz2:
                    raise ImportError('Failed to import bz2 module.')
                try:
                    # Decompress the file:
                    bz2_stream = bz2.BZ2File(tf_name)
                    try:
                        decomp_file = bz2_stream.read()
                    finally:
                        bz2_stream.close()
                except (IOError, EOFError):
                    # Couldn't decompress successfully, so just stick
                    # with the extension we have.
                    decomp_file = None

            if decomp_file is not None:
                # Guess an extension of the decompressed stream and tack
                # the current compression extension on the end.  This is
                # done outside the handlers above so that errors from the
                # recursive call still propagate.
                name_ext = calculate_filename_ext_libmagic(
                    file=decomp_file) + '.' + name_ext

        # Otherwise, look for special results from libmagic's
        # 'internal tests' that we recognize:
        elif 'tar archive' in ext_info_lower:
            name_ext = 'tar'
        elif 'text' in ext_info_lower:
            name_ext = 'txt'
        # Can't guess a filetype so use generic extension .dat
        else:
            name_ext = 'dat'
    finally:
        # Identification done (or failed), so get rid of the temp file,
        # assuming we created it.
        if delete_file:
            _remove_tempfile(tf_name)

    return name_ext