Exemple #1
0
    def parse(self, html, cols):
        """Run the external parser command over ``html`` and return its output.

        ``html`` is encoded as UTF-8 and written to a temporary file. The
        ``['cols']`` and ``['filename']`` marker entries of
        ``self.commandline_list`` are replaced by ``str(cols)`` and the
        temporary file name, the pieces are joined into one command line,
        and the command's stdout is decoded from UTF-8 and returned.

        Raises:
            UnicodeInputRequired: if ``html`` is not a ``unicode`` object.
            HTMLParsingFailed: if a non-None exit status is reported or the
                command's output is not valid UTF-8.
        """

        if not isinstance(html, unicode):
            raise UnicodeInputRequired

        utf8html = html.encode('utf8')
        tf_name = _write_to_and_return_tempfile_name(utf8html)

        # Everything that uses the temporary file lives inside this try so
        # the file is removed even when the command cannot be built or run.
        try:
            # Replace cols marker. A real conditional is used instead of
            # "cond and a or b", which hands back the marker itself when the
            # replacement happens to be an empty string.
            def fill_cols(x):
                if x == ['cols']:
                    return str(cols)
                return x

            # Replace filename marker:
            def fill_filename(x):
                if x == ['filename']:
                    return tf_name
                return x

            # presumably _mapmany applies each function in turn to every
            # entry of the list -- verify against its definition.
            commandline_list = _mapmany([fill_cols, fill_filename],
                                        self.commandline_list)
            commandline = ''.join(commandline_list)

            # Run the process using popen3; possibly dodgy on Windows!
            # Need popen3 rather other popen function because we want to
            # grab stderr and hide it from the clients console.
            (stdin, stdout, stderr) = os.popen3(commandline, 'r')
            try:
                utf8output = stdout.read()
                exit_status = stdout.close()
            finally:
                # Release all three pipes; the original leaked stdin and
                # stderr. Closing stdout a second time is harmless.
                stdin.close()
                stdout.close()
                stderr.close()
        finally:
            _remove_tempfile(tf_name)

        # Check the return code:
        # NOTE(review): the file objects handed back by os.popen3 are
        # documented as not exposing the child's exit status, so this check
        # may never fire -- confirm, and consider subprocess if the exit
        # status matters.
        if exit_status is not None:
            raise HTMLParsingFailed

        # Convert back to unicode object and return. The except clause is
        # there just in case the parser outputs bogus utf8:
        try:
            return unicode(utf8output, 'utf8')
        except (LookupError, UnicodeError):
            raise HTMLParsingFailed
    def parse(self, html, cols):
        """Run the external parser command over ``html`` and return its output.

        ``html`` is encoded as UTF-8 and written to a temporary file. The
        ``['cols']`` and ``['filename']`` marker entries of
        ``self.commandline_list`` are replaced by ``str(cols)`` and the
        temporary file name, the pieces are joined into one command line,
        and the command's stdout is decoded from UTF-8 and returned.

        Raises:
            UnicodeInputRequired: if ``html`` is not a ``unicode`` object.
            HTMLParsingFailed: if a non-None exit status is reported or the
                command's output is not valid UTF-8.
        """

        if not isinstance(html, unicode):
            raise UnicodeInputRequired

        utf8html = html.encode('utf8')
        tf_name = _write_to_and_return_tempfile_name(utf8html)

        # Everything that uses the temporary file lives inside this try so
        # the file is removed even when the command cannot be built or run.
        try:
            # Replace cols marker. A real conditional is used instead of
            # "cond and a or b", which hands back the marker itself when the
            # replacement happens to be an empty string.
            def fill_cols(x):
                if x == ['cols']:
                    return str(cols)
                return x

            # Replace filename marker:
            def fill_filename(x):
                if x == ['filename']:
                    return tf_name
                return x

            # presumably _mapmany applies each function in turn to every
            # entry of the list -- verify against its definition.
            commandline_list = _mapmany([fill_cols, fill_filename],
                                        self.commandline_list)
            commandline = ''.join(commandline_list)

            # Run the process using popen3; possibly dodgy on Windows!
            # Need popen3 rather other popen function because we want to
            # grab stderr and hide it from the clients console.
            (stdin, stdout, stderr) = os.popen3(commandline, 'r')
            try:
                utf8output = stdout.read()
                exit_status = stdout.close()
            finally:
                # Release all three pipes; the original leaked stdin and
                # stderr. Closing stdout a second time is harmless.
                stdin.close()
                stdout.close()
                stderr.close()
        finally:
            _remove_tempfile(tf_name)

        # Check the return code:
        # NOTE(review): the file objects handed back by os.popen3 are
        # documented as not exposing the child's exit status, so this check
        # may never fire -- confirm, and consider subprocess if the exit
        # status matters.
        if exit_status is not None:
            raise HTMLParsingFailed

        # Convert back to unicode object and return. The except clause is
        # there just in case the parser outputs bogus utf8:
        try:
            return unicode(utf8output, 'utf8')
        except (LookupError, UnicodeError):
            raise HTMLParsingFailed
def calculate_filename_ext_libmagic(filename=None, file=None):
    """Guess a file extension for some content using libmagic.

    If ``filename`` is given, that file is examined directly and ``file``
    is ignored. Otherwise ``file`` (the content as a binary string) is
    written to a temporary file, which is removed again before returning.

    Returns:
        The extension without a leading dot. This is the text captured from
        a ``file_ext:{...}`` marker in libmagic's answer, or ``'tar'`` /
        ``'txt'`` for recognized built-in results, or ``'dat'`` when nothing
        matched. For gzip and bzip2 content that decompresses successfully,
        the extension guessed for the decompressed data is prefixed, giving
        values of the form ``inner.gz`` or ``inner.bz2``.

    Raises:
        ValueError: if neither ``filename`` nor ``file`` is given.
        ImportError: if the magic module is unavailable, or if bzip2
            content is found and the bz2 module is unavailable.
        _MagicDataError: if loading the magic data file fails.
    """

    # See comments in magic/magic.ext for details of the format
    # of the data file. All file extensions if recognized by a magic
    # test will be returned in the form "file_ext:{xyz}"; this lets us
    # detect the "file_ext:{}" marker and know we have a successful
    # guess at the correct extension. The reason we need this marker
    # is that libmagic has many tests whose return value is not
    # governed through the magic data file and so we need some way of
    # being sure a file extension has been returned. eg:

    # >>> magician.file('/etc/init.d')
    # "symbolic link to `rc.d/init.d'"

    if filename is None and file is None:
        raise ValueError(
            'at least one of filename or file must be specified')
    if not _got_magic:
        raise ImportError('magic module did not import successfully')

    magician = magic.open(magic.MAGIC_NONE)

    # Throw private error if the magic data file is corrupt, or
    # doesn't exist.
    if magician.load() != 0:
        raise _MagicDataError()

    tf_name = filename
    delete_file = False

    # From here on a temporary file may exist, so all remaining work is
    # wrapped to guarantee it is removed even if identification fails.
    try:
        if filename is None:
            # then we have only been given file as binary string.

            # Get a temporary file and write file variable out to it
            # because the magic module expects to be handed the name of a
            # real file.
            tf, tf_name = _open_tempfile(mode='wb')
            delete_file = True
            try:
                tf.write(file)
            finally:
                tf.close()
        else:
            os.stat(filename)  # Make sure we can stat the file.

        ext_info = magician.file(tf_name)

        # Now process ext_info to see if we can find a file extension
        # contained in it.
        file_ext_match = re.search(r'file_ext:{(.+?)}', ext_info)

        if file_ext_match:
            name_ext = file_ext_match.group(1)

            # See if we have a compressed file type we can deal
            # with. If so, uncompress it and call ourself to get more
            # info:

            # Note that we could use the magic.MAGIC_COMPRESS flag to
            # get libmagic to do the decompression for us but:
            # 1. It only supports gzip
            # 2. The implementation has a nasty bug which has only
            #    been fixed in very recent releases of libmagic.

            if name_ext == 'gz':
                try:
                    # Decompress the stream, closing the handle afterwards:
                    gz_file = gzip.open(tf_name)
                    try:
                        decomp_file = gz_file.read()
                    finally:
                        gz_file.close()
                except (IOError, EOFError, zlib.error):
                    # Couldn't decompress successfully (the gzip module
                    # reports corrupt or truncated data as IOError/EOFError
                    # as well as zlib.error), so just stick with the
                    # extension we have.
                    pass
                else:
                    # Guess an extension of the decompressed stream and
                    # tack current '.gz' on the end:
                    name_ext = calculate_filename_ext_libmagic(
                        file=decomp_file) + '.' + name_ext

            elif name_ext == 'bz2':
                if not _got_bz2:
                    raise ImportError('Failed to import bz2 module.')
                try:
                    # Decompress the file, closing the handle afterwards:
                    bz_file = bz2.BZ2File(tf_name)
                    try:
                        decomp_file = bz_file.read()
                    finally:
                        bz_file.close()
                except (IOError, EOFError):
                    # Couldn't decompress successfully, so just stick
                    # with extension we have.
                    pass
                else:
                    # Guess an extension of the decompressed stream and
                    # tack current '.bz2' on the end:
                    name_ext = calculate_filename_ext_libmagic(
                        file=decomp_file) + '.' + name_ext

        # Otherwise, look for special results from libmagic's
        # 'internal tests' that we recognize:

        elif 'tar archive' in ext_info.lower():
            name_ext = 'tar'

        elif 'text' in ext_info.lower():
            name_ext = 'txt'

        # Can't guess a filetype so use generic extension .dat

        else:
            name_ext = 'dat'
    finally:
        # Identification done (or failed) so get rid of the temp file,
        # assuming we created the file:
        if delete_file:
            _remove_tempfile(tf_name)

    return name_ext
def calculate_filename_ext_libmagic(filename=None, file=None):
    """Guess a file extension for some content using libmagic.

    If ``filename`` is given, that file is examined directly and ``file``
    is ignored. Otherwise ``file`` (the content as a binary string) is
    written to a temporary file, which is removed again before returning.

    Returns:
        The extension without a leading dot. This is the text captured from
        a ``file_ext:{...}`` marker in libmagic's answer, or ``'tar'`` /
        ``'txt'`` for recognized built-in results, or ``'dat'`` when nothing
        matched. For gzip and bzip2 content that decompresses successfully,
        the extension guessed for the decompressed data is prefixed, giving
        values of the form ``inner.gz`` or ``inner.bz2``.

    Raises:
        ValueError: if neither ``filename`` nor ``file`` is given.
        ImportError: if the magic module is unavailable, or if bzip2
            content is found and the bz2 module is unavailable.
        _MagicDataError: if loading the magic data file fails.
    """

    # See comments in magic/magic.ext for details of the format
    # of the data file. All file extensions if recognized by a magic
    # test will be returned in the form "file_ext:{xyz}"; this lets us
    # detect the "file_ext:{}" marker and know we have a successful
    # guess at the correct extension. The reason we need this marker
    # is that libmagic has many tests whose return value is not
    # governed through the magic data file and so we need some way of
    # being sure a file extension has been returned. eg:

    # >>> magician.file('/etc/init.d')
    # "symbolic link to `rc.d/init.d'"

    if filename is None and file is None:
        raise ValueError(
            'at least one of filename or file must be specified')
    if not _got_magic:
        raise ImportError('magic module did not import successfully')

    magician = magic.open(magic.MAGIC_NONE)

    # Throw private error if the magic data file is corrupt, or
    # doesn't exist.
    if magician.load() != 0:
        raise _MagicDataError()

    tf_name = filename
    delete_file = False

    # From here on a temporary file may exist, so all remaining work is
    # wrapped to guarantee it is removed even if identification fails.
    try:
        if filename is None:
            # then we have only been given file as binary string.

            # Get a temporary file and write file variable out to it
            # because the magic module expects to be handed the name of a
            # real file.
            tf, tf_name = _open_tempfile(mode='wb')
            delete_file = True
            try:
                tf.write(file)
            finally:
                tf.close()
        else:
            os.stat(filename)  # Make sure we can stat the file.

        ext_info = magician.file(tf_name)

        # Now process ext_info to see if we can find a file extension
        # contained in it.
        file_ext_match = re.search(r'file_ext:{(.+?)}', ext_info)

        if file_ext_match:
            name_ext = file_ext_match.group(1)

            # See if we have a compressed file type we can deal
            # with. If so, uncompress it and call ourself to get more
            # info:

            # Note that we could use the magic.MAGIC_COMPRESS flag to
            # get libmagic to do the decompression for us but:
            # 1. It only supports gzip
            # 2. The implementation has a nasty bug which has only
            #    been fixed in very recent releases of libmagic.

            if name_ext == 'gz':
                try:
                    # Decompress the stream, closing the handle afterwards:
                    gz_file = gzip.open(tf_name)
                    try:
                        decomp_file = gz_file.read()
                    finally:
                        gz_file.close()
                except (IOError, EOFError, zlib.error):
                    # Couldn't decompress successfully (the gzip module
                    # reports corrupt or truncated data as IOError/EOFError
                    # as well as zlib.error), so just stick with the
                    # extension we have.
                    pass
                else:
                    # Guess an extension of the decompressed stream and
                    # tack current '.gz' on the end:
                    name_ext = calculate_filename_ext_libmagic(
                        file=decomp_file) + '.' + name_ext

            elif name_ext == 'bz2':
                if not _got_bz2:
                    raise ImportError('Failed to import bz2 module.')
                try:
                    # Decompress the file, closing the handle afterwards:
                    bz_file = bz2.BZ2File(tf_name)
                    try:
                        decomp_file = bz_file.read()
                    finally:
                        bz_file.close()
                except (IOError, EOFError):
                    # Couldn't decompress successfully, so just stick
                    # with extension we have.
                    pass
                else:
                    # Guess an extension of the decompressed stream and
                    # tack current '.bz2' on the end:
                    name_ext = calculate_filename_ext_libmagic(
                        file=decomp_file) + '.' + name_ext

        # Otherwise, look for special results from libmagic's
        # 'internal tests' that we recognize:

        elif 'tar archive' in ext_info.lower():
            name_ext = 'tar'

        elif 'text' in ext_info.lower():
            name_ext = 'txt'

        # Can't guess a filetype so use generic extension .dat

        else:
            name_ext = 'dat'
    finally:
        # Identification done (or failed) so get rid of the temp file,
        # assuming we created the file:
        if delete_file:
            _remove_tempfile(tf_name)

    return name_ext