def WordParser(url, data, headers, config, attributes):
    """Convert a Word document to HTML by running an external converter.

    The converter command is taken from the 'worddoc_converter' config
    string.  Only a converter whose executable base name (directory and
    extension stripped, compared case-insensitively) is 'wvware' is
    handled: `data` is written to a temporary .doc file, the converter is
    run through the shell with its output redirected to a temporary .html
    file, and that file is read back.

    Arguments:
        url, headers, attributes -- not used by the code visible here
        data   -- contents of the Word document, written out verbatim
        config -- object providing get_string()

    Returns None when no converter is configured, when a temporary file
    cannot be written or read, or when the converter fails.

    NOTE(review): the summary used to promise a PluckerTextDocument, but
    the code visible here stops after reading the generated HTML, so it
    returns None on success as well -- the tail of this function looks
    like it is missing; confirm against the full source.
    """

    # retrieve config information
    worddoc_converter = config.get_string('worddoc_converter')
    if worddoc_converter is None:
        message(0, "Could not find Word conversion command")
        return None

    # Identify the converter by its executable name only.
    check = os.path.splitext(os.path.basename(worddoc_converter))[0].lower()

    if check == 'wvware':
        # need to save data to a local file
        # mktemp() also initialises tempfile.tempdir, which is read below.
        # It returns an absolute path, so the joins below effectively just
        # append the suffix to it.
        tempbase = tempfile.mktemp()
        workdir = tempfile.tempdir
        tempdoc = os.path.join(workdir, tempbase + ".doc")
        try:
            doc_file = open(tempdoc, "wb")
            try:
                doc_file.write(data)
            finally:
                # close even when the write fails so the handle is not leaked
                doc_file.close()
        except IOError:
            message(0, "Error saving temporary file %s" % tempdoc)
            return None

        # then convert it > local.html
        # NOTE: the paths are interpolated unquoted, so a temp directory or
        # converter path containing spaces will break the shell command.
        temphtml = os.path.join(workdir, tempbase + ".html")
        command = (worddoc_converter
                   + " -d " + workdir
                   + " -b " + os.path.join(workdir, tempbase)
                   + " " + tempdoc + " > " + temphtml)
        try:
            # a non-zero exit status means the conversion failed
            if os.system(command):
                message(0, "Error running Word converter %s" % command)
                return None
        except Exception:
            message(0, "Exception running word converter %s" % command)
            return None

        # then load the local.html file to data2
        try:
            html_file = open(temphtml, "rb")
            try:
                data2 = html_file.read()
            finally:
                html_file.close()
        except IOError:
            message(0, "Error reading temporary file %s" % temphtml)
            return None
def WordParser(url, data, headers, config, attributes):
    """Run the configured Word converter on `data` and read back its HTML.

    The command comes from the 'worddoc_converter' config string.  The
    conversion is attempted only when the command's executable base name
    (without directory or extension, lower-cased) equals 'wvware'.  In that
    case `data` is saved to a temporary .doc file, the converter is started
    through the shell with stdout redirected to a temporary .html file, and
    the resulting file is loaded.

    Arguments:
        url, headers, attributes -- not used by the code visible here
        data   -- contents of the Word document, written out verbatim
        config -- object providing get_string()

    Returns None if the converter is not configured, if a temporary file
    cannot be written or read, or if the converter reports failure.

    NOTE(review): the previous docstring said a PluckerTextDocument is
    returned, yet the visible code ends after loading the HTML and thus
    returns None on success too -- presumably the rest of the function is
    missing here; verify against the full source.
    """

    # retrieve config information
    worddoc_converter = config.get_string('worddoc_converter')
    if worddoc_converter is None:
        message(0, "Could not find Word conversion command")
        return None

    # Only the executable's name matters for recognising the converter.
    executable = os.path.basename(worddoc_converter)
    check = os.path.splitext(executable)[0].lower()

    if check == 'wvware':
        # need to save data to a local file
        # Calling mktemp() also sets tempfile.tempdir, which is used below;
        # its result is already absolute, so the joins only add a suffix.
        tempbase = tempfile.mktemp()
        tempdir = tempfile.tempdir
        tempdoc = os.path.join(tempdir, tempbase + ".doc")
        try:
            out = open(tempdoc, "wb")
            try:
                out.write(data)
            finally:
                # never leave the handle open, even if write() fails
                out.close()
        except IOError:
            message(0, "Error saving temporary file %s" % tempdoc)
            return None

        # then convert it > local.html
        # NOTE: nothing is quoted here; paths containing spaces will break
        # the shell command.
        temphtml = os.path.join(tempdir, tempbase + ".html")
        command = "%s -d %s -b %s %s > %s" % (
            worddoc_converter,
            tempdir,
            os.path.join(tempdir, tempbase),
            tempdoc,
            temphtml)
        try:
            # os.system() returns the exit status; non-zero means failure
            if os.system(command):
                message(0, "Error running Word converter %s" % command)
                return None
        except Exception:
            message(0, "Exception running word converter %s" % command)
            return None

        # then load the local.html file to data2
        try:
            src = open(temphtml, "rb")
            try:
                data2 = src.read()
            finally:
                src.close()
        except IOError:
            message(0, "Error reading temporary file %s" % temphtml)
            return None
Esempio n. 3
0
def generic_parser(url, headers, data, config, attributes):
    try:
        url = str(url)  # convert to string if this is still a Url.ULR
        type = headers["content-type"]
        verbosity = config.get_int("verbosity", 1)
        if type == "unknown/unknown" and attributes.has_key("type"):
            # note that this type is not an HTTP header, and may not contain parameters
            type = attributes["type"]
        if type == "text/html":
            parser = TextParser.StructuredHTMLParser(url, data, headers, config, attributes)
            for item in parser.get_unknown():
                if unknown_things.has_key(item):
                    unknown_things[item].append(url)
                else:
                    unknown_things[item] = [url]
            return parser.get_plucker_doc()
        # DRS 2004-12-29
        # pretend message/rfc822 is really text
        elif type == "text/plain" or type == "message/rfc822":
            parser = TextParser.PlainTextParser(url, data, headers, config, attributes)
            return parser.get_plucker_doc()
        elif type == "mailto/text":
            # These are easy to handle, the document does it itself, so no
            # parsing needed as we generate the document directly
            return PluckerDocs.PluckerMailtoDocument(url)
        elif type[:6] == "image/":
            # this can fail, as some parsers do not recognize all image types...
            parser = ImageParser.get_default_parser(config)
            parsed = parser(url, type, data, config, attributes)
            return parsed.get_plucker_doc()
        elif type[:18] == "application/msword":
            return WordParser(url, data, headers, config, attributes)
        else:
            message(0, "%s type not yet handled (%s)" % (type, url))
            return None
    except RuntimeError, text:
        error("Runtime error parsing document %s: %s" % (url, text))
        return None
Esempio n. 4
0
def generic_parser(url, headers, data, config, attributes):
    try:
        url = str(url)  # convert to string if this is still a Url.ULR
        type = headers['content-type']
        verbosity = config.get_int('verbosity', 1)
        if type == 'unknown/unknown' and attributes.has_key('type'):
            # note that this type is not an HTTP header, and may not contain parameters
            type = attributes['type']
        if type == "text/html":
            parser = TextParser.StructuredHTMLParser(url, data, headers,
                                                     config, attributes)
            for item in parser.get_unknown():
                if unknown_things.has_key(item):
                    unknown_things[item].append(url)
                else:
                    unknown_things[item] = [url]
            return parser.get_plucker_doc()
        elif type == "text/plain":
            parser = TextParser.PlainTextParser(url, data, headers, config,
                                                attributes)
            return parser.get_plucker_doc()
        elif type == "mailto/text":
            # These are easy to handle, the document does it itself, so no
            # parsing needed as we generate the document directly
            return PluckerDocs.PluckerMailtoDocument(url)
        elif type[:6] == "image/":
            # this can fail, as some parsers do not recognize all image types...
            parser = ImageParser.get_default_parser(config)
            parsed = parser(url, type, data, config, attributes)
            return parsed.get_plucker_doc()
        elif type[:18] == "application/msword":
            return WordParser(url, data, headers, config, attributes)
        else:
            message(0, "%s type not yet handled" % type)
            return None
    except RuntimeError, text:
        error("Runtime error parsing document %s: %s" % (url, text))
        return None
Esempio n. 5
0
    def _retrieve(self, url, alias_list, post_data):
        """Really retrieve the url."""
        if url.get_protocol() == "plucker":
            return self._retrieve_plucker(url, alias_list)

        elif url.get_protocol() == "mailto":
            # Nothing to fetch really...
            return (
                {"URL": url, "error code": 0, "error text": "OK", "content-type": "mailto/text", "content-length": 0},
                "",
            )

        else:
            # not a plucker:... URL
            try:
                real_url = str(url)
                webdoc = self._urlopener.open(real_url, post_data)
                if hasattr(webdoc, "retcode"):
                    headers_dict = {"URL": real_url, "error code": webdoc.retcode, "error text": webdoc.retmessage}
                    doc_info = webdoc.info()
                    if doc_info is not None:
                        # This should always be a dict, but some people found None... :-(
                        headers_dict.update(doc_info.dict)
                    return (headers_dict, None)
                if hasattr(webdoc, "url"):
                    #######################################################################
                    # Redhat 7.x default Python installation will return                  #
                    # webdoc.url without a protocol at the beginning                      #
                    # (e.g. ://www.xyz.com instead of http://www.xyz.com).                #
                    # This is due to a bug in RH's /usr/lib/python1.5/urllib.py.          #
                    # [email protected]                                                #
                    #######################################################################
                    ################################################
                    # On Windows we wan't use                      #
                    # URL(url).get_protocol to get the protokoll   #
                    # urllib.splittype(url) and all other url      #
                    # manipuling funktions are too buggy           #
                    ################################################

                    if sys.platform == "win32":
                        from PyPlucker.Url import URL

                        webdoc_protocol = URL(webdoc.url).get_protocol
                    else:
                        (webdoc_protocol, webdoc_rest_of_url) = urllib.splittype(webdoc.url)

                    # check to see we have a valid URL; if not, use one we started with
                    if webdoc_protocol:
                        real_url = webdoc.url

                headers_dict = {"URL": real_url}
                doc_info = webdoc.info()
                message(3, "doc_info is %s", doc_info)
                if doc_info is not None:
                    # This should always be a dict, but some people found None... :-(
                    headers_dict.update(doc_info.dict)
                if not headers_dict.has_key("content-type"):
                    message(1, "Guessing type for %s" % url.get_path())
                    headers_dict["content-type"] = GuessType(url.get_path())
                else:
                    ctype, parameters = parse_http_header_value(headers_dict["content-type"])
                    headers_dict["content-type"] = ctype
                    for parm in parameters:
                        headers_dict[parm[0]] = parm[1]

                message(3, "headers_dict is %s", headers_dict)

                # Now get the contents
                contents = webdoc.read()

                # Check if encoded contents...
                if headers_dict.has_key("content-encoding"):
                    encoding = headers_dict["content-encoding"]
                    if encoding == "gzip" and _have_gzip:
                        s = StringIO.StringIO(contents)
                        g = gzip.GzipFile(fileobj=s)
                        c = g.read()
                        g.close()
                        contents = c
                    else:
                        return (
                            {
                                "URL": real_url,
                                "error code": 404,
                                "error text": "Unhandled content-encoding '%s'" % encoding,
                            },
                            None,
                        )

            except IOError, text:
                return ({"URL": real_url, "error code": 404, "error text": text}, None)
            except OSError, text:
                return ({"URL": real_url, "error code": 404, "error text": text}, None)
    def _retrieve (self, url, alias_list, post_data):
        """Really retrieve the url."""
        if url.get_protocol () == 'plucker':
            return self._retrieve_plucker (url, alias_list)

        elif url.get_protocol () == 'mailto':
            # Nothing to fetch really...
            return ({'URL': url,
                     'error code': 0,
                     'error text': "OK",
                     'content-type': "mailto/text",
                     'content-length': 0},
                     "")

        else:
            # not a plucker:... URL
            try:
                real_url = str (url)
                webdoc = self._urlopener.open (real_url, post_data)
                if hasattr (webdoc, 'retcode'):
                    headers_dict = {'URL': real_url,
                                    'error code': webdoc.retcode,
                                    'error text': webdoc.retmessage}
                    doc_info = webdoc.info ()
                    if doc_info is not None:
                        # This should always be a dict, but some people found None... :-(
                        headers_dict.update (doc_info.dict)
                    return (headers_dict, None)
                if hasattr (webdoc, 'url'):
                    #######################################################################
                    # Redhat 7.x default Python installation will return                  #
                    # webdoc.url without a protocol at the beginning                      #
                    # (e.g. ://www.xyz.com instead of http://www.xyz.com).                #
                    # This is due to a bug in RH's /usr/lib/python1.5/urllib.py.          #
                    # [email protected]                                                #
                    #######################################################################
                      ################################################
                      # On Windows we wan't use                      #
                      # URL(url).get_protocol to get the protokoll   #
                      # urllib.splittype(url) and all other url      #
                      # manipuling funktions are too buggy           #
                      ################################################



                    if sys.platform == 'win32':
                        from PyPlucker.Url import URL
                        webdoc_protocol = URL(webdoc.url).get_protocol
                    else:
                        (webdoc_protocol, webdoc_rest_of_url) = urllib.splittype(webdoc.url)

                    # check to see we have a valid URL; if not, use one we started with
                    if webdoc_protocol:
                        real_url = webdoc.url

                headers_dict = {'URL': real_url}
                doc_info = webdoc.info ()
                message(3, "doc_info is %s", doc_info);
                if doc_info is not None:
                    # This should always be a dict, but some people found None... :-(
                    headers_dict.update (doc_info.dict)
                if not headers_dict.has_key ('content-type'):
                    message (1, "Guessing type for %s" % url.get_path ())
                    headers_dict['content-type'] = GuessType (url.get_path ())
                else:
                    ctype, parameters = parse_http_header_value(headers_dict['content-type'])
                    headers_dict['content-type'] = ctype
                    for parm in parameters:
                        headers_dict[parm[0]] = parm[1]

                message(3, "headers_dict is %s", headers_dict);

                # Now get the contents
                contents = webdoc.read ()

                # Check if encoded contents...
                if headers_dict.has_key ('content-encoding'):
                    encoding = headers_dict['content-encoding']
                    if encoding == 'gzip' and _have_gzip:
                        s = StringIO.StringIO (contents)
                        g = gzip.GzipFile (fileobj=s)
                        c = g.read ()
                        g.close ()
                        contents = c
                    else:
                        return ({'URL': real_url,
                                 'error code': 404,
                                 'error text': "Unhandled content-encoding '%s'" % encoding},
                                None)

            except IOError, text:
                return ({'URL': real_url,
                         'error code': 404,
                         'error text': text},
                        None)
	    except OSError, text:
                return ({'URL': real_url,
                         'error code': 404,
                         'error text': text},
                        None)