def WordParser(url, data, headers, config, attributes): """Convert a Word document to HTML and returns a PluckerTextDocument""" # retrieve config information worddoc_converter = config.get_string('worddoc_converter') if worddoc_converter is None: message(0, "Could not find Word conversion command") return None check = os.path.basename(worddoc_converter) (check, ext) = os.path.splitext(check) check = string.lower(check) if check == 'wvware': # need to save data to a local file tempbase = tempfile.mktemp() tempdoc = os.path.join(tempfile.tempdir, tempbase + ".doc") try: file = open(tempdoc, "wb") file.write(data) file.close() except IOError, text: message(0, "Error saving temporary file %s" % tempdoc) return None # then convert it > local.html temphtml = os.path.join(tempfile.tempdir, tempbase + ".html") command = worddoc_converter command = command + " -d " + tempfile.tempdir + " -b " + os.path.join( tempfile.tempdir, tempbase) command = command + " " + tempdoc + " > " + temphtml try: if os.system(command): message(0, "Error running Word converter %s" % command) return None except: message(0, "Exception running word converter %s" % command) return None # then load the local.html file to data2 try: file = open(temphtml, "rb") data2 = file.read() file.close() except IOError, text: message(0, "Error reading temporary file %s" % temphtml) return None
def WordParser (url, data, headers, config, attributes): """Convert a Word document to HTML and returns a PluckerTextDocument""" # retrieve config information worddoc_converter = config.get_string('worddoc_converter') if worddoc_converter is None: message(0, "Could not find Word conversion command") return None check = os.path.basename (worddoc_converter) (check, ext) = os.path.splitext (check) check = string.lower (check) if check == 'wvware': # need to save data to a local file tempbase = tempfile.mktemp() tempdoc = os.path.join(tempfile.tempdir, tempbase + ".doc") try: file = open (tempdoc, "wb") file.write (data) file.close () except IOError, text: message(0, "Error saving temporary file %s" % tempdoc) return None # then convert it > local.html temphtml = os.path.join(tempfile.tempdir, tempbase + ".html") command = worddoc_converter command = command + " -d " + tempfile.tempdir + " -b " + os.path.join(tempfile.tempdir, tempbase) command = command + " " + tempdoc + " > " + temphtml try: if os.system (command): message(0, "Error running Word converter %s" % command) return None except: message(0, "Exception running word converter %s" % command) return None # then load the local.html file to data2 try: file = open (temphtml, "rb") data2 = file.read () file.close () except IOError, text: message(0, "Error reading temporary file %s" % temphtml) return None
def generic_parser(url, headers, data, config, attributes): try: url = str(url) # convert to string if this is still a Url.ULR type = headers["content-type"] verbosity = config.get_int("verbosity", 1) if type == "unknown/unknown" and attributes.has_key("type"): # note that this type is not an HTTP header, and may not contain parameters type = attributes["type"] if type == "text/html": parser = TextParser.StructuredHTMLParser(url, data, headers, config, attributes) for item in parser.get_unknown(): if unknown_things.has_key(item): unknown_things[item].append(url) else: unknown_things[item] = [url] return parser.get_plucker_doc() # DRS 2004-12-29 # pretend message/rfc822 is really text elif type == "text/plain" or type == "message/rfc822": parser = TextParser.PlainTextParser(url, data, headers, config, attributes) return parser.get_plucker_doc() elif type == "mailto/text": # These are easy to handle, the document does it itself, so no # parsing needed as we generate the document directly return PluckerDocs.PluckerMailtoDocument(url) elif type[:6] == "image/": # this can fail, as some parsers do not recognize all image types... parser = ImageParser.get_default_parser(config) parsed = parser(url, type, data, config, attributes) return parsed.get_plucker_doc() elif type[:18] == "application/msword": return WordParser(url, data, headers, config, attributes) else: message(0, "%s type not yet handled (%s)" % (type, url)) return None except RuntimeError, text: error("Runtime error parsing document %s: %s" % (url, text)) return None
def generic_parser(url, headers, data, config, attributes): try: url = str(url) # convert to string if this is still a Url.ULR type = headers['content-type'] verbosity = config.get_int('verbosity', 1) if type == 'unknown/unknown' and attributes.has_key('type'): # note that this type is not an HTTP header, and may not contain parameters type = attributes['type'] if type == "text/html": parser = TextParser.StructuredHTMLParser(url, data, headers, config, attributes) for item in parser.get_unknown(): if unknown_things.has_key(item): unknown_things[item].append(url) else: unknown_things[item] = [url] return parser.get_plucker_doc() elif type == "text/plain": parser = TextParser.PlainTextParser(url, data, headers, config, attributes) return parser.get_plucker_doc() elif type == "mailto/text": # These are easy to handle, the document does it itself, so no # parsing needed as we generate the document directly return PluckerDocs.PluckerMailtoDocument(url) elif type[:6] == "image/": # this can fail, as some parsers do not recognize all image types... parser = ImageParser.get_default_parser(config) parsed = parser(url, type, data, config, attributes) return parsed.get_plucker_doc() elif type[:18] == "application/msword": return WordParser(url, data, headers, config, attributes) else: message(0, "%s type not yet handled" % type) return None except RuntimeError, text: error("Runtime error parsing document %s: %s" % (url, text)) return None
def _retrieve(self, url, alias_list, post_data):
    """Really retrieve the url.

    Returns a (headers_dict, contents) tuple; on failure contents is None
    and headers_dict carries 'error code' / 'error text' entries.
    """
    if url.get_protocol() == "plucker":
        # Local plucker: documents are handled by a dedicated helper
        return self._retrieve_plucker(url, alias_list)
    elif url.get_protocol() == "mailto":
        # Nothing to fetch really...
        return (
            {"URL": url, "error code": 0, "error text": "OK", "content-type": "mailto/text", "content-length": 0},
            "",
        )
    else:
        # not a plucker:... URL
        try:
            real_url = str(url)
            # self._urlopener behaves like a urllib opener: open() returns a
            # file-like object exposing info()/read() (and possibly
            # retcode/retmessage/url) — presumably a project subclass; verify.
            webdoc = self._urlopener.open(real_url, post_data)
            if hasattr(webdoc, "retcode"):
                # The opener recorded an HTTP-level error; report it as-is
                headers_dict = {"URL": real_url, "error code": webdoc.retcode, "error text": webdoc.retmessage}
                doc_info = webdoc.info()
                if doc_info is not None:
                    # This should always be a dict, but some people found None... :-(
                    headers_dict.update(doc_info.dict)
                return (headers_dict, None)
            if hasattr(webdoc, "url"):
                #######################################################################
                # Redhat 7.x default Python installation will return                  #
                # webdoc.url without a protocol at the beginning                      #
                # (e.g. ://www.xyz.com instead of http://www.xyz.com).                #
                # This is due to a bug in RH's /usr/lib/python1.5/urllib.py.          #
                # [email protected]                                                   #
                #######################################################################
                ################################################
                # On Windows we wan't use                      #
                # URL(url).get_protocol to get the protokoll   #
                # urllib.splittype(url) and all other url      #
                # manipuling funktions are too buggy           #
                ################################################
                if sys.platform == "win32":
                    from PyPlucker.Url import URL
                    # NOTE(review): get_protocol is referenced without "()",
                    # so webdoc_protocol is a bound method and always truthy
                    # — looks like missing call parentheses; confirm intent.
                    webdoc_protocol = URL(webdoc.url).get_protocol
                else:
                    (webdoc_protocol, webdoc_rest_of_url) = urllib.splittype(webdoc.url)
                # check to see we have a valid URL; if not, use one we started with
                if webdoc_protocol:
                    real_url = webdoc.url
            headers_dict = {"URL": real_url}
            doc_info = webdoc.info()
            message(3, "doc_info is %s", doc_info)
            if doc_info is not None:
                # This should always be a dict, but some people found None... :-(
                headers_dict.update(doc_info.dict)
            if not headers_dict.has_key("content-type"):
                # No content type reported; guess from the path's extension
                message(1, "Guessing type for %s" % url.get_path())
                headers_dict["content-type"] = GuessType(url.get_path())
            else:
                # Split "type/subtype; key=value" into the bare type plus one
                # headers_dict entry per parameter
                ctype, parameters = parse_http_header_value(headers_dict["content-type"])
                headers_dict["content-type"] = ctype
                for parm in parameters:
                    headers_dict[parm[0]] = parm[1]
            message(3, "headers_dict is %s", headers_dict)
            # Now get the contents
            contents = webdoc.read()
            # Check if encoded contents...
            if headers_dict.has_key("content-encoding"):
                encoding = headers_dict["content-encoding"]
                if encoding == "gzip" and _have_gzip:
                    # Decompress a gzip-encoded body entirely in memory
                    s = StringIO.StringIO(contents)
                    g = gzip.GzipFile(fileobj=s)
                    c = g.read()
                    g.close()
                    contents = c
                else:
                    return (
                        {
                            "URL": real_url,
                            "error code": 404,
                            "error text": "Unhandled content-encoding '%s'" % encoding,
                        },
                        None,
                    )
            # NOTE(review): the success-path return (presumably
            # (headers_dict, contents)) is not visible in this chunk.
        except IOError, text:
            return ({"URL": real_url, "error code": 404, "error text": text}, None)
        except OSError, text:
            return ({"URL": real_url, "error code": 404, "error text": text}, None)
def _retrieve (self, url, alias_list, post_data):
    """Really retrieve the url.

    Returns a (headers_dict, contents) tuple; on failure contents is None
    and headers_dict carries 'error code' / 'error text' entries.
    """
    if url.get_protocol () == 'plucker':
        # Local plucker: documents are handled by a dedicated helper
        return self._retrieve_plucker (url, alias_list)
    elif url.get_protocol () == 'mailto':
        # Nothing to fetch really...
        return ({'URL': url, 'error code': 0, 'error text': "OK", 'content-type': "mailto/text", 'content-length': 0}, "")
    else:
        # not a plucker:... URL
        try:
            real_url = str (url)
            # self._urlopener behaves like a urllib opener: open() returns a
            # file-like object exposing info()/read() (and possibly
            # retcode/retmessage/url) — presumably a project subclass; verify.
            webdoc = self._urlopener.open (real_url, post_data)
            if hasattr (webdoc, 'retcode'):
                # The opener recorded an HTTP-level error; report it as-is
                headers_dict = {'URL': real_url, 'error code': webdoc.retcode, 'error text': webdoc.retmessage}
                doc_info = webdoc.info ()
                if doc_info is not None:
                    # This should always be a dict, but some people found None... :-(
                    headers_dict.update (doc_info.dict)
                return (headers_dict, None)
            if hasattr (webdoc, 'url'):
                #######################################################################
                # Redhat 7.x default Python installation will return                  #
                # webdoc.url without a protocol at the beginning                      #
                # (e.g. ://www.xyz.com instead of http://www.xyz.com).                #
                # This is due to a bug in RH's /usr/lib/python1.5/urllib.py.          #
                # [email protected]                                                   #
                #######################################################################
                ################################################
                # On Windows we wan't use                      #
                # URL(url).get_protocol to get the protokoll   #
                # urllib.splittype(url) and all other url      #
                # manipuling funktions are too buggy           #
                ################################################
                if sys.platform == 'win32':
                    from PyPlucker.Url import URL
                    # NOTE(review): get_protocol is referenced without "()",
                    # so webdoc_protocol is a bound method and always truthy
                    # — looks like missing call parentheses; confirm intent.
                    webdoc_protocol = URL(webdoc.url).get_protocol
                else:
                    (webdoc_protocol, webdoc_rest_of_url) = urllib.splittype(webdoc.url)
                # check to see we have a valid URL; if not, use one we started with
                if webdoc_protocol:
                    real_url = webdoc.url
            headers_dict = {'URL': real_url}
            doc_info = webdoc.info ()
            message(3, "doc_info is %s", doc_info);
            if doc_info is not None:
                # This should always be a dict, but some people found None... :-(
                headers_dict.update (doc_info.dict)
            if not headers_dict.has_key ('content-type'):
                # No content type reported; guess from the path's extension
                message (1, "Guessing type for %s" % url.get_path ())
                headers_dict['content-type'] = GuessType (url.get_path ())
            else:
                # Split "type/subtype; key=value" into the bare type plus one
                # headers_dict entry per parameter
                ctype, parameters = parse_http_header_value(headers_dict['content-type'])
                headers_dict['content-type'] = ctype
                for parm in parameters:
                    headers_dict[parm[0]] = parm[1]
            message(3, "headers_dict is %s", headers_dict);
            # Now get the contents
            contents = webdoc.read ()
            # Check if encoded contents...
            if headers_dict.has_key ('content-encoding'):
                encoding = headers_dict['content-encoding']
                if encoding == 'gzip' and _have_gzip:
                    # Decompress a gzip-encoded body entirely in memory
                    s = StringIO.StringIO (contents)
                    g = gzip.GzipFile (fileobj=s)
                    c = g.read ()
                    g.close ()
                    contents = c
                else:
                    return ({'URL': real_url, 'error code': 404, 'error text': "Unhandled content-encoding '%s'" % encoding}, None)
            # NOTE(review): the success-path return (presumably
            # (headers_dict, contents)) is not visible in this chunk.
        except IOError, text:
            return ({'URL': real_url, 'error code': 404, 'error text': text}, None)
        except OSError, text:
            return ({'URL': real_url, 'error code': 404, 'error text': text}, None)