def hash_id(self):
    """Return a short hash identifying ``self.node``.

    The node is serialized with ``tounicode``, encoded as UTF-8
    (unencodable characters are replaced) and fed to MD5.

    :returns: the first 8 hexadecimal characters of the MD5 digest.  If
        serializing/encoding/hashing the content raises, the error is
        logged and the digest of whatever was hashed so far (i.e. empty
        input) is returned instead of propagating the exception.
    """
    content = tounicode(self.node)
    hashed = md5()
    try:
        hashed.update(content.encode('utf-8', "replace"))
    except Exception as e:
        # Best effort: a node that cannot be hashed must not abort the
        # caller, so only log the failure.
        LOG.error("BOOM! " + str(e))
    # Previously the digest was computed but never returned.
    return hashed.hexdigest()[0:8]
def generate_hash_id(node):
    """Generate a hash_id for the node in question.

    The node is serialized with ``tounicode``, encoded as UTF-8
    (unencodable characters are replaced) and fed to MD5.

    :param node: lxml etree node
    :returns: the first 8 hexadecimal characters of the MD5 digest.  If
        encoding or hashing raises, the error is logged and the digest of
        empty input is returned instead of propagating the exception.
    """
    content = tounicode(node)
    hashed = md5()
    try:
        hashed.update(content.encode('utf-8', "replace"))
    except Exception as e:
        # Best effort: log and fall through with the (empty) digest.
        LOG.error("BOOM! " + str(e))
    # Previously the digest was computed but never returned.
    return hashed.hexdigest()[0:8]
def generate_hash_id(node):
    """Build a short, MD5-based identifier for ``node``.

    :param node: lxml etree node
    :returns: the leading 8 hex characters of the MD5 digest of the
        node's ``tounicode`` serialization, UTF-8 encoded with
        unencodable characters replaced.  Failures while encoding or
        hashing are logged, not raised; the digest of empty input is
        returned in that case.
    """
    markup = tounicode(node)
    digest = md5()
    try:
        payload = markup.encode('utf-8', "replace")
        digest.update(payload)
    except Exception as err:
        # Deliberately swallowed after logging: hashing is best effort.
        LOG.error("BOOM! " + str(err))
    hex_digest = digest.hexdigest()
    return hex_digest[:8]
def build_doc(page):
    """Parse ``page`` with ``document_fromstring`` and return the result.

    Requires that the `page` not be None; when it is, an error is logged
    and an empty string is returned instead of a parsed document.

    :param page: page content, either ``unicode`` text or an encoded byte
        string.  Byte strings are decoded with the encoding reported by
        ``get_encoding`` (undecodable bytes are replaced).
    :returns: whatever ``document_fromstring`` produces for the UTF-8
        encoded content, parsed with ``utf8_parser``.
    """
    if page is None:
        LOG.error("Page content is None, can't build_doc")
        return ''

    if not isinstance(page, unicode):
        # Raw bytes: detect the encoding and normalise to text first.
        text = page.decode(get_encoding(page), 'replace')
    else:
        text = page

    # The parser is fed UTF-8 bytes regardless of the input encoding.
    utf8_bytes = text.encode('utf-8', 'replace')
    return document_fromstring(utf8_bytes, parser=utf8_parser)
def build_doc(page):
    """Parse ``page`` with ``document_fromstring`` and return the result.

    Requires that the `page` not be None; when it is, an error is logged
    and an empty string is returned instead of a parsed document.

    :param page: page content, either ``unicode`` text or an encoded byte
        string.  Byte strings are decoded with the encoding reported by
        ``get_encoding`` (undecodable bytes are replaced).
    :returns: whatever ``document_fromstring`` produces for the UTF-8
        encoded content, parsed with ``utf8_parser``.
    :raises ValueError: if the parser raises ``XMLSyntaxError``; the
        original error is logged first.
    """
    if page is None:
        LOG.error("Page content is None, can't build_doc")
        return ""

    # NOTE(review): `unicode` is the Python 2 text type; it must be in
    # scope (builtin or a compat alias) for this check to work.
    if isinstance(page, unicode):
        page_unicode = page
    else:
        enc = get_encoding(page)
        page_unicode = page.decode(enc, "replace")

    try:
        # The parser is fed UTF-8 bytes regardless of the input encoding.
        doc = document_fromstring(
            page_unicode.encode("utf-8", "replace"), parser=utf8_parser)
        return doc
    except XMLSyntaxError as exc:
        # `except X as e` replaces the Python-2-only `except X, e` form;
        # it is accepted by Python 2.6+ and Python 3 alike.
        LOG.error("Failed to parse: " + str(exc))
        raise ValueError("Failed to parse document contents.")
def build_doc(page):
    """Turn page content into a parsed document.

    Requires that the `page` not be None; a None page is logged as an
    error and yields an empty string rather than a document.

    :param page: page content as ``str`` or as an encoded bytes-like
        object.  Non-``str`` input is decoded using the encoding reported
        by ``get_encoding`` (undecodable bytes are replaced).
    :returns: the result of ``document_fromstring`` on the UTF-8 encoded
        content, parsed with ``utf8_parser``.
    :raises ValueError: if parsing raises ``XMLSyntaxError``; the
        underlying error is logged before re-raising.
    """
    if page is None:
        LOG.error("Page content is None, can't build_doc")
        return ''

    if not isinstance(page, str):
        # Raw bytes: detect the encoding and normalise to text first.
        text = page.decode(get_encoding(page), 'replace')
    else:
        text = page

    # The parser always receives UTF-8 bytes.
    utf8_bytes = text.encode('utf-8', 'replace')
    try:
        return document_fromstring(utf8_bytes, parser=utf8_parser)
    except XMLSyntaxError as err:
        LOG.error('Failed to parse: ' + str(err))
        raise ValueError('Failed to parse document contents.')