Ejemplo n.º 1
0
 def hash_id(self):
     """Return a short hash identifying the content of ``self.node``.

     ``self.node`` is converted to text with ``tounicode``, encoded as
     UTF-8 (unencodable characters are replaced) and fed to MD5.

     :returns: the first 8 hexadecimal characters of the MD5 digest.

     """
     content = tounicode(self.node)
     hashed = md5()
     try:
         hashed.update(content.encode('utf-8', errors="replace"))
     except Exception as e:
         # Best effort: log the failure and fall through, so the digest of
         # whatever was hashed so far (possibly nothing) is still returned.
         LOG.error("BOOM! " + str(e))
     return hashed.hexdigest()[0:8]
Ejemplo n.º 2
0
def generate_hash_id(node):
    """Generate a hash_id for the node in question.

    The node is converted to text with ``tounicode``, encoded as UTF-8
    (unencodable characters are replaced) and fed to MD5.

    :param node: lxml etree node
    :returns: the first 8 hexadecimal characters of the MD5 digest.

    """
    content = tounicode(node)
    hashed = md5()
    try:
        hashed.update(content.encode('utf-8', "replace"))
    except Exception as e:
        # Best effort: log the failure and fall through, so the digest of
        # whatever was hashed so far (possibly nothing) is still returned.
        LOG.error("BOOM! " + str(e))

    return hashed.hexdigest()[0:8]
Ejemplo n.º 3
0
def generate_hash_id(node):
    """Compute a short hash identifier for a node.

    :param node: lxml etree node
    :returns: the first 8 hexadecimal characters of the MD5 digest of the
        node's text form (as produced by ``tounicode``) encoded as UTF-8.

    """
    text = tounicode(node)
    digest = md5()
    try:
        payload = text.encode('utf-8', "replace")
        digest.update(payload)
    except Exception as err:
        # Failures are only logged; the digest of whatever was hashed so
        # far is still returned below.
        LOG.error("BOOM! " + str(err))

    hex_digest = digest.hexdigest()
    return hex_digest[:8]
Ejemplo n.º 4
0
def build_doc(page):
    """Parse `page` into a document via ``document_fromstring``.

    Requires that the `page` not be None.

    :param page: page content, either already-decoded text or an encoded
        byte string; byte strings are decoded with the encoding reported
        by ``get_encoding`` (undecodable bytes are replaced).
    :returns: the result of ``document_fromstring``, or ``''`` when `page`
        is None (an error is logged in that case).

    """
    if page is None:
        LOG.error("Page content is None, can't build_doc")
        return ''
    # type(u"") is `unicode` on Python 2 and `str` on Python 3, so this
    # check works on both without referencing the Python 2-only name.
    if isinstance(page, type(u"")):
        page_unicode = page
    else:
        enc = get_encoding(page)
        page_unicode = page.decode(enc, 'replace')
    doc = document_fromstring(
        page_unicode.encode('utf-8', 'replace'),
        parser=utf8_parser)
    return doc
Ejemplo n.º 5
0
def build_doc(page):
    """Parse `page` into a document via ``document_fromstring``.

    Requires that the `page` not be None.

    :param page: page content, either already-decoded text or an encoded
        byte string; byte strings are decoded with the encoding reported
        by ``get_encoding`` (undecodable bytes are replaced).
    :returns: the result of ``document_fromstring``, or ``""`` when `page`
        is None (an error is logged in that case).
    :raises ValueError: when parsing fails with ``XMLSyntaxError``.

    """
    if page is None:
        LOG.error("Page content is None, can't build_doc")
        return ""
    # type(u"") is `unicode` on Python 2 and `str` on Python 3, so this
    # check works on both without referencing the Python 2-only name.
    if isinstance(page, type(u"")):
        page_unicode = page
    else:
        enc = get_encoding(page)
        page_unicode = page.decode(enc, "replace")
    encoded = page_unicode.encode("utf-8", "replace")
    try:
        doc = document_fromstring(encoded, parser=utf8_parser)
    except XMLSyntaxError as exc:
        # `as` works on Python 2.6+ and 3; the comma form is Python 2 only.
        LOG.error("Failed to parse: " + str(exc))
        raise ValueError("Failed to parse document contents.")
    return doc
Ejemplo n.º 6
0
def build_doc(page):
    """Build a parsed document from page content.

    Requires that the `page` not be None.

    :param page: page content as ``str``; anything else is decoded using
        the encoding reported by ``get_encoding`` (undecodable bytes are
        replaced).
    :returns: the result of ``document_fromstring``, or ``''`` when `page`
        is None (an error is logged in that case).
    :raises ValueError: when parsing fails with ``XMLSyntaxError``.

    """
    if page is None:
        LOG.error("Page content is None, can't build_doc")
        return ''

    text = (
        page
        if isinstance(page, str)
        else page.decode(get_encoding(page), 'replace')
    )
    utf8_bytes = text.encode('utf-8', 'replace')

    try:
        parsed = document_fromstring(utf8_bytes, parser=utf8_parser)
    except XMLSyntaxError as exc:
        LOG.error('Failed to parse: ' + str(exc))
        raise ValueError('Failed to parse document contents.')
    return parsed