Esempio n. 1
def get_article(doc, options):
        ruthless = True
        while True:
            for i in tags(doc, 'script', 'style'):
            for i in tags(doc, 'body'):
                i.set('id', 'readabilityBody')
            if ruthless:
            candidates = score_paragraphs(doc, options)

            best_candidate = select_best_candidate(candidates)
            if best_candidate:
                confidence = best_candidate['content_score']
                article = get_raw_article(candidates, best_candidate)
                if ruthless:
                    logging.debug("ruthless removal did not work. ")
                    ruthless = False
                        "ended up stripping too much - going for a safer parse"
                    # try again
                        "Ruthless and lenient parsing did not work. Returning raw html"
                    return Summary(0, None)

            unicode_cleaned_article = sanitize(article, candidates, options)
            cleaned_doc = fragment_fromstring(unicode_cleaned_article)
            cleaned_article = tostring(cleaned_doc)

            of_acceptable_length = len(cleaned_article
                                       or '') >= options['retry_length']
            if ruthless and not of_acceptable_length:
                ruthless = False
                continue  # try again
                return Summary(confidence, cleaned_article)
    except StandardError as e:
        #logging.exception('error getting summary: ' + str(traceback.format_exception(*sys.exc_info())))
        logging.exception('error getting summary: ')
        raise Unparseable(str(e)), None, sys.exc_info()[2]
Esempio n. 2
def transform_misused_divs_into_paragraphs(doc):
    """Retag every <div> under ``doc`` that holds no block-level child as <p>.

    A div is left alone when the serialized markup of its direct children
    matches ``REGEXES['divToPElementsRe']`` (defined elsewhere in the file).
    The tree is modified in place; nothing is returned.
    """
    for div in tags(doc, 'div'):
        children_markup = unicode(''.join(tostring(child) for child in list(div)))
        if REGEXES['divToPElementsRe'].search(children_markup):
            # Contains block elements: keep it as a <div>.
            continue
        logging.debug("Altering %s to p" % (describe(div)))
        div.tag = "p"
Esempio n. 3
def find_next_page_url(parsed_urls, url, elem):
    """Return the href of the most likely "next page" link under ``elem``.

    Every <a> element below ``elem`` is evaluated by
    ``eval_possible_next_page_link``, which records its findings in a
    per-call ``candidates`` mapping.  The candidate with the highest score
    wins, provided that score is at least 50; on a tie the first one
    encountered is kept.

    :param parsed_urls: passed through to ``eval_possible_next_page_link``.
    :param url: URL of the current page; used to derive the base URL.
    :param elem: element whose descendant links are examined.
    :returns: the winning candidate's ``href``, or ``None`` if no candidate
        reaches the threshold.
    """
    links = tags(elem, 'a')
    base_url = find_base_url(url)
    # candidates is a mapping from URLs to NextPageCandidate objects that
    # represent information used to determine if a URL points to the next page
    # in the article.
    candidates = {}
    for link in links:
        logging.debug('link: %s' % tostring(link))
        # NOTE(review): this call was missing from the excerpt, leaving
        # ``candidates`` permanently empty; restored to match the other copy
        # of this function in the file (last argument assumed to be ``link``).
        eval_possible_next_page_link(parsed_urls, url, base_url, candidates,
                                     link)

    top_candidate = None
    # Use a distinct name so the ``url`` parameter is not rebound.
    for candidate_url, candidate in candidates.items():
        score = candidate.score
        logging.debug('next page score of %s: %s' % (candidate_url, score))
        if 50 <= score and (not top_candidate or top_candidate.score < score):
            top_candidate = candidate

    if top_candidate:
        logging.debug('next page link found: %s' % top_candidate.href)
        return top_candidate.href
    return None
Esempio n. 4
def get_article(doc, options):
        ruthless = True
        while True:
            for i in tags(doc, 'script', 'style'):
            for i in tags(doc, 'body'):
                i.set('id', 'readabilityBody')
            if ruthless: 
            candidates = score_paragraphs(doc, options)
            best_candidate = select_best_candidate(candidates)
            if best_candidate:
                confidence = best_candidate['content_score']
                article = get_raw_article(candidates, best_candidate)
                if ruthless:
                    logging.debug("ruthless removal did not work. ")
                    ruthless = False
                    logging.debug("ended up stripping too much - going for a safer parse")
                    # try again
                    logging.debug("Ruthless and lenient parsing did not work. Returning raw html")
                    return Summary(0, None)

            unicode_cleaned_article = sanitize(article, candidates, options)
            cleaned_doc = fragment_fromstring(unicode_cleaned_article)
            cleaned_article = tounicode(cleaned_doc)

            of_acceptable_length = len(cleaned_article or '') >= options['retry_length']
            if ruthless and not of_acceptable_length:
                ruthless = False
                continue # try again
                return Summary(confidence, cleaned_article)
    except StandardError as e:
        #logging.exception('error getting summary: ' + str(traceback.format_exception(*sys.exc_info())))
        logging.exception('error getting summary: ' )
        raise Unparseable(str(e)), None, sys.exc_info()[2]
Esempio n. 5
def transform_double_breaks_into_paragraphs(doc):
    """
    Modifies doc so that double-breaks (<br><br>) in content delineate
    paragraphs.  Some pages use double-breaks when they really should be using
    paragraphs:

        <div>
            Lorem ipsum dolor sit amet, consectetur adipiscing elit. Praesent
            in justo sapien, a consectetur est. Aliquam iaculis, augue eu
            euismod gravida, nisl nisl posuere odio, at euismod metus enim quis
            nibh.

            <br><br>

            Praesent posuere tortor at nunc iaculis eget suscipit tellus
            tempus.  Nulla facilisi. Quisque rutrum, ante eu sollicitudin
            congue, dui sapien egestas arcu, in consequat nisl metus eu sem.

            <br><br>

            Nam mi sem, lobortis eget adipiscing vitae, ultricies sit amet
            justo.  Nullam rutrum sodales magna vel vestibulum. Curabitur sit
            amet urna purus, ac aliquet sem.
        </div>

    This routine would transform this into:

        <div>
            <p>
            Lorem ipsum dolor sit amet, consectetur adipiscing elit. Praesent
            in justo sapien, a consectetur est. Aliquam iaculis, augue eu
            euismod gravida, nisl nisl posuere odio, at euismod metus enim quis
            nibh.
            </p>

            <p>
            Praesent posuere tortor at nunc iaculis eget suscipit tellus
            tempus.  Nulla facilisi. Quisque rutrum, ante eu sollicitudin
            congue, dui sapien egestas arcu, in consequat nisl metus eu sem.
            </p>

            <p>
            Nam mi sem, lobortis eget adipiscing vitae, ultricies sit amet
            justo.  Nullam rutrum sodales magna vel vestibulum. Curabitur sit
            amet urna purus, ac aliquet sem.
            </p>
        </div>
    """
    # NOTE(review): the excerpt had lost the docstring delimiters, the HTML
    # markup inside the example, and the loop body.  The per-<div> helper
    # called below is assumed from the upstream project and is not visible in
    # this file -- confirm its name before merging.
    for div in tags(doc, 'div'):
        transform_double_breaks_into_paragraphs_elem(div)
Esempio n. 6
def score_paragraphs(doc, options):
    """Score the parents and grandparents of text-bearing elements in ``doc``.

    Each <p>, <pre> and <td> whose cleaned text is at least
    ``options['min_text_len']`` characters long contributes a content score
    to its parent (full value) and to its grandparent (half value).  Every
    scored node is finally scaled by ``1 - link_density``.

    :param doc: document tree to scan.
    :param options: mapping that must provide ``'min_text_len'``.
    :returns: dict mapping each scored node to the record produced by
        ``score_node``, whose ``'content_score'`` entry has been updated.
    """
    # NOTE(review): the excerpt had lost the two ``continue`` guards, the
    # ``ordered.append(...)`` calls (without them the link-density loop never
    # runs) and the subscript target of the grandparent update.  Restored
    # following upstream python-readability -- confirm against history.
    # NOTE(review): ``sanitize`` in this file reads ``'min_text_length'``
    # while this function reads ``'min_text_len'`` -- verify both keys exist.
    candidates = {}
    ordered = []  # scored nodes, in the order they were first seen
    for elem in tags(doc, "p", "pre", "td"):
        logging.debug('Scoring %s' % describe(elem))
        parent_node = elem.getparent()
        if parent_node is None:
            continue
        grand_parent_node = parent_node.getparent()

        inner_text = clean(elem.text_content() or "")
        inner_text_len = len(inner_text)

        # Paragraphs shorter than the configured minimum don't count at all.
        if inner_text_len < options['min_text_len']:
            continue

        if parent_node not in candidates:
            candidates[parent_node] = score_node(parent_node)
            ordered.append(parent_node)

        if grand_parent_node is not None and grand_parent_node not in candidates:
            candidates[grand_parent_node] = score_node(grand_parent_node)
            ordered.append(grand_parent_node)

        # One base point, one per comma-separated chunk, and up to three more
        # for length (integer division under Python 2).
        content_score = 1
        content_score += len(inner_text.split(','))
        content_score += min((inner_text_len / 100), 3)

        candidates[parent_node]['content_score'] += content_score
        if grand_parent_node is not None:
            candidates[grand_parent_node][
                'content_score'] += content_score / 2.0

    # Scale the final candidates score based on link density. Good content should have a
    # relatively small link density (5% or less) and be mostly unaffected by this operation.
    for elem in ordered:
        candidate = candidates[elem]
        ld = get_link_density(elem)
        score = candidate['content_score']
        logging.debug("Candid: %6.3f %s link density %.3f -> %6.3f" %
                      (score, describe(elem), ld, score * (1 - ld)))
        candidate['content_score'] *= (1 - ld)

    return candidates
Esempio n. 7
def transform_double_breaks_into_paragraphs(doc):
    """
    Modifies doc so that double-breaks (<br><br>) in content delineate
    paragraphs.  Some pages use double-breaks when they really should be using
    paragraphs:

        <div>
            Lorem ipsum dolor sit amet, consectetur adipiscing elit. Praesent
            in justo sapien, a consectetur est. Aliquam iaculis, augue eu
            euismod gravida, nisl nisl posuere odio, at euismod metus enim quis
            nibh.

            <br><br>

            Praesent posuere tortor at nunc iaculis eget suscipit tellus
            tempus.  Nulla facilisi. Quisque rutrum, ante eu sollicitudin
            congue, dui sapien egestas arcu, in consequat nisl metus eu sem.

            <br><br>

            Nam mi sem, lobortis eget adipiscing vitae, ultricies sit amet
            justo.  Nullam rutrum sodales magna vel vestibulum. Curabitur sit
            amet urna purus, ac aliquet sem.
        </div>

    This routine would transform this into:

        <div>
            <p>
            Lorem ipsum dolor sit amet, consectetur adipiscing elit. Praesent
            in justo sapien, a consectetur est. Aliquam iaculis, augue eu
            euismod gravida, nisl nisl posuere odio, at euismod metus enim quis
            nibh.
            </p>

            <p>
            Praesent posuere tortor at nunc iaculis eget suscipit tellus
            tempus.  Nulla facilisi. Quisque rutrum, ante eu sollicitudin
            congue, dui sapien egestas arcu, in consequat nisl metus eu sem.
            </p>

            <p>
            Nam mi sem, lobortis eget adipiscing vitae, ultricies sit amet
            justo.  Nullam rutrum sodales magna vel vestibulum. Curabitur sit
            amet urna purus, ac aliquet sem.
            </p>
        </div>
    """
    # NOTE(review): docstring quotes, the example's HTML tags and the loop
    # body were absent from this excerpt.  The helper invoked per <div> is
    # assumed from the upstream project and is not defined in the visible
    # file -- verify the name before relying on it.
    for div in tags(doc, 'div'):
        transform_double_breaks_into_paragraphs_elem(div)
Esempio n. 8
def score_paragraphs(doc, options):
    """Build the candidate table used to pick the main content node.

    For every <p>, <pre> and <td> in ``doc`` with at least
    ``options['min_text_len']`` characters of cleaned text, a content score
    is added to the element's parent and half of it to the grandparent.
    Afterwards each scored node's total is multiplied by
    ``1 - get_link_density(node)``.

    :param doc: document tree to scan.
    :param options: mapping that must provide ``'min_text_len'``.
    :returns: dict mapping scored nodes to their ``score_node`` record, with
        ``'content_score'`` updated in place.
    """
    # NOTE(review): both ``continue`` statements and both
    # ``ordered.append(...)`` calls were missing from this excerpt; without
    # the appends ``ordered`` stays empty and no link-density scaling happens.
    # Restored following upstream python-readability -- confirm.
    candidates = {}
    ordered = []  # scored nodes in first-seen order
    for elem in tags(doc, "p", "pre", "td"):
        logging.debug('Scoring %s' % describe(elem))
        parent_node = elem.getparent()
        if parent_node is None:
            continue
        grand_parent_node = parent_node.getparent()

        inner_text = clean(elem.text_content() or "")
        inner_text_len = len(inner_text)

        # Skip paragraphs shorter than the configured minimum.
        if inner_text_len < options['min_text_len']:
            continue

        if parent_node not in candidates:
            candidates[parent_node] = score_node(parent_node)
            ordered.append(parent_node)
        if grand_parent_node is not None and grand_parent_node not in candidates:
            candidates[grand_parent_node] = score_node(grand_parent_node)
            ordered.append(grand_parent_node)

        # Base point + one per comma-separated chunk + up to 3 for length
        # (integer division under Python 2).
        content_score = 1
        content_score += len(inner_text.split(','))
        content_score += min((inner_text_len / 100), 3)

        candidates[parent_node]['content_score'] += content_score
        if grand_parent_node is not None:
            candidates[grand_parent_node]['content_score'] += content_score / 2.0

    # Scale the final candidates score based on link density. Good content should have a
    # relatively small link density (5% or less) and be mostly unaffected by this operation.
    for elem in ordered:
        candidate = candidates[elem]
        ld = get_link_density(elem)
        score = candidate['content_score']
        logging.debug("Candid: %6.3f %s link density %.3f -> %6.3f" % (score, describe(elem), ld, score*(1-ld)))
        candidate['content_score'] *= (1 - ld)

    return candidates
Esempio n. 9
def find_next_page_url(parsed_urls, url, elem):
    """Pick the best "next page" link among the <a> elements under ``elem``.

    Each link is handed to ``eval_possible_next_page_link``, which fills the
    local ``candidates`` mapping.  The highest-scoring candidate is returned
    if its score is at least 50; the first one seen wins a tie.

    :param parsed_urls: passed through to ``eval_possible_next_page_link``.
    :param url: URL of the current page; used to derive the base URL.
    :param elem: element whose descendant links are examined.
    :returns: ``href`` of the chosen candidate, or ``None`` when no candidate
        reaches the threshold.
    """
    links = tags(elem, 'a')
    base_url = find_base_url(url)
    # candidates is a mapping from URLs to NextPageCandidate objects that
    # represent information used to determine if a URL points to the next page
    # in the article.
    candidates = {}
    for link in links:
        logging.debug('link: %s' % tostring(link))
        # NOTE(review): the call was cut off after ``candidates,`` in this
        # excerpt; the final argument is assumed to be the link itself.
        eval_possible_next_page_link(parsed_urls, url, base_url, candidates,
                                     link)

    top_candidate = None
    # Distinct loop name: the original rebound the ``url`` parameter here.
    for candidate_url, candidate in candidates.items():
        score = candidate.score
        logging.debug('next page score of %s: %s' % (candidate_url, score))
        if 50 <= score and (not top_candidate or top_candidate.score < score):
            top_candidate = candidate

    if not top_candidate:
        return None
    logging.debug('next page link found: %s' % top_candidate.href)
    return top_candidate.href
Esempio n. 10
def sanitize(node, candidates, options):
    """Remove low-value markup from ``node`` and return the cleaned HTML.

    Steps, all applied to ``node`` in place:

    1. Headers with a negative class weight or a link density above 0.33
       are removed.
    2. <form>, <iframe> and <textarea> elements are removed.
    3. <table>, <ul> and <div> elements (visited via ``reverse_tags``) are
       removed when their weight plus content score is negative, or when
       they contain fewer than 10 commas and trip one of the heuristics
       below (too many images, lists, inputs or links, too little text,
       suspicious embeds).

    :param node: root element of the extracted article.
    :param candidates: mapping from nodes to score records exposing
        ``'content_score'``.
    :param options: mapping that must provide ``'min_text_length'``.
    :returns: ``clean_attributes(tounicode(node))``.
    """
    # NOTE(review): this excerpt had lost the removal statements, several
    # ``else:``/``continue``/``break`` lines and the ``siblings.append(...)``
    # calls.  They were restored following upstream python-readability
    # (removal via ``drop_tree()``) -- confirm against project history.
    for header in tags(node, "h1", "h2", "h3", "h4", "h5", "h6"):
        if class_weight(header) < 0 or get_link_density(header) > 0.33:
            header.drop_tree()

    for elem in tags(node, "form", "iframe", "textarea"):
        elem.drop_tree()

    allowed = {}  # nodes rescued by the sibling check below
    # Conditionally clean <table>s, <ul>s, and <div>s
    for el in reverse_tags(node, "table", "ul", "div"):
        if el in allowed:
            continue
        weight = class_weight(el)
        if el in candidates:
            content_score = candidates[el]['content_score']
        else:
            content_score = 0
        tag = el.tag

        if weight + content_score < 0:
            logging.debug("Cleaned %s with score %6.3f and weight %-3s" %
                (describe(el), content_score, weight, ))
            el.drop_tree()
        elif el.text_content().count(",") < 10:
            counts = {}
            for kind in ['p', 'img', 'li', 'a', 'embed', 'input']:
                counts[kind] = len(el.findall('.//%s' % kind))
            counts["li"] -= 100

            # Count the text length excluding any surrounding whitespace
            content_length = text_length(el)
            link_density = get_link_density(el)
            parent_node = el.getparent()
            if parent_node is not None:
                # From here on content_score is the parent's score; it is
                # only used in the log message below.
                if parent_node in candidates:
                    content_score = candidates[parent_node]['content_score']
                else:
                    content_score = 0
            to_remove = False
            reason = ""

            if counts["p"] and counts["img"] > counts["p"]:
                reason = "too many images (%s)" % counts["img"]
                to_remove = True
            elif counts["li"] > counts["p"] and tag != "ul" and tag != "ol":
                reason = "more <li>s than <p>s"
                to_remove = True
            elif counts["input"] > (counts["p"] / 3):
                reason = "less than 3x <p>s than <input>s"
                to_remove = True
            elif content_length < options['min_text_length'] and (counts["img"] == 0 or counts["img"] > 2):
                reason = "too short content length %s without a single image" % content_length
                to_remove = True
            elif weight < 25 and link_density > 0.2:
                reason = "too many links %.3f for its weight %s" % (link_density, weight)
                to_remove = True
            elif weight >= 25 and link_density > 0.5:
                reason = "too many links %.3f for its weight %s" % (link_density, weight)
                to_remove = True
            elif (counts["embed"] == 1 and content_length < 75) or counts["embed"] > 1:
                reason = "<embed>s with too short content length, or too many <embed>s"
                to_remove = True

                # NOTE(review): as in the original indentation, the sibling
                # rescue below is only reached from this <embed> branch --
                # confirm whether it was meant to apply to every branch.
                # Find x non-empty following and preceding siblings.
                i, j = 0, 0
                x = 1
                siblings = []
                for sib in el.itersiblings():
                    sib_content_length = text_length(sib)
                    if sib_content_length:
                        i += 1  # was "i =+ 1" (assignment of +1)
                        siblings.append(sib_content_length)
                        if i == x:
                            break
                for sib in el.itersiblings(preceding=True):
                    sib_content_length = text_length(sib)
                    if sib_content_length:
                        j += 1  # was "j =+ 1" (assignment of +1)
                        siblings.append(sib_content_length)
                        if j == x:
                            break
                if siblings and sum(siblings) > 1000:
                    # Surrounded by substantial text: keep it and everything
                    # of the same kinds beneath it.
                    to_remove = False
                    logging.debug("Allowing %s" % describe(el))
                    for desnode in tags(el, "table", "ul", "div"):
                        allowed[desnode] = True

            if to_remove:
                logging.debug("Cleaned %6.3f %s with weight %s cause it has %s." %
                    (content_score, describe(el), weight, reason))
                el.drop_tree()

    return clean_attributes(tounicode(node))
Esempio n. 11
def transform_misused_divs_into_paragraphs(doc):
    """Turn <div>s without block-level children into <p>s, in place.

    For each <div> found by ``tags``, the direct children are serialized and
    tested against ``REGEXES['divToPElementsRe']`` (defined elsewhere in the
    file); when the pattern does not match, the element's tag becomes "p".
    """
    for candidate_div in tags(doc, 'div'):
        serialized_children = [tostring(child) for child in list(candidate_div)]
        has_block_child = REGEXES['divToPElementsRe'].search(
            unicode(''.join(serialized_children)))
        if not has_block_child:
            logging.debug("Altering %s to p" % (describe(candidate_div)))
            candidate_div.tag = "p"
Esempio n. 12
def sanitize(node, candidates, options):
    """Clean the extracted article ``node`` and return it as HTML text.

    The tree is edited in place:

    * headers with a negative class weight or link density above 0.33 go;
    * <form>, <iframe> and <textarea> elements go;
    * each <table>, <ul> and <div> (visited via ``reverse_tags``) goes when
      its weight plus content score is negative, or when it has fewer than
      10 commas and matches one of the image/list/input/length/link/embed
      heuristics below.

    :param node: root element of the extracted article.
    :param candidates: mapping from nodes to score records exposing
        ``'content_score'``.
    :param options: mapping that must provide ``'min_text_length'``.
    :returns: ``clean_attributes(tounicode(node))``.
    """
    # NOTE(review): this excerpt was missing the removal statements, several
    # ``else:``/``continue``/``break`` lines, the ``siblings.append(...)``
    # calls, the arguments of the first log call, the argument of
    # ``text_length(`` and the ``logging.debug(`` opener of the last log
    # call.  Restored following upstream python-readability and the other
    # copy of this function in the file -- confirm against project history.
    for header in tags(node, "h1", "h2", "h3", "h4", "h5", "h6"):
        if class_weight(header) < 0 or get_link_density(header) > 0.33:
            header.drop_tree()

    for elem in tags(node, "form", "iframe", "textarea"):
        elem.drop_tree()

    allowed = {}  # nodes rescued by the sibling check below
    # Conditionally clean <table>s, <ul>s, and <div>s
    for el in reverse_tags(node, "table", "ul", "div"):
        if el in allowed:
            continue
        weight = class_weight(el)
        if el in candidates:
            content_score = candidates[el]['content_score']
        else:
            content_score = 0
        tag = el.tag

        if weight + content_score < 0:
            logging.debug("Cleaned %s with score %6.3f and weight %-3s" % (
                describe(el), content_score, weight))
            el.drop_tree()
        elif el.text_content().count(",") < 10:
            counts = {}
            for kind in ['p', 'img', 'li', 'a', 'embed', 'input']:
                counts[kind] = len(el.findall('.//%s' % kind))
            counts["li"] -= 100

            # Count the text length excluding any surrounding whitespace
            content_length = text_length(el)
            link_density = get_link_density(el)
            parent_node = el.getparent()
            if parent_node is not None:
                # content_score now refers to the parent; it only feeds the
                # log message at the end of this branch.
                if parent_node in candidates:
                    content_score = candidates[parent_node]['content_score']
                else:
                    content_score = 0
            to_remove = False
            reason = ""

            if counts["p"] and counts["img"] > counts["p"]:
                reason = "too many images (%s)" % counts["img"]
                to_remove = True
            elif counts["li"] > counts["p"] and tag != "ul" and tag != "ol":
                reason = "more <li>s than <p>s"
                to_remove = True
            elif counts["input"] > (counts["p"] / 3):
                reason = "less than 3x <p>s than <input>s"
                to_remove = True
            elif content_length < options['min_text_length'] and (
                    counts["img"] == 0 or counts["img"] > 2):
                reason = "too short content length %s without a single image" % content_length
                to_remove = True
            elif weight < 25 and link_density > 0.2:
                reason = "too many links %.3f for its weight %s" % (
                    link_density, weight)
                to_remove = True
            elif weight >= 25 and link_density > 0.5:
                reason = "too many links %.3f for its weight %s" % (
                    link_density, weight)
                to_remove = True
            elif (counts["embed"] == 1
                  and content_length < 75) or counts["embed"] > 1:
                reason = "<embed>s with too short content length, or too many <embed>s"
                to_remove = True

                # NOTE(review): as indented in the original, the sibling
                # rescue below runs only for this <embed> branch -- confirm
                # whether every branch was meant to get it.
                # Find x non-empty following and preceding siblings.
                i, j = 0, 0
                x = 1
                siblings = []
                for sib in el.itersiblings():
                    sib_content_length = text_length(sib)
                    if sib_content_length:
                        i += 1  # was "i = +1" (assignment, not increment)
                        siblings.append(sib_content_length)
                        if i == x:
                            break
                for sib in el.itersiblings(preceding=True):
                    sib_content_length = text_length(sib)
                    if sib_content_length:
                        j += 1  # was "j = +1" (assignment, not increment)
                        siblings.append(sib_content_length)
                        if j == x:
                            break
                if siblings and sum(siblings) > 1000:
                    # Enough neighbouring text: keep this node and exempt
                    # its table/ul/div descendants from later cleaning.
                    to_remove = False
                    logging.debug("Allowing %s" % describe(el))
                    for desnode in tags(el, "table", "ul", "div"):
                        allowed[desnode] = True

            if to_remove:
                logging.debug(
                    "Cleaned %6.3f %s with weight %s cause it has %s." %
                    (content_score, describe(el), weight, reason))
                el.drop_tree()

    return clean_attributes(tounicode(node))