Example #1
0
def get_section_scores_by_readability(htmlcontent):
    """Score the paragraphs of *htmlcontent* and return them ranked.

    The HTML is handed to ``Document`` and its ``score_paragraphs()`` result
    is turned into a list of ``{"score": ..., "text": ...}`` entries, ordered
    from highest to lowest ``(score, text)`` pair.

    Returns:
        A dict of the form ``{"algorithm": "readability",
        "scored paragraphs": [...]}``.

    Raises:
        SnippetGenerationError: if building/scoring the document fails or a
            scored entry cannot be read; the triggering exception is passed
            along as ``original_exception``.
    """
    try:
        candidates = Document(htmlcontent).score_paragraphs()
    except Exception as e:
        raise SnippetGenerationError(
            "failed to process document using readability",
            original_exception=e)

    module_logger.debug(
        "# of scored paragraphs: {}".format(len(candidates)))

    ranked = []
    for key in candidates:
        try:
            entry = candidates[key]
            score = entry['content_score']

            # Flatten line breaks so each paragraph becomes a single line.
            raw_text = entry['elem'].text_content()
            text = raw_text.replace('\n', ' ').replace('\r', ' ').strip()
            module_logger.debug("looking at text {}".format(text))

            ranked.append((score, text))
        except Exception as e:
            raise SnippetGenerationError(
                "failed to process document using readability",
                original_exception=e)

    # Tuples compare by score first, then by text; highest comes first.
    ranked.sort(reverse=True)

    return {
        "algorithm": "readability",
        "scored paragraphs": [
            {"score": score, "text": text} for score, text in ranked
        ],
    }
Example #2
0
def get_best_description(htmlcontent):
    """Return a plain-text description extracted from *htmlcontent*.

    Strategy, in order:

    1. Score paragraphs via ``Document(...).score_paragraphs()`` and use the
       text of the entry with the highest ``content_score``.  Only scores
       strictly greater than 0 are considered.
    2. If no entry qualifies, run ``justext`` with the English stoplist and
       join the text of all paragraphs not flagged as boilerplate.
    3. If every paragraph is flagged as boilerplate, join the text of all
       paragraphs instead.

    Returns:
        The description string, or ``""`` when no text could be extracted.

    Raises:
        SnippetGenerationError: if the readability or justext processing
            fails; the triggering exception is passed along as
            ``original_exception``.
    """
    try:
        doc = Document(htmlcontent)
        d = doc.score_paragraphs()
    except Exception as e:
        raise SnippetGenerationError(
            "failed to process document using readability",
            original_exception=e)

    maxscore = 0
    maxpara = None

    for para in d:
        try:
            score = d[para]['content_score']
            if score > maxscore:
                maxpara = d[para]['elem']
                maxscore = score
        except Exception as e:
            raise SnippetGenerationError(
                "failed to process document using readability",
                original_exception=e)

    if maxpara is not None:
        allparatext = maxpara.text_content().replace('\n', ' ').replace(
            '\r', ' ').strip()
        # `p` is a module-level pattern defined outside this function
        # (presumably whitespace-collapsing -- TODO confirm).
        return p.sub(' ', allparatext)

    # Readability produced no positively scored paragraph: fall back to
    # justext.
    try:
        paragraphs = justext(htmlcontent, get_stoplist("English"))
    except Exception as e:
        raise SnippetGenerationError(
            "failed to process document using justext",
            original_exception=e)

    try:
        texts = [
            "{}".format(paragraph.text)
            for paragraph in paragraphs
            if not paragraph.is_boilerplate
        ]
        if not texts:
            # Everything was classified as boilerplate; boilerplate text is
            # still better than no description at all.
            texts = ["{}".format(paragraph.text) for paragraph in paragraphs]
    except Exception as e:
        raise SnippetGenerationError(
            "failed to process document using justext",
            original_exception=e)

    # Paragraphs are separated by a single space in both cases so that the
    # last word of one paragraph never fuses with the first word of the next.
    # An empty result means we give up and return "".
    return " ".join(texts).strip()