def get_section_scores_by_readability(htmlcontent):
    """Score the paragraphs of an HTML document with readability.

    Builds a ``Document`` from *htmlcontent*, asks it to score its
    paragraphs, and reports every scored entry ordered from the highest to
    the lowest score.

    Args:
        htmlcontent: the HTML to analyze; passed unchanged to ``Document``.

    Returns:
        A dict with two keys: ``"algorithm"`` (always ``"readability"``) and
        ``"scored paragraphs"``, a list of ``{"score": ..., "text": ...}``
        dicts in descending ``(score, text)`` order. The text has newlines
        and carriage returns replaced by spaces and is stripped.

    Raises:
        SnippetGenerationError: if building or scoring the document fails,
            or if any scored entry cannot be read.
    """
    failure_message = "failed to process document using readability"

    try:
        candidates = Document(htmlcontent).score_paragraphs()
    except Exception as e:
        raise SnippetGenerationError(failure_message, original_exception=e)

    module_logger.debug(
        "# of scored paragraphs: {}".format(len(candidates)))

    ranked = []
    for key in candidates:
        try:
            entry = candidates[key]
            score = entry['content_score']
            raw_text = entry['elem'].text_content()
            text = raw_text.replace('\n', ' ').replace('\r', ' ').strip()
            module_logger.debug("looking at text {}".format(text))
            ranked.append((score, text))
        except Exception as e:
            raise SnippetGenerationError(
                failure_message, original_exception=e)

    # Tuples compare by score first, then by text, so ties on score are
    # ordered by their text (also descending).
    ranked.sort(reverse=True)

    return {
        "algorithm": "readability",
        "scored paragraphs": [
            {"score": score, "text": text} for score, text in ranked
        ],
    }
def get_best_description(htmlcontent):
    """Return the best textual description that can be found in an HTML page.

    Strategy, in order of preference:

    1. Score the paragraphs with readability and use the text of the
       highest-scoring one. Only scores strictly greater than 0 qualify, so
       a document whose candidates all score 0 or less falls through.
    2. Run jusText and join the text of every non-boilerplate paragraph.
    3. If jusText classified everything as boilerplate, join the text of
       all paragraphs.
    4. Give up and return the empty string.

    Args:
        htmlcontent: the HTML to analyze; passed unchanged to ``Document``
            and, if needed, to ``justext``.

    Returns:
        The description as a string; ``""`` when nothing could be extracted.

    Raises:
        SnippetGenerationError: if readability or jusText fails while
            processing the document.
    """
    readability_failure = "failed to process document using readability"
    justext_failure = "failed to process document using justext"

    try:
        doc = Document(htmlcontent)
        d = doc.score_paragraphs()
    except Exception as e:
        raise SnippetGenerationError(
            readability_failure, original_exception=e)

    # Starting at 0 means only strictly positive scores can win.
    maxscore = 0
    maxpara = None

    for para in d:
        try:
            score = d[para]['content_score']
            if score > maxscore:
                maxpara = d[para]['elem']
                maxscore = score
        except Exception as e:
            raise SnippetGenerationError(
                readability_failure, original_exception=e)

    if maxpara is not None:
        allparatext = maxpara.text_content().replace(
            '\n', ' ').replace('\r', ' ').strip()
        # `p` is a module-level compiled pattern; every match is replaced by
        # a single space (presumably whitespace collapsing -- confirm at its
        # definition).
        return p.sub(' ', allparatext)

    # readability found nothing usable; fall back to jusText.
    try:
        paragraphs = justext(htmlcontent, get_stoplist("English"))
    except Exception as e:
        raise SnippetGenerationError(justext_failure, original_exception=e)

    try:
        texts = [
            str(paragraph.text)
            for paragraph in paragraphs
            if not paragraph.is_boilerplate
        ]
        if not texts:
            # Everything was classified as boilerplate: use all paragraphs
            # rather than returning nothing.
            texts = [str(paragraph.text) for paragraph in paragraphs]
    except Exception as e:
        raise SnippetGenerationError(justext_failure, original_exception=e)

    # Join with a space in BOTH fallbacks so that the last word of one
    # paragraph is never fused with the first word of the next. An empty
    # list yields "", which is the "give up" result.
    return " ".join(texts).strip()