# If a docBody <div> was found, pull out the nested main-text <div>, clean it
#     with NewsBankHelper.clean_article_body(), and - when debugging - print
#     the cleaned body next to the content stored in article_text and report
#     whether the two match.
# Relies on names defined outside this fragment: bs_div_docBody,
#     NewsBankHelper, DEBUG_FLAG and article_text.
# NOTE(review): this fragment had lost its line breaks and indentation; the
#     nesting below is reconstructed - confirm in particular that the
#     original-content retrieval and the comparison belong inside the second
#     debug block.

# Got a <div id="docBody"> tag?
if ( bs_div_docBody is not None ):

    # get nested <div> that contains article content.
    bs_div_mainText = bs_div_docBody.find( "div", NewsBankHelper.HTML_CLASS_MAIN_TEXT )

    # print the original HTML
    if ( DEBUG_FLAG ):

        print( "Original HTML:" )
        print( str( bs_div_mainText ) )

    #-- END DEBUG --#

    # clean it up with NewsBankHelper
    my_newsbank_helper = NewsBankHelper()
    cleaned_article_body = my_newsbank_helper.clean_article_body( bs_div_mainText )

    # print the cleaned article body and compare it to the stored content.
    if ( DEBUG_FLAG ):

        # output
        print( "\n\n\nCleaned article body:" )
        print( cleaned_article_body )

        # retrieve and print the original
        original_text = article_text.get_content()
        print( "\n\n\nOriginal content:" )
        print( original_text )

        # same?
        if ( cleaned_article_body == original_text ):

            print( "====> SAME!" )

        #-- END IF - cleaned body matches stored content --#

    #-- END DEBUG --#

#-- END check to see if we found a docBody <div> --#
# Test script: parse the raw NewsBank HTML held in test_raw.content, clean the
#     main-text <div> with NewsBankHelper.clean_article_body(), print the
#     result next to the content currently stored in Article_Text id 2, then
#     set the cleaned body as that Article_Text's text.
# Relies on names defined outside this fragment: BeautifulSoup, test_raw,
#     NewsBankHelper and Article_Text.
# NOTE(review): set_text() is the last call visible here; whether the change
#     also needs to be saved is not shown - confirm against Article_Text.

# load raw content into a BeautifulSoup instance
bs = BeautifulSoup( test_raw.content )

# retrieve main content <div> for a NewsBank HTML article.
bs_div_docBody = bs.find( "div", NewsBankHelper.HTML_CLASS_DOC_BODY )

# Got a docBody <div>?  find() returns None when nothing matches, and calling
#     .find() on None would raise AttributeError, so check first.
if ( bs_div_docBody is not None ):

    # get nested <div> that contains article content.
    bs_temp_tag = bs_div_docBody.find( "div", NewsBankHelper.HTML_CLASS_MAIN_TEXT )

    # print the original HTML
    print( "Original HTML:" )
    print( str( bs_temp_tag ) )

    # clean it up with NewsBankHelper
    my_newsbank_helper = NewsBankHelper()
    cleaned_article_body = my_newsbank_helper.clean_article_body( bs_temp_tag )

    # output
    print( "\n\n\nCleaned article body:" )
    print( cleaned_article_body )

    # retrieve Article_Text for this article.
    test_article_text = Article_Text.objects.get( id = 2 )

    # retrieve and print the original
    original_text = test_article_text.get_content()
    print( "\n\n\nOriginal content:" )
    print( original_text )

    # set text
    test_article_text.set_text( cleaned_article_body )

else:

    # no docBody <div> - nothing to clean, so say so rather than crash.
    print( "No docBody <div> found in raw content - nothing to clean." )

#-- END check to see if we found a docBody <div> --#