Esempio n. 1
0
         # Look up the outer document-body <div> in bs (presumably a
         # BeautifulSoup tree built outside this excerpt), matching on
         # NewsBankHelper.HTML_CLASS_DOC_BODY.
         bs_div_docBody = bs.find( "div", NewsBankHelper.HTML_CLASS_DOC_BODY )
         
         # Was a matching document-body <div> found?
         # NOTE(review): the constant name suggests the match is on a CSS
         # class rather than on id="docBody" - confirm against NewsBankHelper.
         if ( bs_div_docBody != None ):
         
             # get nested <div> that contains article content.
             bs_div_mainText = bs_div_docBody.find( "div", NewsBankHelper.HTML_CLASS_MAIN_TEXT )
             
             # when debugging, print the HTML as retrieved, before cleaning
             if ( DEBUG_FLAG == True ):
                 print( "Original HTML:" )
                 print( str( bs_div_mainText ) )
             #-- END DEBUG --#
                         
             # clean it up with NewsBankHelper
             my_newsbank_helper = NewsBankHelper()
             cleaned_article_body = my_newsbank_helper.clean_article_body( bs_div_mainText )
             
             # when debugging, print the cleaned body followed by the content
             # already stored on article_text, so the two can be compared
             if ( DEBUG_FLAG == True ):
                 # output the cleaned result
                 print( "\n\n\nCleaned article body:" )
                 print( cleaned_article_body )
 
                 # retrieve and print the stored original
                 # (article_text is defined outside this excerpt)
                 original_text = article_text.get_content()
                 print( "\n\n\nOriginal content:" )
                 print( original_text )
                 
                 # same?
                 if ( cleaned_article_body == original_text ):
Esempio n. 2
0
# Parse the raw content held by test_raw into a BeautifulSoup tree.
bs = BeautifulSoup(test_raw.content)

# Drill down to the article markup of the NewsBank HTML: first the outer
# document-body <div>, then the main-text <div> nested inside it.
# NOTE(review): find() yields None when nothing matches, in which case the
# second lookup raises AttributeError - there is no guard here.
bs_div_docBody = bs.find("div", NewsBankHelper.HTML_CLASS_DOC_BODY)
bs_temp_tag = bs_div_docBody.find("div", NewsBankHelper.HTML_CLASS_MAIN_TEXT)

# Show the markup exactly as retrieved, before any cleaning.
print("Original HTML:")
print(str(bs_temp_tag))

# Run the markup through NewsBankHelper.clean_article_body().
my_newsbank_helper = NewsBankHelper()
cleaned_article_body = my_newsbank_helper.clean_article_body(bs_temp_tag)

# Show the cleaned result.
print("\n\n\nCleaned article body:")
print(cleaned_article_body)

# Fetch the Article_Text instance with id 2 and show the content it
# currently holds, for comparison with the cleaned body above.
test_article_text = Article_Text.objects.get(id=2)
original_text = test_article_text.get_content()
print("\n\n\nOriginal content:")
print(original_text)

# set text