Example #1
    def test_english_detection(self):
        import scraper
        import process_resources
        from translation import Translator
        my_translator = Translator(None)
        # Scrape a Vietnamese-language article (reached through a Google
        # News redirect URL) and check that its cleaned content is not
        # detected as English.
        result = scraper.scrape("http://news.google.com/news/url?sa=t&fd=R&usg=AFQjCNFY1KzEAhaiZchzd5ulmoY4_4P8kA&url=http://vov.vn/Van-hoa/NSND-Thanh-Hoa-xuc-dong-hat-truoc-benh-nhan/228256.vov")
        self.assertFalse(result.get('unscrapable'))
        text_obj = process_resources.extract_clean_content(result['htmlContent'])
        self.assertFalse(my_translator.is_english(text_obj['content']))
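The Translator.is_english implementation is not shown in these examples. As a rough illustration of the kind of check the assertion exercises, here is a minimal standalone sketch using a common-stopword heuristic; the looks_english name, word list, and threshold are illustrative assumptions, not the project's actual logic:

    def looks_english(text, threshold=0.05):
        # Hypothetical heuristic: genuine English text tends to contain a
        # noticeable proportion of very common English function words.
        stopwords = {"the", "and", "of", "to", "in", "is", "that", "for", "it", "with"}
        words = text.lower().split()
        if not words:
            return False
        hits = sum(1 for word in words if word in stopwords)
        return hits / float(len(words)) >= threshold

For the Vietnamese article above, almost no tokens land in the stopword set, so the ratio stays near zero and the check returns False.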
Example #2
    def test_english_translation(self):
        import config
        import scraper
        import process_resources
        from translation import Translator
        my_translator = Translator(config)
        # Scrape a Spanish-language article and check that translating its
        # cleaned content to English does not report an error.
        result = scraper.scrape("http://peninsulardigital.com/municipios/comondu/refuerzan-acciones-contra-el-dengue/155929")
        text_obj = process_resources.extract_clean_content(result['htmlContent'])
        translation_obj = my_translator.translate_to_english(text_obj['content'])
        self.assertFalse(translation_obj.get('error'))
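The test only depends on translate_to_english returning a dict that carries an 'error' key when something goes wrong. A minimal sketch of that error-dict contract, with a stubbed-out backend standing in for whatever translation service the real Translator wraps (both function names here are hypothetical):

    def backend_translate(text, target="en"):
        # Stand-in for a real translation service call.
        raise NotImplementedError("plug a real translation backend in here")

    def translate_to_english(content):
        # Failures are reported as a value in the returned dict rather than
        # raised, which is what lets the test call .get('error').
        try:
            return {"translation": backend_translate(content)}
        except Exception as exc:
            return {"error": str(exc)}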
Example #3
    def test_link_5(self):
        import scraper
        import process_resources
        # This article can be visited in a browser, but when scraped, the
        # server sends back error messages hidden in HTML comments (a
        # comment-stripping sketch follows this example).
        result = scraper.scrape("http://news.google.com/news/url?sa=t&fd=R&usg=AFQjCNHf5IPdc5RFjTgsO7TnHq_LW8l0-Q&url=http://www.eltribuno.info/Jujuy/218240-Suspendieron-las-clases-por-un-brote-de-influenza-B-en-pueblos-de-la-Puna-.note.aspx")
        if result.get('unscrapable'):
            print(result)
        self.assertFalse(result.get('unscrapable'))
        self.assertTrue(len(process_resources.extract_clean_content(result['htmlContent'])['content']) > 1)
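One way to deal with servers like this is to strip HTML comments before content extraction. A sketch using BeautifulSoup, offered as an illustration of the technique rather than as what scraper.scrape actually does:

    from bs4 import BeautifulSoup, Comment

    def strip_html_comments(html):
        # Remove comment nodes, e.g. server error messages that are
        # invisible in a browser but pollute the scraped markup.
        soup = BeautifulSoup(html, "html.parser")
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()
        return str(soup)

For example, strip_html_comments('<p>ok</p><!-- error: bot detected -->') returns '<p>ok</p>'.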
Example #4
    def test_cutoff(self):
        import scraper
        import process_resources
        # This article is being cut off at "Tochter in die Kita bringen".
        # Goose is at fault: using BeautifulSoup instead seems to avoid the
        # cutoff, but we need a way to decide which extractor to use (one
        # possible rule is sketched after this example).
        result = scraper.scrape("http://www.haz.de/Hannover/Aus-der-Region/Wennigsen/Nachrichten/Kita-Kind-an-Ehec-erkrankt")
        self.assertTrue(
            process_resources.extract_clean_content(
                result['htmlContent'])['content']
            .strip()
            .endswith("Carsten Fricke"))
Example #5
    def test_link_3(self):
        import scraper
        import process_resources
        result = scraper.scrape("http://www.theargus.co.uk/news/9845086.Screening_follows_new_cases_of_TB_reported_in_Sussex/")
        # Print the cleaned content for inspection, then check that the
        # page was scrapable at all.
        print(process_resources.extract_clean_content(result['htmlContent']))
        self.assertFalse(result.get('unscrapable'))