def test_english_detection(self):
    """Scrape a Vietnamese-language article and verify that the
    language detector does not classify its text as English.

    The Translator is constructed without a config because
    is_english() needs no API credentials.
    """
    from translation import Translator
    detector = Translator(None)
    page = scraper.scrape("http://news.google.com/news/url?sa=t&fd=R&usg=AFQjCNFY1KzEAhaiZchzd5ulmoY4_4P8kA&url=http://vov.vn/Van-hoa/NSND-Thanh-Hoa-xuc-dong-hat-truoc-benh-nhan/228256.vov")
    self.assertFalse(page.get('unscrapable'))
    cleaned = process_resources.extract_clean_content(page['htmlContent'])
    self.assertFalse(detector.is_english(cleaned['content']))
def test_english_translation(self):
    """Scrape a Spanish-language article, translate its cleaned text
    to English, and verify the translation reports no error.

    A real config is required here because translate_to_english()
    talks to the translation service.
    """
    import config
    from translation import Translator
    translator = Translator(config)
    page = scraper.scrape("http://peninsulardigital.com/municipios/comondu/refuerzan-acciones-contra-el-dengue/155929")
    cleaned = process_resources.extract_clean_content(page['htmlContent'])
    translated = translator.translate_to_english(cleaned['content'])
    self.assertFalse(translated.get('error'))
def test_link_5(self):
    """Regression test for an article that renders in a browser but
    whose server embeds error messages in HTML comments when scraped.

    Asserts the scrape succeeds and yields non-trivial cleaned content.
    """
    result = scraper.scrape("http://news.google.com/news/url?sa=t&fd=R&usg=AFQjCNHf5IPdc5RFjTgsO7TnHq_LW8l0-Q&url=http://www.eltribuno.info/Jujuy/218240-Suspendieron-las-clases-por-un-brote-de-influenza-B-en-pueblos-de-la-Puna-.note.aspx")
    if result.get('unscrapable'):
        # Dump the scrape result for debugging before the assertion fails.
        # Function-call form of print works under both Python 2 and 3
        # (the original `print result` statement is Python-2-only).
        print(result)
    self.assertFalse(result.get('unscrapable'))
    self.assertTrue(len(process_resources.extract_clean_content(result['htmlContent'])['content']) > 1)
def test_cutoff(self):
    """Guard against premature truncation of extracted article text.

    This article was being cut off at "Tochter in die Kita bringen";
    Goose is at fault. Using Beautiful Soup instead seems to avoid the
    cutoff, but we need a way to decide which extraction method to use.
    The assertion checks that the full text (ending with the author's
    name) survives extraction.
    """
    page = scraper.scrape("http://www.haz.de/Hannover/Aus-der-Region/Wennigsen/Nachrichten/Kita-Kind-an-Ehec-erkrankt")
    extracted = process_resources.extract_clean_content(page['htmlContent'])
    self.assertTrue(extracted['content'].strip().endswith("Carsten Fricke"))
def test_link_3(self):
    """Scrape a theargus.co.uk article and verify it is scrapable.

    The cleaned content is printed for manual inspection of the
    extraction result.
    """
    result = scraper.scrape("http://www.theargus.co.uk/news/9845086.Screening_follows_new_cases_of_TB_reported_in_Sussex/")
    # Function-call form of print works under both Python 2 and 3
    # (the original bare `print` statement is Python-2-only).
    print(process_resources.extract_clean_content(result['htmlContent']))
    self.assertFalse(result.get('unscrapable'))