def testSummaryMultipage(self):
    """Crawl hespress.com through a category -> page -> article hierarchy
    and write the crawl summary to a JSON file.

    Exercises the three-level "smart" crawl: category pages link to
    paginated listings, which in turn link to individual articles.
    (An alternate goud.ma configuration existed here as commented-out
    code; it has been removed as dead weight.)
    """
    from copy import deepcopy

    root_url = "http://www.hespress.com/"
    dom_items = {
        "name": 'category',
        "url": '/politique/index.1.html',
        "selector": 'div#mainNav > ul#menu_main > li > a',
        "nested_items": {
            "name": 'page',
            "url": '/politique/index.2.html',
            "selector": 'div#box_pagination > span.pagination > a',
            "nested_items": {
                "name": 'article',
                "url": '/politique/212121.html',
                "selector": 'h2.section_title > a',
            },
        },
    }

    try:
        # deepcopy so the crawler cannot mutate the template dict above.
        worm = Worm(root_url, deepcopy(dom_items))
        worm._launch("smart")
    except Exception as e:
        self.print_failure("Test failed with error: %s" % str(e))
        raise e
    else:
        self.print_info("----------------- Crawling finished.")
        self.write_to_jsonfile("testSummaryMultipage_jsonfile", worm._summary())
def testSummary(self):
    """Crawl a single category -> article hierarchy on hespress.com and
    write the crawl summary to a JSON file.
    """
    from copy import deepcopy

    dom_items = {
        "name": 'category',
        "url": '/politique/index.1.html',
        "selector": 'div#mainNav > ul#menu_main > li > a',
        "nested_items": {
            "name": 'article',
            "url": '/politique/212121.html',
            "selector": 'h2.section_title > a',
        },
    }

    try:
        # deepcopy so the crawler cannot mutate the template dict above.
        worm = Worm("http://www.hespress.com/", deepcopy(dom_items))
        # Fix: launch inside the try so crawl failures are reported via
        # print_failure too (the original launched in the else branch,
        # inconsistent with testSummaryMultipage).
        worm._launch()
    except Exception as e:
        self.print_failure("Test failed with error: %s" % str(e))
        raise  # bare raise keeps the original traceback intact
    else:
        self.print_info("----------------- Crawling finished.")
        self.write_to_jsonfile("testSummary_jsonfile", worm._summary())
def testIntegralCrawl(self):
    """Crawl every website described in the training-data fixture and
    record per-site crawl status back into that fixture.

    Notes:
        Only the entry named "assdae" is actually crawled; the other
        entries are skipped (debugging filter kept from the original).
    """
    from copy import deepcopy
    from requests.exceptions import RequestException
    from newsline.helpers import helpers
    from django.conf import settings

    # Single definition of the fixture path (the original repeated this
    # literal three times).
    training_path = (settings.NEWSLINE_DIR +
                     "/apps/web/newsworm/unittests/core/_files/_input/training_data.json")
    training_data = helpers.parse_json_file(training_path)

    for name, website in training_data.items():
        if name != "assdae":  # debugging filter: crawl one site only
            continue
        print("Crawling %s" % name)

        try:
            worm = Worm(website["root_url"], deepcopy(website["domitems"]))
        except Exception as e:
            self.print_failure("----------------- Crawling failed for [%s] with errors: %s" % (name, str(e)))
            raise  # bare raise keeps the original traceback intact

        try:
            worm._launch("smart", force=True)
        except RequestException as e:
            self.print_failure("----------------- Crawling halted for . [%s] with :%s" % (name, e))
            summary = worm._summary()
            # Fix: store the message, not the exception object — exception
            # instances are not JSON-serializable and broke write_json.
            summary["status"] = str(e)
            # NOTE(review): this overwrites the whole training-data fixture
            # with a single crawl summary — confirm that is intended.
            helpers.write_json(training_path, summary)
        else:
            self.print_info("----------------- Crawling finished successfully for %s " % name)
            self.write_to_jsonfile(name, worm.jsonify())
            website["status"] = "done"
            helpers.write_json(training_path, training_data)
            self.print_success("Done.")
            self.print_seperator()
def testCrawlWithAutoGen(self):
    """Crawl hespress.com with auto-generated pagination URLs.

    The "page" level carries "autogen"/"range" hints so the crawler can
    generate "/politique/index.%d.html" URLs itself instead of scraping
    every pagination link. (Alternate andaluspress.com and telexpresse.com
    configurations existed here as commented-out code; removed as dead
    weight.)
    """
    from requests.exceptions import RequestException

    rooturl = "http://www.hespress.com/"
    domitems = {
        "name": "category",
        "url": "/politique/index.1.html",
        "selector": "div#mainNav > ul#menu_main > li > a",
        "nested_items": {
            "name": "page",
            "selector": "div#box_pagination > span.pagination > a",
            "url": "/politique/index.%d.html",
            # NOTE(review): the string "True" (not the bool True) is what
            # the original passed — confirm Worm treats both the same.
            "autogen": "True",
            "range": [0, 2],
            "nested_items": {
                "name": "article",
                "selector": "h2.section_title > a",
                "url": "/politique/212121.html",
            },
        },
    }

    try:
        worm = Worm(rooturl, domitems)
    except Exception as e:
        self.print_failure("----------------- Crawling failed with errors: %s" % (str(e)))
        raise  # bare raise keeps the original traceback intact

    try:
        worm._launch("smart", force=True)
    except RequestException as e:
        # Fix: the original interpolated the undefined name `name` here,
        # which raised NameError and masked the real failure; report the
        # root URL instead.
        self.print_failure("----------------- Crawling halted for . [%s] with :%s \n %s" % (rooturl, e, worm._summary()))
    else:
        self.print_info("----------------- Crawling finished successfully for andaluspress ")
        self.write_to_jsonfile("testAutogen", worm.jsonify())