def testPipeoutToCrawledItem(self):
    """Pipe out the top-level dom item, then crawl each resulting item's sub-items."""
    base_url = "http://www.hespress.com/"
    domitems = {
        "name": 'category',
        "url": '/politique/index.1.html',
        "selector": 'div#mainNav > ul#menu_main > li > a',
        "nested_items": {
            "name": 'article',
            "url": '/politique/212121.html',
            "selector": 'h2.section_title > a',
        },
    }
    try:
        from copy import deepcopy
        worm = Worm(base_url, deepcopy(domitems))
    except Exception as e:
        self.print_failure("Test failed with error: %s" % str(e))
        raise e
    else:
        # First pass: collect the top-level crawled items.
        worm.domitems.crawled_items = worm._pipeout(worm.domitems, "")
        self.print_success("----------------- Piped out : %s" % worm.domitems.crawled_items)
        self.print_info("----------------- Crawling subitems")
        # Second pass: descend into each crawled item in 'smart' mode.
        for item in worm.domitems.crawled_items:
            self.print_success("----------------- Crawling: %s" % item.url)
            self.print_success("----------------- Respective DomItem: %s" % item.dom_item.name)
            worm._pipeout_to_crawled_item(item, 'smart')
            self.print_success("Extracted data:%s\n" % item.nested_items)
def testLaunch(self):
    """Launch a full crawl, then print every crawled item via diverge()."""
    domitems = {
        "name": 'category',
        "url": '/politique/index.1.html',
        "selector": 'div#mainNav > ul#menu_main > li > a',
        "nested_items": {
            "name": 'article',
            "url": '/politique/212121.html',
            "selector": 'h2.section_title > a',
        },
    }
    try:
        from copy import deepcopy
        worm = Worm("http://www.hespress.com/", deepcopy(domitems))
    except Exception as e:
        self.print_failure("Test failed with error: %s" % str(e))
        raise e
    else:
        worm._launch()
        self.print_info("----------------- Crawling finished.")

        def _show(item):
            # Callback applied to every node in the crawled tree.
            print("[%s]Crawled item: %s" % (item.dom_item.name, item.url))

        for crawled in worm.domitems.crawled_items:
            crawled.diverge(_show)
def run(self):
    """Build a Worm from the parsed config, crawl, and finalize the JSON summary."""
    worm = Worm(self.website.url, self.parse_config())
    try:
        worm._launch("smart", force=True)
    except Exception as e:
        # NOTE(review): failure is printed then re-raised; the commented-out
        # handle_exception call was presumably the intended handler — confirm.
        print("Crawled failed with %s" % str(e))
        raise e
        # self.handle_exception(e)
    else:
        summary = worm.jsonify()
        self.finalize(summary)
def testPipeout(self):
    """Pipe out a single-level dom item and print the extracted data."""
    domitems = {
        "name": "category",
        "url": "/politique/index.1.html",
        "selector": "div#mainNav > ul#menu_main > li > a",
    }
    try:
        from copy import deepcopy
        worm = Worm("http://www.hespress.com/", deepcopy(domitems))
    except Exception as exc:
        self.print_failure("Test failed with error: %s" % str(exc))
        raise exc
    else:
        extracted = worm._pipeout(worm.domitems, "")
        self.print_success("Extracted data:\n %s" % extracted)
def finalize(self, summary):
    """Persist the crawl summary, bloom-filter the fresh links, and extract articles.

    summary -- the jsonify()-ed crawl result produced by Worm.
    """
    from newsline.helpers import helpers
    from django.conf import settings

    date = self.format_date()
    crawl_dir = "%s/data/crawls/%s" % (settings.NEWSWORM_DIR, self.website.name)
    summary_path = "%s/%s_summary.json" % (crawl_dir, date)
    bloom_path = "%s/%s_filter.bloom" % (crawl_dir, date)

    helpers.makedir(crawl_dir)
    helpers.write_json(summary_path, summary)

    # Only links not seen in previous crawls are kept.
    fresh_links = self.keep_fresh(Worm.normalize_summary(summary))
    print("Fresh %s" % fresh_links)
    if fresh_links:
        self.bloomfilter(fresh_links, bloom_path)
        crawl_record = self.website.register_crawl(summary_path, bloom_path)
        self.extract_articles(crawl_record, fresh_links)
def testJsonifyMultipage(self):
    """Crawl a three-level category/page/article hierarchy and dump the JSON.

    (An alternate goud.ma configuration existed as commented-out code and
    was removed; see version control if it is needed again.)
    """
    rooturl = "http://www.hespress.com/"
    domitems = {
        "name": 'category',
        "url": '/politique/index.1.html',
        "selector": 'div#mainNav > ul#menu_main > li > a',
        "nested_items": {
            "name": 'page',
            "url": '/politique/index.2.html',
            "selector": 'div#box_pagination > span.pagination > a',
            "nested_items": {
                "name": 'article',
                "url": '/politique/212121.html',
                "selector": 'h2.section_title > a',
            },
        },
    }
    try:
        from copy import deepcopy
        worm = Worm(rooturl, deepcopy(domitems))
        worm._launch("smart")
    except Exception as exc:
        self.print_failure("Test failed with error: %s" % str(exc))
        raise exc
    else:
        self.print_info("----------------- Crawling finished.")
        self.write_to_jsonfile("testJsonfiyMultipage.json", worm.jsonify())
def testCrawlHyperLinks(self):
    """Exercise _crawl_similar_hyperlinks on a category selector, then an article selector."""
    from copy import deepcopy

    def _crawl(config, url_prefix):
        # Shared build-and-crawl phase; failures are reported then re-raised.
        try:
            worm = Worm("http://www.hespress.com/", deepcopy(config))
        except Exception as e:
            self.print_failure("Test failed with error: %s" % str(e))
            raise e
        else:
            links = worm._crawl_similar_hyperlinks(worm.domitems, url_prefix)
            self.print_success("Extracted data:\n %s" % links)

    _crawl({
        "name": "category",
        "url": "/politique/index.1.html",
        "selector": "div#mainNav > ul#menu_main > li > a",
    }, "")
    self.print_seperator()
    _crawl({
        "name": "article",
        "url": "/politique/304866.html",
        "selector": "h2.section_title > a",
    }, "/politique")
def testJsonify(self):
    """Launch a two-level crawl and write the jsonified result to a file."""
    domitems = {
        "name": 'category',
        "url": '/politique/index.1.html',
        "selector": 'div#mainNav > ul#menu_main > li > a',
        "nested_items": {
            "name": 'article',
            "url": '/politique/212121.html',
            "selector": 'h2.section_title > a',
        },
    }
    try:
        from copy import deepcopy
        worm = Worm("http://www.hespress.com/", deepcopy(domitems))
    except Exception as exc:
        self.print_failure("Test failed with error: %s" % str(exc))
        raise exc
    else:
        worm._launch()
        self.print_info("----------------- Crawling finished.")
        self.write_to_jsonfile("testJsonify.json", worm.jsonify())
def testIntegralCrawl(self):
    """Crawl each website in the training-data fixture and persist the results.

    Fixes:
    - ``summary["status"]`` previously stored the exception object itself,
      which is not JSON-serializable and would break ``helpers.write_json``;
      it now stores ``str(e)``.
    - duplicate in-loop imports of helpers/settings removed; the fixture
      path is computed once.
    """
    from newsline.helpers import helpers
    from django.conf import settings
    from copy import deepcopy
    from requests.exceptions import RequestException

    training_path = settings.NEWSLINE_DIR + "/apps/web/newsworm/unittests/core/_files/_input/training_data.json"
    training_data = helpers.parse_json_file(training_path)
    for name, website in training_data.items():
        # NOTE(review): hard-coded filter — only the "assdae" entry is crawled.
        if name != "assdae":
            continue
        print("Crawling %s" % name)
        try:
            worm = Worm(website["root_url"], deepcopy(website["domitems"]))
        except Exception as e:
            self.print_failure("----------------- Crawling failed for [%s] with errors: %s" % (name, str(e)))
            raise e
        else:
            try:
                worm._launch("smart", force=True)
            except RequestException as e:
                self.print_failure("----------------- Crawling halted for . [%s] with :%s" % (name, e))
                summary = worm._summary()
                # Stringify: the raw exception object is not JSON-serializable.
                summary["status"] = str(e)
                # NOTE(review): this overwrites the training-data fixture with a
                # partial summary — looks unintended; confirm the target path.
                helpers.write_json(training_path, summary)
            else:
                self.print_info("----------------- Crawling finished successfully for %s " % name)
                self.write_to_jsonfile(name, worm.jsonify())
                website["status"] = "done"
                helpers.write_json(training_path, training_data)
                self.print_success("Done.")
                self.print_seperator()
def testCrawlWithAutoGen(self):
    """Crawl with an auto-generated page range ("autogen"/"range" on the page item).

    Fix: the RequestException handler referenced the undefined name ``name``
    (copy-pasted from testIntegralCrawl), so any request failure raised a
    NameError that masked the real error; the handler now reports ``rooturl``.

    (Alternate andaluspress/telexpresse configurations existed as
    commented-out code and were removed; see version control if needed.)
    """
    rooturl = "http://www.hespress.com/"
    domitems = {
        "name": "category",
        "url": "/politique/index.1.html",
        "selector": "div#mainNav > ul#menu_main > li > a",
        "nested_items": {
            "name": "page",
            "selector": "div#box_pagination > span.pagination > a",
            "url": "/politique/index.%d.html",
            # NOTE(review): "True" is a string here, while other sample configs
            # used the bool True — confirm which form Worm expects.
            "autogen": "True",
            "range": [0, 2],
            "nested_items": {
                "name": "article",
                "selector": "h2.section_title > a",
                "url": "/politique/212121.html",
            },
        },
    }
    try:
        worm = Worm(rooturl, domitems)
    except Exception as e:
        self.print_failure("----------------- Crawling failed with errors: %s" % (str(e)))
        raise e
    else:
        from requests.exceptions import RequestException
        try:
            worm._launch("smart", force=True)
        except RequestException as e:
            # ``rooturl`` replaces the undefined ``name`` from the original handler.
            self.print_failure("----------------- Crawling halted for . [%s] with :%s \n %s" % (rooturl, e, worm._summary()))
        else:
            self.print_info("----------------- Crawling finished successfully for andaluspress ")
            self.write_to_jsonfile("testAutogen", worm.jsonify())
def testJsonNormalization(self):
    """Load a previously saved crawl summary, normalize it, and print the result."""
    from newsline.helpers import helpers
    from django.conf import settings

    summary_file = settings.NEWSLINE_DIR + "/apps/web/newsworm/unittests/core/_files/_output/akhbarona.json"
    normalized = Worm.normalize(helpers.parse_json_file(summary_file))
    print(normalized)