Ejemplo n.º 1
0
	def testPipeoutToCrawledItem(self):
		"""Pipe out the root DomItem, then crawl each resulting item's subitems."""
		spec = {
			"name": 'category',
			"url": '/politique/index.1.html',
			"selector": 'div#mainNav > ul#menu_main > li > a',
			"nested_items": {
				"name": 'article',
				"url": '/politique/212121.html',
				"selector": 'h2.section_title > a'
			}
		}

		try:
			from copy import deepcopy
			worm = Worm("http://www.hespress.com/", deepcopy(spec))
		except Exception as e:
			self.print_failure("Test failed with error: %s" % str(e))
			raise e
		else:
			worm.domitems.crawled_items = worm._pipeout(worm.domitems, "")
			self.print_success("----------------- Piped out : %s" % worm.domitems.crawled_items)

			self.print_info("----------------- Crawling subitems")
			for item in worm.domitems.crawled_items:
				self.print_success("----------------- Crawling: %s" % item.url)
				self.print_success("----------------- Respective DomItem: %s" % item.dom_item.name)
				worm._pipeout_to_crawled_item(item, 'smart')
				self.print_success("Extracted data:%s\n" % item.nested_items)
Ejemplo n.º 2
0
	def testLaunch(self):
		"""Run a full crawl and print every crawled item via diverge()."""
		spec = {
			"name": 'category',
			"url": '/politique/index.1.html',
			"selector": 'div#mainNav > ul#menu_main > li > a',
			"nested_items": {
				"name": 'article',
				"url": '/politique/212121.html',
				"selector": 'h2.section_title > a'
			}
		}

		try:
			from copy import deepcopy
			worm = Worm("http://www.hespress.com/", deepcopy(spec))
		except Exception as e:
			self.print_failure("Test failed with error: %s" % str(e))
			raise e
		else:
			worm._launch()
			self.print_info("----------------- Crawling finished.")

			def show(crawleditem):
				print("[%s]Crawled item: %s" % (crawleditem.dom_item.name, crawleditem.url))

			for item in worm.domitems.crawled_items:
				item.diverge(show)
Ejemplo n.º 3
0
    def run(self):
        """Crawl the configured website in smart mode and finalize its summary.

        Raises: re-raises whatever the crawl raised after logging it.
        """
        worm = Worm(self.website.url, self.parse_config())

        try:
            worm._launch("smart", force=True)
        except Exception as e:
            print("Crawled failed with %s" % str(e))
            # Bare `raise` re-raises with the original traceback intact
            # (`raise e` would restart the traceback from this frame).
            raise
            # self.handle_exception(e)
        else:
            self.finalize(worm.jsonify())
Ejemplo n.º 4
0
	def testPipeout(self):
		"""Pipe out the root DomItem and print whatever was extracted."""
		spec = {
			"name": "category",
			"url": "/politique/index.1.html",
			"selector": "div#mainNav > ul#menu_main > li > a",
		}

		try:
			from copy import deepcopy
			worm = Worm("http://www.hespress.com/", deepcopy(spec))
		except Exception as e:
			self.print_failure("Test failed with error: %s" % str(e))
			raise e
		else:
			self.print_success("Extracted data:\n %s" % worm._pipeout(worm.domitems, ""))
Ejemplo n.º 5
0
    def finalize(self, summary):
        """Persist the crawl *summary* and hand fresh links off for extraction.

        Writes the raw summary JSON under the per-website crawl directory,
        normalizes it, filters out links seen in previous crawls, and — when
        fresh links remain — records a bloom filter and registers the crawl.
        """
        from newsline.helpers import helpers
        from django.conf import settings

        date = self.format_date()

        dirpath = "%s/data/crawls/%s" % (settings.NEWSWORM_DIR,
                                         self.website.name)
        sumpath = "%s/%s_summary.json" % (dirpath, date)
        bloomfilterpath = "%s/%s_filter.bloom" % (dirpath, date)

        helpers.makedir(dirpath)

        # Pass the path directly; the former `"%s" % sumpath` wrapper was a no-op.
        helpers.write_json(sumpath, summary)

        nsummary = Worm.normalize_summary(summary)

        fresh_links = self.keep_fresh(nsummary)

        print("Fresh %s" % fresh_links)

        if fresh_links:
            self.bloomfilter(fresh_links, bloomfilterpath)
            self.extract_articles(
                self.website.register_crawl(sumpath, bloomfilterpath),
                fresh_links)
Ejemplo n.º 6
0
	def testJsonifyMultipage(self):
		"""Launch a three-level crawl (category > page > article) in smart mode
		and dump the resulting JSON summary to a file."""
		rooturl = "http://www.hespress.com/"
		spec = {
			"name": 'category',
			"url": '/politique/index.1.html',
			"selector": 'div#mainNav > ul#menu_main > li > a',
			"nested_items": {
				"name": 'page',
				"url": '/politique/index.2.html',
				"selector": 'div#box_pagination > span.pagination > a',
				"nested_items": {
					"name": 'article',
					"url": '/politique/212121.html',
					"selector": 'h2.section_title > a'
				}
			}
		}

		# An alternate goud.ma configuration (category/page/article with
		# percent-encoded Arabic URLs) was previously kept here commented out;
		# recover it from version control if needed for manual experiments.

		try:
			from copy import deepcopy
			worm = Worm(rooturl, deepcopy(spec))
			worm._launch("smart")
		except Exception as e:
			self.print_failure("Test failed with error: %s" % str(e))
			raise e
		else:
			self.print_info("----------------- Crawling finished.")
			self.write_to_jsonfile("testJsonfiyMultipage.json", worm.jsonify())
Ejemplo n.º 7
0
	def testCrawlHyperLinks(self):
		"""Crawl similar hyperlinks for a category selector, then an article selector."""
		def _attempt(spec, prefix):
			# Shared build-and-crawl step for both DomItem configurations.
			try:
				from copy import deepcopy
				worm = Worm("http://www.hespress.com/", deepcopy(spec))
			except Exception as e:
				self.print_failure("Test failed with error: %s" % str(e))
				raise e
			else:
				self.print_success("Extracted data:\n %s" % worm._crawl_similar_hyperlinks(worm.domitems, prefix))

		_attempt({
			"name": "category",
			"url": "/politique/index.1.html",
			"selector": "div#mainNav > ul#menu_main > li > a",
		}, "")

		self.print_seperator()

		_attempt({
			"name": "article",
			"url": "/politique/304866.html",
			"selector": "h2.section_title > a",
		}, "/politique")
Ejemplo n.º 8
0
	def testJsonify(self):
		"""Crawl category > article and dump the JSON summary to a file."""
		spec = {
			"name": 'category',
			"url": '/politique/index.1.html',
			"selector": 'div#mainNav > ul#menu_main > li > a',
			"nested_items": {
				"name": 'article',
				"url": '/politique/212121.html',
				"selector": 'h2.section_title > a'
			}
		}

		try:
			from copy import deepcopy
			worm = Worm("http://www.hespress.com/", deepcopy(spec))
		except Exception as e:
			self.print_failure("Test failed with error: %s" % str(e))
			raise e
		else:
			worm._launch()
			self.print_info("----------------- Crawling finished.")
			self.write_to_jsonfile("testJsonify.json", worm.jsonify())
Ejemplo n.º 9
0
	def testIntegralCrawl(self):
		"""Crawl every website in the training data (currently limited to "assdae")
		and write each successful crawl's JSON summary to disk."""
		from copy import deepcopy

		from newsline.helpers import helpers
		from django.conf import settings
		from requests.exceptions import RequestException

		training_path = settings.NEWSLINE_DIR + "/apps/web/newsworm/unittests/core/_files/_input/training_data.json"
		training_data = helpers.parse_json_file(training_path)

		for name, website in training_data.items():
			# Restricted to a single site while the crawler is being tuned.
			if name != "assdae":
				continue

			print("Crawling %s" % name)
			try:
				worm = Worm(website["root_url"], deepcopy(website["domitems"]))
			except Exception as e:
				self.print_failure("----------------- Crawling failed for [%s] with errors: %s" % (name, str(e)))
				# Bare raise keeps the original traceback.
				raise
			else:
				try:
					worm._launch("smart", force=True)
				except RequestException as e:
					self.print_failure("-----------------  Crawling halted for . [%s] with :%s" % (name, e))
					summary = worm._summary()
					# Bug fix: store str(e) — a raw exception object is not
					# JSON-serializable, so write_json would blow up.
					summary["status"] = str(e)
					# NOTE(review): this overwrites the training-data *input* file
					# with the crawl summary — looks unintended; confirm the
					# intended output path.
					helpers.write_json(training_path, summary)
				else:
					self.print_info("-----------------  Crawling finished successfully for %s " % name)
					self.write_to_jsonfile(name, worm.jsonify())

					website["status"] = "done"
					helpers.write_json(training_path, training_data)

		self.print_success("Done.")
		self.print_seperator()
Ejemplo n.º 10
0
	def testCrawlWithAutoGen(self):
		"""Crawl hespress using auto-generated page URLs.

		The "page" level sets "autogen"/"range" so the crawler expands the
		"/politique/index.%d.html" pattern instead of discovering page links.
		"""
		from requests.exceptions import RequestException

		# Alternate site configurations (andaluspress, telexpresse) were
		# previously kept here commented out; recover them from version
		# control if needed for manual experiments.

		site_name = "hespress"
		rooturl = "http://www.hespress.com/"
		domitems = {
			"name": "category",
			"url": "/politique/index.1.html",
			"selector": "div#mainNav > ul#menu_main > li > a",
			"nested_items": {
				"name": "page",
				"selector": "div#box_pagination > span.pagination > a",
				"url": "/politique/index.%d.html",
				# NOTE(review): the string "True" (not the bool) — verify Worm
				# only tests truthiness rather than comparing against True.
				"autogen": "True",
				"range": [0, 2],
				"nested_items": {
					"name": "article",
					"selector": "h2.section_title > a",
					"url": "/politique/212121.html"
				},
			},
		}

		try:
			worm = Worm(rooturl, domitems)
		except Exception as e:
			self.print_failure("----------------- Crawling failed with errors: %s" % (str(e)))
			# Bare raise keeps the original traceback.
			raise
		else:
			try:
				worm._launch("smart", force=True)
			except RequestException as e:
				# Bug fix: `name` was undefined here (NameError on the failure
				# path); report the site actually being crawled.
				self.print_failure("-----------------  Crawling halted for . [%s] with :%s \n %s" % (site_name, e, worm._summary()))
			else:
				# Bug fix: success message hard-coded "andaluspress" while
				# crawling hespress.
				self.print_info("-----------------  Crawling finished successfully for %s " % site_name)
				self.write_to_jsonfile("testAutogen", worm.jsonify())
Ejemplo n.º 11
0
	def testJsonNormalization(self):
		"""Normalize a previously saved crawl summary and print the result."""
		from newsline.helpers import helpers
		from django.conf import settings

		summary_path = settings.NEWSLINE_DIR + "/apps/web/newsworm/unittests/core/_files/_output/akhbarona.json"
		print(Worm.normalize(helpers.parse_json_file(summary_path)))