コード例 #1
0
ファイル: worm.py プロジェクト: derrandz/python-news-crawler
	def validate(self, domitems):
		"""Return ``domitems`` unchanged once its shape has been checked.

		A value accepted by ``helpers.is_list`` must contain only elements
		accepted by ``helpers.is_dict``; any other value must itself be
		accepted by ``helpers.is_dict``.

		Raises:
			Exception: when a list holds a non-dict element, or when a
				non-list value is not a dict.
		"""
		if helpers.is_list(domitems):
			# Stop at the first offending element, like all() would.
			for entry in domitems:
				if not helpers.is_dict(entry):
					raise Exception("The domitems list expects all elements to be dictionaries, some aren't")
			return domitems

		if helpers.is_dict(domitems):
			return domitems

		raise Exception("The domitems expects a dictionary element, %s given" % type(domitems))
コード例 #2
0
ファイル: worm.py プロジェクト: derrandz/python-news-crawler
	def domitems(self, domitems):
		"""Build crawler item object(s) from ``domitems`` and store the result in ``self._domitems``.

		A dict description becomes a ``WDIAutoGen`` when it has an 'autogen'
		key and a ``WDomItem`` otherwise. For a list, every element is built
		in order and each build is assigned to ``self._domitems``. Input that
		is neither a list nor a dict leaves ``self._domitems`` untouched.

		Raises:
			Exception: when a list contains an element rejected by
				``helpers.is_dict``.
			KeyError: when a description lacks 'name', 'url' or (for the
				non-autogen form) 'selector'.
		"""
		def _build(spec):
			# 'nested_items', 'range' and 'parentless' are optional keys.
			nested = spec.get('nested_items')
			if 'autogen' in spec:
				return WDIAutoGen(spec['name'], spec['url'], nested, spec['autogen'], spec.get('range'), spec.get('parentless', False))
			return WDomItem(spec['name'], spec['url'], spec['selector'], nested)

		if helpers.is_list(domitems):
			# Validate the whole list before building anything.
			for candidate in domitems:
				if not helpers.is_dict(candidate):
					raise Exception("The domitems list expects all elements to be dictionaries, some aren't")

			# NOTE(review): each iteration overwrites the previous value, so only
			# the item built from the last element is kept -- confirm this is intended.
			for spec in domitems:
				self._domitems = _build(spec)
		elif helpers.is_dict(domitems):
			self._domitems = _build(domitems)
コード例 #3
0
ファイル: worm.py プロジェクト: derrandz/python-news-crawler
	def nested_items(self, ni):
		"""Reset or grow ``self._nested_items`` according to ``ni``.

		* ``None`` resets the collection to an empty list.
		* A ``DomItem`` instance is appended as is.
		* A dict is turned into a ``WDIAutoGen`` (when it has an 'autogen'
		  key) or a ``DomItem``, then appended.
		* A non-empty list made only of ``DomItem`` instances and dicts is
		  converted element by element and appended in order; nothing is
		  appended if any conversion fails.

		Any other input only guarantees that ``self._nested_items`` exists.

		Raises:
			Exception: when an empty list is supplied, or when building an
				item from a dict fails (the original message is embedded).
		"""
		from newsline.helpers import helpers

		def _autogen(spec):
			# Shared construction of the auto-generated flavour; optional keys fall back to None / False.
			children = spec['nested_items'] if 'nested_items' in spec else None
			span = spec['range'] if 'range' in spec else None
			orphan = spec['parentless'] if 'parentless' in spec else False
			return WDIAutoGen(spec['name'], spec['url'], children, spec['autogen'], span, orphan)

		# The initialization case
		if ni is None:
			self._nested_items = []
			return

		if not hasattr(self, "_nested_items"):
			self._nested_items = []

		if isinstance(ni, DomItem):
			self._nested_items.append(ni)
		elif helpers.is_dict(ni):
			try:
				if 'autogen' in ni:
					built = _autogen(ni)
				elif 'nested_items' in ni:
					built = DomItem(ni['name'], ni['url'], ni['selector'], ni['nested_items'])
				else:
					# No fourth argument here, unlike the list branch below.
					built = DomItem(ni['name'], ni['url'], ni['selector'])
				self._nested_items.append(built)
			except Exception as e:
				raise Exception("DomItem nested element exception : %s" % str(e))
		elif helpers.is_list(ni):
			if helpers.is_empty(ni):
				raise Exception("You cannot supply nested_items as empty")

			if all(isinstance(entry, (DomItem, dict)) for entry in ni):
				try:
					# Convert into a scratch list first so a failure appends nothing.
					converted = []
					for entry in ni:
						if not isinstance(entry, dict):
							converted.append(entry)
						elif 'autogen' in entry:
							converted.append(_autogen(entry))
						else:
							converted.append(DomItem(entry['name'], entry['url'], entry['selector'], entry['nested_items'] if 'nested_items' in entry else None))
					self._nested_items.extend(converted)
				except Exception as e:
					raise Exception("DomItem nested element exception : %s" % str(e))
コード例 #4
0
ファイル: worm.py プロジェクト: derrandz/python-news-crawler
	def clean(self, domitems):
		"""Run ``self.regexr.remove_double_slash`` over the url(s) in ``domitems``.

		A string is passed straight to the cleaner. A dict, or each element
		of a list, is handed to ``helpers.map_dictionary`` with the cleaner
		and the key "url". Any other input yields ``None``.
		"""
		if helpers.is_str(domitems):
			return self.regexr.remove_double_slash(domitems)

		if helpers.is_dict(domitems):
			return helpers.map_dictionary(self.regexr.remove_double_slash, domitems, "url")

		if helpers.is_list(domitems):
			# Look the cleaner up once, then apply it to every element.
			cleaner = self.regexr.remove_double_slash
			return [helpers.map_dictionary(func=cleaner, dictionary=entry, key="url") for entry in domitems]

		return None
コード例 #5
0
ファイル: worm.py プロジェクト: derrandz/python-news-crawler
	def decode(self, domitems):
		"""Run ``self.regexr.parse_arabic_urls`` over the url(s) in ``domitems``.

		A string is passed straight to the parser. A dict, or each element
		of a list, is handed to ``helpers.map_dictionary`` with the parser
		and the key "url". Any other input yields ``None``.
		"""
		if helpers.is_str(domitems):
			return self.regexr.parse_arabic_urls(domitems)

		if helpers.is_dict(domitems):
			return helpers.map_dictionary(self.regexr.parse_arabic_urls, domitems, "url")

		if helpers.is_list(domitems):
			# Look the parser up once, then apply it to every element.
			parser = self.regexr.parse_arabic_urls
			return [helpers.map_dictionary(func=parser, dictionary=entry, key="url") for entry in domitems]

		return None
コード例 #6
0
ファイル: worm.py プロジェクト: derrandz/python-news-crawler
	def normalize(self, domitems):
		"""Run ``self.remove_rooturl`` over the url(s) in ``domitems``.

		A string is passed straight to ``remove_rooturl``. A dict, or each
		element of a list, is handed to ``helpers.map_dictionary`` with
		``remove_rooturl`` and the key "url". Any other input yields ``None``.
		"""
		if helpers.is_str(domitems):
			return self.remove_rooturl(domitems)

		if helpers.is_dict(domitems):
			return helpers.map_dictionary(self.remove_rooturl, domitems, "url")

		if helpers.is_list(domitems):
			# Bind the method once, then apply it to every element.
			strip_root = self.remove_rooturl
			return [helpers.map_dictionary(func=strip_root, dictionary=entry, key="url") for entry in domitems]

		return None
コード例 #7
0
    def realCaseTest(self):
        """Instantiate a three-level ``DomItem`` from literal data and print what it exposes.

        On construction failure the error is printed, followed by a
        separator, and the method returns. Otherwise the item's name, url,
        selector and ``has_nested_items`` flag are printed, then the same
        for its nested item and that item's nested item when they are
        exposed as single objects.
        """
        def describe(label, item):
            # Same four attributes, in the same order, for every nesting level.
            self.print_with_color("DARKCYAN", "%s name: %s" % (label, item.name))
            self.print_with_color("DARKCYAN", "%s url: %s" % (label, item.url))
            self.print_with_color("DARKCYAN", "%s selector: %s" % (label, item.domselector))
            self.print_with_color("DARKCYAN", "%s has_nested_items: %s" % (label, item.has_nested_items))

        try:
            domitem = DomItem(
                'category_item', '/category/politics', 'nav > ul > li > a', {
                    "name": 'pagination',
                    "url": '/category/politics/page1',
                    "selector": 'div.pagination > ul > li > a',
                    "nested_items": {
                        "name": 'articles',
                        "url": '/article/123123.html',
                        "selector": 'h2 > a'
                    }
                })
        except Exception as e:
            self.print_failure("Test failed with :%s" % str(e))
            self.print_seperator()
            return

        self.print_success("Dom Item instantiation successful")
        describe("DomItem", domitem)

        # NOTE(review): the success banner and separator at the end are only
        # reached when the top-level item exposes a single (non-list) nested
        # item; the two early returns below print neither -- confirm intended.
        if not domitem.has_nested_items:
            return

        self.print_success("\tDom Item has nested items")
        from newsline.helpers import helpers
        if helpers.is_list(domitem.nested_items):
            self.print_with_color("DARKCYAN", "\tNested DomItems are many")
            return

        nitem = domitem.nested_items
        describe("\tNested DomItem", nitem)

        if nitem.has_nested_items:
            self.print_success("\t\tNested Dom Item has nested items")
            # NOTE(review): this level tests is_dict while the level above tests
            # is_list for the same "are many" message -- confirm which is meant.
            if helpers.is_dict(nitem.nested_items):
                self.print_with_color(
                    "DARKCYAN", "\tNested DomItems nested items are many")
            else:
                describe("\t\tNested DomItem", nitem.nested_items)

        self.print_success("Test passed successfully")
        self.print_seperator()