Example #1
0
    def fetchListPages(self, listtype="html"):
        print "Start to fetch and parse List"
	urls = self.listRule.getListUrls()
        for url in urls:
	    print "Fetching list page: ", url, "charset:", safestr(self.seed["charset"]), "timeout:", safestr(self.seed["timeout"])
            f = Fetch(url, charset = self.seed["charset"], timeout = self.seed["timeout"])
	    if f.isReady():
		doc = f.read()

		if listtype == "html":
		    self.parseListPage(f, doc, url)
		elif listtype == "json":
		    self.parseJsonPage(f, doc, url)

        print "List has finished parsing. It has %s docs." % ansicolor.red(self.__len__())
Example #2
0
    def parseFeed(self):
        print "Start to fetch and parse Feed list"
        seed = self.seed
        f = Fetch(seed.prefixurl, seed.charset, self.seed.timeout);
	if f.isReady():
	    feed = feedparser.parse(f.read())
	    items = feed["entries"]
	    if len(items) > 0:
		for item in items:
		    _item = Item({
			"url" : item["link"],
			"type" : self.seed_type
		    })

		    if self.guid_rule is None:
			self.guid_rule = "url"

		    guid = self.getItemGUID(item)
		    self.items[guid] = _item

        print "List has finished parsing. It has %s docs." % ansicolor.red(self.__len__())