Esempio n. 1
0
    def fetchListPages(self, listtype="html"):
        """Fetch every list URL of this seed and hand each page to a parser.

        The URLs come from ``self.listRule.getListUrls()``. Each one is
        fetched with the charset and timeout stored in ``self.seed``; a page
        whose fetch does not report ready is skipped.

        Args:
            listtype: ``"html"`` (default) routes each page to
                ``parseListPage``; ``"json"`` routes it to
                ``parseJsonPage``. Any other value still fetches and reads
                every page but parses nothing.
        """
        # Single-argument print(...) calls behave the same as the print
        # statement on Python 2 and are also valid on Python 3.
        print("Start to fetch and parse List")

        for url in self.listRule.getListUrls():
            # Read the seed settings per URL (not once up front) so an empty
            # URL list never touches the seed, exactly as before.
            charset = self.seed["charset"]
            timeout = self.seed["timeout"]
            # The double space after the colon reproduces the output of the
            # original comma-separated print statement.
            print("Fetching list page:  %s charset: %s timeout: %s"
                  % (url, safestr(charset), safestr(timeout)))

            f = Fetch(url, charset=charset, timeout=timeout)
            if not f.isReady():
                continue

            doc = f.read()
            if listtype == "html":
                self.parseListPage(f, doc, url)
            elif listtype == "json":
                self.parseJsonPage(f, doc, url)
            # NOTE(review): unknown list types are silently ignored; confirm
            # whether callers rely on that before turning it into an error.

        print("List has finished parsing. It has %s docs."
              % ansicolor.red(len(self)))
Esempio n. 2
0
 def runTest(self):
     """Fetch a fixed live URL and print what ``Fetch`` reports about it.

     When the fetcher reports a connection, the page is read and the
     results of ``getCharset()``, ``getCode()`` and ``isReady()`` are
     printed on one line. Errors derived from ``Exception`` are printed
     instead of propagated, so a network failure does not abort the caller.
     """
     try:
         f = Fetch("http://tga.plu.cn")
         if f.connected:
             f.read()
             # One formatted string keeps the space-separated output while
             # remaining valid on both Python 2 and Python 3.
             print("%s %s %s" % (f.getCharset(), f.getCode(), f.isReady()))
     except Exception as e:
         # Deliberately best-effort: report the problem and carry on.
         # NOTE(review): if this is meant to be a real test case, printing
         # the error means it can never fail - confirm that is intended.
         print(e)
Esempio n. 3
0
    def parseFeed(self):
        """Fetch the seed's feed and register one ``Item`` per feed entry.

        The feed is fetched from ``self.seed.prefixurl`` using the seed's
        charset and timeout, then parsed with ``feedparser``. Each entry
        becomes an ``Item`` carrying the entry's link as ``"url"`` and
        ``self.seed_type`` as ``"type"``; it is stored in ``self.items``
        under the key returned by ``self.getItemGUID``. Nothing is stored
        when the fetch does not report ready.

        Side effect: when at least one entry exists and ``self.guid_rule``
        is ``None``, it is set to ``"url"``.
        """
        # Single-argument print(...) calls behave the same as the print
        # statement on Python 2 and are also valid on Python 3.
        print("Start to fetch and parse Feed list")

        seed = self.seed
        f = Fetch(seed.prefixurl, seed.charset, seed.timeout)
        if f.isReady():
            feed = feedparser.parse(f.read())
            entries = feed["entries"]

            # The default GUID rule does not depend on the entry, so set it
            # once; only when there are entries, matching the old behaviour.
            if entries and self.guid_rule is None:
                self.guid_rule = "url"

            for entry in entries:
                parsed = Item({
                    "url": entry["link"],
                    "type": self.seed_type,
                })

                # NOTE(review): the GUID is derived from the raw feed entry,
                # not from the Item built above. The Item carries "url" while
                # the entry is read via "link" - confirm getItemGUID copes
                # with the default "url" rule.
                guid = self.getItemGUID(entry)
                self.items[guid] = parsed

        print("List has finished parsing. It has %s docs."
              % ansicolor.red(len(self)))