Esempio n. 1
0
 def __init__(self, category=None, *args, **kwargs):
     """Initialise per-instance crawl state for sogou.com.

     ``category``, ``*args`` and ``**kwargs`` are accepted but are not
     read anywhere in this method.
     """
     # Plain values first.
     self.sitekey = "sogou.com"
     self.temp_id = 0
     self.start_urls = ["http://weixin.sogou.com"]

     # Collaborator objects, constructed in their original relative order.
     self.keysparser = PageKeysParser()
     self.itemsfounder = ItemsFounder()
     self.datamgr = DataMgr()
Esempio n. 2
0
File: jd.py Progetto: colenhyt/tuto
    def __init__(self, category=None, *args, **kwargs):
        """Initialise crawl state for jd.com and seed ``start_urls``.

        Seed URLs are read from ``self.datamgr.geturls`` for this site key.
        When that yields nothing, the jd.com home page becomes the only seed.

        ``category``, ``*args`` and ``**kwargs`` are accepted but are not
        read anywhere in this method.
        """
        self.sitekey = "jd.com"
        self.temp_id = 0
        self.datamgr = DataMgr()

        urls = self.datamgr.geturls(urlkey=self.sitekey)
        # Element 2 of every returned row is used as the seed URL.
        # NOTE(review): the row layout is defined by DataMgr.geturls and the
        # values are assumed to be URL strings -- confirm.
        # The URLs are ordered by their full value. Keying the sort on
        # ``value[2]`` would compare a single character of each URL and raise
        # IndexError for any value shorter than three characters.
        self.start_urls = sorted(uitem[2] for uitem in urls)

        if not self.start_urls:
            self.start_urls.append("http://www.jd.com")

        # Single-argument parenthesised print: emits the same text under the
        # Python 2 print statement and the Python 3 print function.
        print("%s %d" % ("初始化新url:", len(self.start_urls)))

        self.urlsparser = UrlsParser()
        # Only the site key is kept as a URL key; a prefix list assigned
        # here earlier was overwritten immediately and never took effect.
        self.urlkeys = [self.sitekey]
        self.ignorekeys = ["#comments-list", "/adclick", "javascript:"]