def __init__(self, category=None, *args, **kwargs):
    """Initialise instance state for the sogou.com site.

    ``category`` and any extra positional/keyword arguments are accepted
    but are not used by this initialiser.
    """
    # Plain configuration values.
    self.sitekey = "sogou.com"
    self.temp_id = 0
    self.start_urls = ["http://weixin.sogou.com"]

    # Collaborator objects, created in the same order as before.
    self.keysparser = PageKeysParser()
    self.itemsfounder = ItemsFounder()
    self.datamgr = DataMgr()
def __init__(self, category=None, *args, **kwargs):
    """Initialise instance state for the jd.com site.

    The start URL list is filled from ``DataMgr.geturls`` for this site
    key; when that yields nothing, the jd.com home page is used instead.

    ``category`` and any extra positional/keyword arguments are accepted
    but are not used by this initialiser.
    """
    self.sitekey = "jd.com"
    self.temp_id = 0
    self.datamgr = DataMgr()

    # Element 2 of every stored row is taken as the link.
    stored_rows = self.datamgr.geturls(urlkey=self.sitekey)
    links = [row[2] for row in stored_rows]
    # NOTE(review): the sort key indexes [2] again on the already-extracted
    # value; if those values are URL strings this orders them by their
    # third character. Kept as-is -- confirm the intended ordering against
    # the row layout returned by DataMgr.geturls.
    self.start_urls = sorted(links, key=lambda d: d[2])

    if not self.start_urls:
        # Nothing stored for this site: fall back to the home page.
        self.start_urls.append("http://www.jd.com")

    # Single-argument print with %-formatting emits the same text under
    # both the Python 2 print statement and the Python 3 print function.
    print("初始化新url: %d" % len(self.start_urls))

    self.urlsparser = UrlsParser()
    # A prefix list ("http://channel.", "http://list.", "http://item.") used
    # to be assigned here and was immediately overwritten; only the site key
    # ever took effect, so the dead assignment has been dropped.
    self.urlkeys = [self.sitekey]
    self.ignorekeys = ["#comments-list", "/adclick", "javascript:"]