def start_requests(self):
    """Yield one Request per 'xw'/'ly' mission, dispatched to parse_<domain>.

    The 'aoyou' domain additionally gets meta['simulate'] = True
    (presumably to enable browser simulation downstream -- TODO confirm
    against the downloader middleware).
    """
    missions = session.getMission(sum_mark='xw', child_mark='ly')
    for m in missions:
        try:
            domain = urltools.get_domain(m[0])
            # Build meta once; only the 'aoyou' domain needs the extra flag
            # (the original duplicated the whole dict literal per branch).
            meta = {'mission': m, 'spider': self.name, 'domain': domain}
            if domain == 'aoyou':
                meta['simulate'] = True
            yield Request(url=m[0], meta=meta,
                          callback=getattr(self, 'parse_%s' % domain))
        except Exception:
            # Best-effort: skip missions with bad URLs or no parse_* handler.
            # Narrowed from a bare except so SystemExit/KeyboardInterrupt
            # still propagate and Ctrl-C can stop the spider.
            continue
def start_requests(self):
    """Yield a Request for each configured start URL, dispatched to parse_<domain>."""
    for url in self.start_urls:
        try:
            meta = {"spider": self.name, "domain": urltools.get_domain(url)}
            yield Request(url=url, meta=meta,
                          callback=getattr(self, "parse_%s" % meta["domain"]))
        except Exception:
            # Best-effort: skip URLs whose domain has no parse_* handler.
            # Narrowed from a bare except so SystemExit/KeyboardInterrupt
            # are no longer swallowed.
            continue
def start_requests(self):
    """Yield one Request per 'dx'/'dx' mission, dispatched to parse_<domain>.

    Note: unlike the sibling spiders, meta carries no 'domain' key here --
    preserved as-is because the parse_* callbacks may rely on that shape.
    """
    missions = session.getMission(sum_mark='dx', child_mark='dx')
    for m in missions:
        try:
            # Compute the domain once instead of inline in the callback lookup.
            domain = urltools.get_domain(m[0])
            meta = {'mission': m, 'spider': self.name}
            yield Request(url=m[0], meta=meta,
                          callback=getattr(self, 'parse_%s' % domain))
        except Exception:
            # Best-effort skip; narrowed from a bare except so
            # SystemExit/KeyboardInterrupt still propagate.
            continue
def start_requests(self):
    """Yield Requests for the two hard-coded Kitco quote pages.

    Both responses go to the fixed parse_kitco callback (no per-domain
    dispatch here, unlike the sibling spiders).
    """
    missions = ['http://m.kitco.cn/gold.html', 'http://m.kitco.cn/silver.html']
    for url in missions:
        try:
            meta = {'spider': self.name,
                    'domain': urltools.get_domain(url),
                    'mission': url}
            yield Request(url=url, meta=meta, callback=self.parse_kitco)
        except Exception:
            # Best-effort skip; narrowed from a bare except so
            # SystemExit/KeyboardInterrupt still propagate.
            continue
def start_requests(self):
    """Yield one sinajs quote Request per 'ghb'/'gp' mission.

    The request URL is built from m[8] -- presumably a Sina stock list
    code (e.g. 'sh600000') rather than a URL; get_domain(m[8]) is kept
    as the original wrote it. NOTE(review): confirm get_domain yields a
    meaningful value for a ticker code.
    """
    missions = session.getMission(sum_mark='ghb', child_mark='gp')
    for m in missions:
        try:
            meta = {'spider': self.name,
                    'domain': urltools.get_domain(m[8]),
                    'mission': m}
            yield Request(url='http://hq.sinajs.cn/list=' + m[8],
                          meta=meta, callback=self.parse_sinajs)
        except Exception:
            # Best-effort skip (e.g. missions shorter than 9 fields);
            # narrowed from a bare except so SystemExit/KeyboardInterrupt
            # still propagate.
            continue
def start_requests(self):
    """Yield one Request per entry in self.missions, dispatched to parse_<domain>."""
    for m in self.missions:
        try:
            meta = {'spider': self.name,
                    'domain': urltools.get_domain(m[0]),
                    'mission': m}
            yield Request(url=m[0], meta=meta,
                          callback=getattr(self, 'parse_%s' % meta['domain']))
        except Exception:
            # Best-effort: skip missions with bad URLs or no parse_* handler.
            # Narrowed from a bare except so SystemExit/KeyboardInterrupt
            # still propagate.
            continue
def start_requests(self):
    """Yield one Request per 'xw'/'ss' mission, dispatched to parse_<domain>.

    Domains containing 'chinawatch-clock' are renamed to
    'chinawatch_clock' because a hyphen cannot appear in a Python
    method name (parse_chinawatch_clock).
    """
    missions = session.getMission(sum_mark='xw', child_mark='ss')
    for m in missions:
        try:
            meta = {'spider': self.name,
                    'domain': urltools.get_domain(m[0]),
                    'mission': m}
            if 'chinawatch-clock' in meta['domain']:
                meta['domain'] = "chinawatch_clock"
            yield Request(url=m[0], meta=meta,
                          callback=getattr(self, 'parse_%s' % meta['domain']))
        except Exception:
            # Best-effort skip; narrowed from a bare except so
            # SystemExit/KeyboardInterrupt still propagate.
            continue
def run(): parser = OptionParser() parser.add_option("-t", "--tpl", dest="template_name", help="specified a template") parser.add_option("-u", "--url", dest="request_url", help="specified a request url as 'http://www.baidu.com/'") (options, args) = parser.parse_args() tpl = options.template_name url = options.request_url if not tpl or not url: sys.stderr.write("Type './testunit.py --help' for usage.\n") sys.exit(1) spider = Crawl(url) spider.fetch() pm = ParserManager(urltools.get_domain(url)) p = pm.create(tpl, response = spider.response) item = p.extract() for kv in item.withdict().items(): print u'[%s]: %s' % kv