Exemple #1
0
 def start_requests(self):
     """Yield one ``Request`` per mission fetched with marks 'xw' / 'ly'.

     For every mission ``m`` the URL is ``m[0]`` and the callback is the
     method ``parse_<domain>`` looked up on this spider, where ``<domain>``
     is whatever ``urltools.get_domain(m[0])`` returns.  The request meta
     carries the mission, the spider name and the domain; missions whose
     domain is ``'aoyou'`` additionally get ``'simulate': True``.

     A mission whose request cannot be built (bad row, missing parse
     method, invalid URL, ...) is logged and skipped so that one bad entry
     does not abort the whole crawl.
     """
     import logging

     log = logging.getLogger(__name__)
     missions = session.getMission(sum_mark='xw', child_mark='ly')
     for m in missions:
         try:
             domain = urltools.get_domain(m[0])
             meta = {'mission': m, 'spider': self.name, 'domain': domain}
             if domain == 'aoyou':
                 meta['simulate'] = True
             request = Request(url=m[0],
                               meta=meta,
                               callback=getattr(self, 'parse_%s' % domain))
         except Exception:
             # Best-effort: skip this mission, but leave a trace of why.
             log.warning("start_requests: skipping mission %r", m,
                         exc_info=True)
             continue
         # Yield outside the try block: a bare ``except`` around ``yield``
         # would swallow GeneratorExit and break generator.close().
         yield request
Exemple #2
0
 def start_requests(self):
     """Yield one ``Request`` per entry of ``self.start_urls``.

     The callback is the method ``parse_<domain>`` looked up on this
     spider, where ``<domain>`` is whatever ``urltools.get_domain(url)``
     returns.  The request meta carries the spider name and the domain.

     A URL whose request cannot be built (no matching parse method,
     invalid URL, ...) is logged and skipped instead of aborting the crawl.
     """
     import logging

     log = logging.getLogger(__name__)
     for m in self.start_urls:
         try:
             meta = {"spider": self.name, "domain": urltools.get_domain(m)}
             request = Request(url=m,
                               meta=meta,
                               callback=getattr(self, "parse_%s" % meta["domain"]))
         except Exception:
             # Best-effort: skip this URL, but leave a trace of why.
             log.warning("start_requests: skipping url %r", m, exc_info=True)
             continue
         # Yield outside the try block: a bare ``except`` around ``yield``
         # would swallow GeneratorExit and break generator.close().
         yield request
Exemple #3
0
 def start_requests(self):
     """Yield one ``Request`` per mission fetched with marks 'dx' / 'dx'.

     For every mission ``m`` the URL is ``m[0]`` and the callback is the
     method ``parse_<domain>`` looked up on this spider, where ``<domain>``
     is whatever ``urltools.get_domain(m[0])`` returns.  The request meta
     carries the mission and the spider name only.

     A mission whose request cannot be built is logged and skipped so that
     one bad entry does not abort the whole crawl.
     """
     import logging

     log = logging.getLogger(__name__)
     missions = session.getMission(sum_mark='dx', child_mark='dx')
     for m in missions:
         try:
             meta = {'mission': m, 'spider': self.name}
             handler = getattr(self, 'parse_%s' % urltools.get_domain(m[0]))
             request = Request(url=m[0], meta=meta, callback=handler)
         except Exception:
             # Best-effort: skip this mission, but leave a trace of why.
             log.warning("start_requests: skipping mission %r", m,
                         exc_info=True)
             continue
         # Yield outside the try block: a bare ``except`` around ``yield``
         # would swallow GeneratorExit and break generator.close().
         yield request
Exemple #4
0
 def start_requests(self):
     """Yield one ``Request`` for each of the two hard-coded kitco pages.

     Every request is handled by ``self.parse_kitco``.  The request meta
     carries the spider name, the value of ``urltools.get_domain(url)`` and
     the URL itself under the ``'mission'`` key.

     A URL whose request cannot be built is logged and skipped instead of
     aborting the crawl.
     """
     import logging

     log = logging.getLogger(__name__)
     missions = ['http://m.kitco.cn/gold.html', 'http://m.kitco.cn/silver.html']
     for m in missions:
         try:
             meta = {'spider': self.name,
                     'domain': urltools.get_domain(m),
                     'mission': m}
             request = Request(url=m,
                               meta=meta,
                               callback=self.parse_kitco)
         except Exception:
             # Best-effort: skip this URL, but leave a trace of why.
             log.warning("start_requests: skipping url %r", m, exc_info=True)
             continue
         # Yield outside the try block: a bare ``except`` around ``yield``
         # would swallow GeneratorExit and break generator.close().
         yield request
Exemple #5
0
 def start_requests(self):
     """Yield one ``Request`` per mission fetched with marks 'ghb' / 'gp'.

     For every mission ``m`` the request URL is
     ``'http://hq.sinajs.cn/list=' + m[8]`` and the callback is
     ``self.parse_sinajs``.  The request meta carries the spider name, the
     value of ``urltools.get_domain(m[8])`` and the mission.

     A mission whose request cannot be built is logged and skipped so that
     one bad entry does not abort the whole crawl.
     """
     import logging

     log = logging.getLogger(__name__)
     missions = session.getMission(sum_mark='ghb', child_mark='gp')
     for m in missions:
         try:
             # NOTE(review): 'domain' is derived from m[8], the same value
             # that is appended to the ``list=`` query, not from the full
             # request URL -- confirm this is what get_domain should receive.
             meta = {'spider': self.name,
                     'domain': urltools.get_domain(m[8]),
                     'mission': m}
             request = Request(url='http://hq.sinajs.cn/list=' + m[8],
                               meta=meta,
                               callback=self.parse_sinajs)
         except Exception:
             # Best-effort: skip this mission, but leave a trace of why.
             log.warning("start_requests: skipping mission %r", m,
                         exc_info=True)
             continue
         # Yield outside the try block: a bare ``except`` around ``yield``
         # would swallow GeneratorExit and break generator.close().
         yield request
Exemple #6
0
    def start_requests(self):
        """Yield one ``Request`` per entry of ``self.missions``.

        For every mission ``m`` the URL is ``m[0]`` and the callback is the
        method ``parse_<domain>`` looked up on this spider, where
        ``<domain>`` is whatever ``urltools.get_domain(m[0])`` returns.  The
        request meta carries the spider name, the domain and the mission.

        A mission whose request cannot be built is logged and skipped so
        that one bad entry does not abort the whole crawl.
        """
        import logging

        log = logging.getLogger(__name__)
        for m in self.missions:
            try:
                meta = {'spider': self.name,
                        'domain': urltools.get_domain(m[0]),
                        'mission': m}
                request = Request(url=m[0],
                                  meta=meta,
                                  callback=getattr(self, 'parse_%s' % meta['domain']))
            except Exception:
                # Best-effort: skip this mission, but leave a trace of why.
                log.warning("start_requests: skipping mission %r", m,
                            exc_info=True)
                continue
            # Yield outside the try block: a bare ``except`` around ``yield``
            # would swallow GeneratorExit and break generator.close().
            yield request
Exemple #7
0
 def start_requests(self):
     """Yield one ``Request`` per mission fetched with marks 'xw' / 'ss'.

     For every mission ``m`` the URL is ``m[0]`` and the callback is the
     method ``parse_<domain>`` looked up on this spider.  ``<domain>`` is
     whatever ``urltools.get_domain(m[0])`` returns, except that any value
     containing ``'chinawatch-clock'`` is replaced by ``'chinawatch_clock'``
     (a hyphen cannot appear in a method name).  The request meta carries
     the spider name, the (possibly rewritten) domain and the mission.

     A mission whose request cannot be built is logged and skipped so that
     one bad entry does not abort the whole crawl.
     """
     import logging

     log = logging.getLogger(__name__)
     missions = session.getMission(sum_mark='xw', child_mark='ss')
     for m in missions:
         try:
             domain = urltools.get_domain(m[0])
             if 'chinawatch-clock' in domain:
                 domain = "chinawatch_clock"
             meta = {'spider': self.name,
                     'domain': domain,
                     'mission': m}
             request = Request(url=m[0],
                               meta=meta,
                               callback=getattr(self, 'parse_%s' % domain))
         except Exception:
             # Best-effort: skip this mission, but leave a trace of why.
             log.warning("start_requests: skipping mission %r", m,
                         exc_info=True)
             continue
         # Yield outside the try block: a bare ``except`` around ``yield``
         # would swallow GeneratorExit and break generator.close().
         yield request
Exemple #8
0
def run():
    """Command-line entry point: fetch a URL and print what a template extracts.

    Options:
        -t / --tpl   name of the template passed to ``ParserManager.create``
        -u / --url   URL to fetch

    Both options are required; if either is missing a usage hint is written
    to stderr and the process exits with status 1.  Otherwise the URL is
    fetched with ``Crawl``, a parser is created for the URL's domain and the
    given template, and every key/value pair of the extracted item is
    printed as ``[key]: value``.
    """
    parser = OptionParser()
    parser.add_option("-t", "--tpl", dest="template_name",
                      help="specified a template")
    parser.add_option("-u", "--url", dest="request_url",
                      help="specified a request url as 'http://www.baidu.com/'")

    (options, args) = parser.parse_args()
    tpl = options.template_name
    url = options.request_url
    if not tpl or not url:
        sys.stderr.write("Type './testunit.py --help' for usage.\n")
        sys.exit(1)

    spider = Crawl(url)
    spider.fetch()

    pm = ParserManager(urltools.get_domain(url))
    p = pm.create(tpl, response=spider.response)
    item = p.extract()

    for kv in item.withdict().items():
        # Parenthesised single-argument form: identical output under the
        # Python 2 print statement, and also valid as a Python 3 call.
        print(u'[%s]: %s' % kv)