class iwencaiSpider(scrapy.Spider):
    """Spider for iwencai.com stock-pick pages (technical-indicator queries)."""

    name = "wencai"
    allowed_domains = ["iwencai.com"]
    start_urls = [
        "http://www.iwencai.com/stockpick",
    ]
    source_currency = "RMB"   # currency label attached to scraped items
    picType = "jpg"           # image format used when saving pictures
    debug = ""                # non-empty string enables debug behaviour (offline env)
    taskId = -1               # crawl-task id; -1 means "not assigned"
    commonLib = False         # replaced with a Common() helper instance in __init__
    env_type = "offline"      # overwritten from Common().get_env() in __init__

    def __init__(self, taskId=None, *args, **kwargs):
        """Initialize the shared helper library and environment-dependent flags.

        taskId: optional crawl-task id passed on the scrapy command line.
                Backward compatible: when omitted, the class default (-1)
                is kept, matching the original behaviour.
        """
        # BUGFIX: the base Spider initializer was never invoked, so scrapy's
        # own setup was skipped; call it first, mirroring the sibling
        # zaraSpider in this file.
        super(iwencaiSpider, self).__init__(*args, **kwargs)
        # CONSISTENCY: accept an optional taskId like zaraSpider does, since
        # this spider already logs self.taskId below.
        if taskId is not None:
            self.taskId = int(taskId)
        self.commonLib = Common()
        self.env_type = self.commonLib.get_env()
        if self.env_type == "online":
            # Debug output is only meaningful offline; force it off online.
            self.debug = ""
        self.commonLib.set_header("env_type", self.env_type)
        self.commonLib.set_header("debug", self.debug)
        self.commonLib.write_log("get task id is [%s]" % (self.taskId))

    def parse(self, response):
        """Entry callback: re-request the start URL routed to query_indicator.

        Yields a scrapy.Request whose meta['method'] tags the follow-up
        callback; on failure the exception is formatted and logged via the
        shared Common helper.
        """
        try:
            request = scrapy.Request(response.url, callback=self.query_indicator)
            request.meta['method'] = "query_indicator"
            yield request
        except Exception as e:  # was Python-2-only "except Exception, e"
            exc_type, exc_value, exc_traceback = sys.exc_info()
            msgStr = self.commonLib.write_exception(exc_type, exc_value, exc_traceback)
            self.commonLib.write_log(msgStr)
            print(msgStr)
class zaraSpider(scrapy.Spider): name = "zara" allowed_domains = ["zara.cn"] start_urls = [ #"http://www.zara.cn/cn/zh/%E5%84%BF%E7%AB%A5-c277007.html", "http://www.zara.cn/cn/zh/%E5%84%BF%E7%AB%A5-c359013.html", ] source_currency = "RMB" picType = "jpg" debug = "true" taskId = -1 commonLib = False env_type = "offline" def __init__(self, taskId=None, *args, **kwargs): super(zaraSpider, self).__init__(*args, **kwargs) #self.start_urls = ['http://www.example.com/categories/%s' % category] self.taskId = int(taskId) self.commonLib = Common() self.env_type = self.commonLib.get_env() if self.env_type == "online": self.debug = "" self.commonLib.set_header("env_type",self.env_type) self.commonLib.set_header("debug",self.debug) self.commonLib.write_log("get task id is [%s]" % (self.taskId)) def parse(self, response): # ## 打折 try: expectCnt = 1 actualCnt = 0 top_bar_list = ["男婴","女婴","男童","女童"] urlStatus = common.STATUS_DONE assert self.taskId > 0, "taskId [%s] should not be null" % (self.taskId) ## //li[@rootid]/ul/li/ul/li barList = response.xpath("//li[@class='current selected']/ul/li/a") expectCnt = len(barList) for category in barList: category_url = category.xpath("@href")[0].extract().strip() + "#" + common.LEVEL_HOME top_bar_name = category.xpath("text()")[0].extract().strip() top_bar = top_bar_name.split(" ")[0].strip() actualCnt = actualCnt + 1 # if top_bar not in top_bar_list: # self.commonLib.write_log("top_bar [%s] is not child category" % (top_bar)) # continue self.commonLib.write_log("top_bar_name is [%s], top_bar is [%s],parse zara url is [%s],actualCnt is [%s] " % (top_bar_name,top_bar,category_url,actualCnt)) product_info = {} product_info['top_bar'] = top_bar request = scrapy.Request(category_url, callback=self.parse_category_list) request.meta['product_info'] = copy.deepcopy(product_info) yield request assert actualCnt == expectCnt and expectCnt>0, "parse ActualCnt [%s] is not [equal] expectCnt [%s]" % (actualCnt,expectCnt) except Exception, e: urlStatus 
= common.STATUS_FAIL exc_type, exc_value, exc_traceback = sys.exc_info() msgStr = self.commonLib.write_exception(exc_type, exc_value, exc_traceback) self.commonLib.write_log(msgStr) yield common.addLog(msgStr,self.taskId,common.LOG_FATAL,response.url,self.name) finally: