import time  # util, fetch_data and default_headers are assumed to be defined elsewhere in this module


def main(**kwargs):
    """Entry point: fetch each configured URL, then sleep and repeat."""
    sd = kwargs.get('sd', '')
    ed = kwargs.get('ed', '')
    interval = kwargs.get('interval', 60)
    date_list = util.specified_date(sd, ed)
    data = [{'url': '1'}, {'url': '2'}]
    while True:
        proxy = util.get_prolist(10)
        for _data in data:
            url = _data.get('url', '')
            if not url:
                continue
            fetch_data(url=url, proxy=proxy, headers=default_headers, **kwargs)
        # Drive the crawl according to the URL pattern (placeholder, not yet implemented).
        for str_time in date_list:
            pass
        if not interval:
            break
        print('-------------- sleep %s sec -------------' % interval)
        time.sleep(interval)
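# The sketch below is an assumption, not part of the project: a minimal version of
# what util.specified_date() might look like, returning the inclusive list of
# 'YYYY-MM-DD' strings between a start and an end date. The real helper may differ.
from datetime import datetime, timedelta


def specified_date_sketch(sd, ed, fmt='%Y-%m-%d'):
    """E.g. specified_date_sketch('2020-01-01', '2020-01-03')
    -> ['2020-01-01', '2020-01-02', '2020-01-03']."""
    if not sd or not ed:
        return []
    start = datetime.strptime(sd, fmt)
    end = datetime.strptime(ed, fmt)
    days = (end - start).days
    return [(start + timedelta(days=i)).strftime(fmt) for i in range(days + 1)]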
def parse_resp(self, resp):
    """
    First-level parsing: collect the category URLs, then schedule the
    next-level requests for each category and historical date.
    :param resp:
    :return:
    """
    # GoodsItem, util, config and scrapy are imported at the top of the spider module.
    item = GoodsItem()
    category = []  # placeholder: category URLs should be extracted from resp
    date_list = util.specified_date(self.start_date, end_date=self.end_date)
    for category_url in category:
        if self.abbreviation and self.abbreviation not in category_url:
            # Skip categories that were not requested (used when a specific
            # lottery type is specified).
            continue
        # Crawling rules for this category.
        today_url = ''
        # Key used to look up where the results are saved.
        result_key = category_url.split('-')[1]
        demo_test = config.PKS_KEY_DICT.get(result_key, '')
        for history_date in date_list:
            date_time = ''.join(history_date.split('-'))
            url = today_url.replace('today', date_time)
            yield scrapy.Request(url=url, headers=self.headers,
                                 callback=self.parse_product,
                                 meta={'item': item})
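# A hedged illustration of the URL substitution used above (the example URL is
# made up): each history URL is built by stripping the dashes from a date in
# date_list and substituting the result for the literal 'today' in the
# category's "today" URL, so one request is scheduled per historical day.
def build_history_url_sketch(today_url, history_date):
    """E.g. build_history_url_sketch('https://example.com/pk10/today.html',
    '2020-01-02') -> 'https://example.com/pk10/20200102.html'."""
    date_time = ''.join(history_date.split('-'))
    return today_url.replace('today', date_time)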