def start_requests(self):
    """Spider entry point: set up shared request state and issue the first
    request for the fashion channel front page.

    Side effects: sets ``self.nowData`` (today as 'YYYY-MM-DD'),
    ``self.getDataDate`` (years of interest) and ``self.headers`` (shared
    HTTP headers). Yields one scrapy.Request handled by ``parse_info``; the
    partially filled item travels along in the request ``meta``.
    """
    try:
        # Today's date as 'YYYY-MM-DD'; used as a fallback publish date.
        self.nowData = str(datetime.datetime.now())[0:10]
        # Only articles from these years are collected.
        self.getDataDate = ['2019', '2018', '2017']
        # FIX: the original dict split the User-Agent pair at the wrong
        # place ("User-Agent: ... rv": "68.0) ...") and had stray leading
        # spaces in every value; both are corrected here.
        self.headers = {
            "Host": "www.yoka.com",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
            "Accept-Encoding": "gzip, deflate",
            "Connection": "keep-alive",
        }
        item = YokaSpiderItem()
        item['site_name'] = '优卡网'
        item['domain'] = 'www.yoka.com'
        item['domain_url'] = 'http://www.yoka.com/'
        item['first_title'] = '优卡网-时尚'
        item['first_title_url'] = 'http://fashion.yoka.com/'
        yield scrapy.Request(method="GET",
                             url=item['first_title_url'],
                             headers=self.headers,
                             callback=self.parse_info,
                             meta={'item': item})
        print('时尚栏目板块数据抓取-----------------------------------')
    except Exception as e:
        print("start_requests:{}".format(e))
        logger.info("start_requests:{}".format(e))
def get_time():
    """Scratch/debug helper: deliberately raises ZeroDivisionError so the
    try/except/finally logging path can be observed.

    FIX: the error message previously said "get_release_time", which
    misidentified this function; it now correctly reports "get_time".
    """
    try:
        # Placeholder for release-time formatting experiments.
        print(111)
        a = 2
        b = a / 0  # intentional ZeroDivisionError to exercise the except path
    except Exception as e:
        logger.info("get_time:{}".format(e))
        print("get_time:{}".format(e))
    finally:
        print('222222222')
def start_requests(self):
    """Spider entry point for the beauty channel: set up shared request
    state and issue the first request for http://beauty.yoka.com/.

    Side effects: sets ``self.nowData`` (today as 'YYYY-MM-DD'),
    ``self.getDataDate`` (year-months of interest) and ``self.headers``
    (shared HTTP headers). Yields one scrapy.Request handled by ``parse``;
    the partially filled item travels along in the request ``meta``.
    """
    try:
        # Top-level channels of the site; only the beauty channel is
        # actually requested below, the rest are kept for reference.
        url_list = {
            'http://fashion.yoka.com/': '时尚',
            'http://beauty.yoka.com/': '美容',
            'http://luxury.yoka.com/': '奢华',
            'http://star.yoka.com/': '明星',
            'http://life.yoka.com/': '乐活',
            'http://www.yokamen.cn/': '男士',
            'http://www.yoka.com/video/': '视频',
            'http://www.yoka.com/z/': '独家',
            'http://bbs.yoka.com/': '社区',
            'http://brand.yoka.com/': '品牌'
        }
        # Today's date as 'YYYY-MM-DD'; used as a fallback publish date.
        self.nowData = str(datetime.datetime.now())[0:10]
        # Only articles from this year-month are collected.
        self.getDataDate = ['2019-07']
        # FIX: the original dict split the User-Agent pair at the wrong
        # place ("User-Agent: ... rv": "68.0) ...") and had stray leading
        # spaces in every value; both are corrected here.
        self.headers = {
            "Host": "www.yoka.com",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
            "Accept-Encoding": "gzip, deflate",
            "Connection": "keep-alive",
        }
        item = YokaSpiderItem()
        item['site_name'] = '优卡网'
        item['domain'] = 'www.yoka.com'
        item['domain_url'] = 'http://www.yoka.com/'
        item['first_title'] = '优卡网-首页-' + url_list['http://beauty.yoka.com/']
        item['first_title_url'] = 'http://beauty.yoka.com/'
        yield scrapy.Request(method="GET",
                             url=item['first_title_url'],
                             headers=self.headers,
                             callback=self.parse,
                             meta={'item': item})
        print('美容栏目url请求发送完成-----------------------------------')
    except Exception as e:
        print("YokaBeautySpider.start_requests:{}".format(e))
        logger.info("YokaBeautySpider.start_requests:{}".format(e))
def get_release_time(self, item):
    """Derive ``item['release_time']`` ('YYYY-MM-DD') from ``item['link_url']``.

    The URL is expected to carry the year at path segment index 5 and a
    'MMDD…' token at index 6 (e.g. http://host/a/b/2019/0715 -> '2019-07-15').
    On any parse failure (too few segments, missing key) the item falls
    back to today's date, ``self.nowData``. Returns None.
    """
    try:
        split_list = item['link_url'].split('/')
        # NOTE: str.split always returns a non-empty list, so the original
        # `if split_list:` guard (and its else branch) was unreachable dead
        # code, as was the `release_time if release_time` fallback — the
        # concatenation always contains '-'. An unparsable URL raises
        # IndexError and is handled by the except below instead.
        item['release_time'] = (split_list[5] + '-'
                                + split_list[6][:2] + '-'
                                + split_list[6][2:])
    except Exception as e:
        item['release_time'] = self.nowData
        print("get_release_time:{}".format(e))
        logger.info("get_release_time:{}".format(e))
def get_release_time(self, item):
    """Parse the publish date out of ``item['link_url']``.

    Stores 'YYYY-MM-DD' (built from the third- and second-to-last URL path
    segments) on ``item['release_time']`` and returns its 'YYYY-MM' prefix.
    On any failure the item falls back to today's date (``self.nowData``),
    the offending item is appended to error_beauty_detail_url.txt, and an
    empty list is returned.
    """
    try:
        segments = item['link_url'].split('/')
        year = segments[-3]
        monthday = segments[-2]
        stamp = '{}-{}-{}'.format(year, monthday[0:2], monthday[2:4])
        item['release_time'] = stamp
        return stamp[0:7]
    except Exception as e:
        print("item", item)
        item['release_time'] = self.nowData
        # Keep a record of URLs whose date could not be parsed.
        with open('error_beauty_detail_url.txt', 'a') as f:
            f.write((str(item)) + '\n')
        print("get_release_time:{}".format(e))
        logger.info("get_release_time:{}".format(e))
        return []