Esempio n. 1
0
 def start_requests(self):
     """Yield the initial request for the fashion channel of yoka.com.

     Side effects: sets ``self.nowData`` (today's date as ``YYYY-MM-DD``),
     ``self.getDataDate`` (presumably the years to collect; confirm against
     the parsing callbacks) and ``self.headers`` (HTTP headers sent with the
     request).

     Yields:
         scrapy.Request: GET request for the fashion landing page, handled
         by ``self.parse_info`` and carrying a pre-filled ``YokaSpiderItem``
         in ``meta['item']``.

     Any exception is printed and logged rather than propagated.
     """
     try:
         self.nowData = str(datetime.datetime.now())[0:10]
         self.getDataDate = ['2019', '2018', '2017']
         self.headers = {
             # NOTE(review): Host is www.yoka.com while the request below
             # targets fashion.yoka.com; confirm this is intended.
             "Host": "www.yoka.com",
             # The header name and value used to be split at the wrong
             # colon ("User-Agent: Mozilla/... rv" as the key), so no real
             # User-Agent header was sent.
             "User-Agent": ("Mozilla/5.0 (Windows NT 6.1; Win64; x64; "
                            "rv:68.0) Gecko/20100101 Firefox/68.0"),
             "Accept":
             "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
             "Accept-Language":
             "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
             "Accept-Encoding": "gzip, deflate",
             "Connection": "keep-alive",
         }
         # Site / first-level column metadata carried along with the request.
         item = YokaSpiderItem()
         item['site_name'] = '优卡网'
         item['domain'] = 'www.yoka.com'
         item['domain_url'] = 'http://www.yoka.com/'
         item['first_title'] = '优卡网-时尚'
         item['first_title_url'] = 'http://fashion.yoka.com/'
         yield scrapy.Request(method="GET",
                              url=item['first_title_url'],
                              headers=self.headers,
                              callback=self.parse_info,
                              meta={'item': item})
         # Runs only when the generator is resumed after the yield above.
         print('时尚栏目板块数据抓取-----------------------------------')
     except Exception as e:
         print("start_requests:{}".format(e))
         logger.info("start_requests:{}".format(e))
Esempio n. 2
0
def get_time():
    """Demonstrate try/except/finally ordering with a deliberate error.

    Prints ``111``, triggers a ZeroDivisionError, reports it through the
    module-level ``logger`` and ``print`` (both prefixed with
    ``get_release_time:``), and finally always prints ``222222222``.
    Returns None.
    """
    try:
        print(111)
        numerator = 2
        numerator / 0  # deliberately raises ZeroDivisionError
    except Exception as err:
        message = "get_release_time:{}".format(err)
        logger.info(message)
        print(message)
    finally:
        # Executed whether or not the handler above ran.
        print('222222222')
Esempio n. 3
0
 def start_requests(self):
     """Yield the initial request for the beauty channel of yoka.com.

     Side effects: sets ``self.nowData`` (today's date as ``YYYY-MM-DD``),
     ``self.getDataDate`` (presumably the ``YYYY-MM`` months to collect;
     confirm against the parsing callbacks) and ``self.headers`` (HTTP
     headers sent with the request).

     Yields:
         scrapy.Request: GET request for the beauty landing page, handled by
         ``self.parse`` and carrying a pre-filled ``YokaSpiderItem`` in
         ``meta['item']``.

     Any exception is printed and logged rather than propagated.
     """
     try:
         # First-level channels of the site: landing-page URL -> channel
         # name. Only the beauty entry is used by this spider.
         url_list = {
             'http://fashion.yoka.com/': '时尚',
             'http://beauty.yoka.com/': '美容',
             'http://luxury.yoka.com/': '奢华',
             'http://star.yoka.com/': '明星',
             'http://life.yoka.com/': '乐活',
             'http://www.yokamen.cn/': '男士',
             'http://www.yoka.com/video/': '视频',
             'http://www.yoka.com/z/': '独家',
             'http://bbs.yoka.com/': '社区',
             'http://brand.yoka.com/': '品牌'
         }
         beauty_url = 'http://beauty.yoka.com/'
         self.nowData = str(datetime.datetime.now())[0:10]
         self.getDataDate = ['2019-07']
         self.headers = {
             # NOTE(review): Host is www.yoka.com while the request below
             # targets beauty.yoka.com; confirm this is intended.
             "Host": "www.yoka.com",
             # The header name and value used to be split at the wrong
             # colon ("User-Agent: Mozilla/... rv" as the key), so no real
             # User-Agent header was sent.
             "User-Agent": ("Mozilla/5.0 (Windows NT 6.1; Win64; x64; "
                            "rv:68.0) Gecko/20100101 Firefox/68.0"),
             "Accept":
             "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
             "Accept-Language":
             "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
             "Accept-Encoding": "gzip, deflate",
             "Connection": "keep-alive",
         }
         # Site / first-level column metadata carried along with the request.
         item = YokaSpiderItem()
         item['site_name'] = '优卡网'
         item['domain'] = 'www.yoka.com'
         item['domain_url'] = 'http://www.yoka.com/'
         item['first_title'] = '优卡网-首页-' + url_list[beauty_url]
         item['first_title_url'] = beauty_url
         yield scrapy.Request(method="GET",
                              url=item['first_title_url'],
                              headers=self.headers,
                              callback=self.parse,
                              meta={'item': item})
         # Runs only when the generator is resumed after the yield above.
         print('美容栏目url请求发送完成-----------------------------------')
     except Exception as e:
         print("YokaBeautySpider.start_requests:{}".format(e))
         logger.info("YokaBeautySpider.start_requests:{}".format(e))
Esempio n. 4
0
 def get_release_time(self, item):
     """Set ``item['release_time']`` from the date embedded in the link URL.

     ``item['link_url']`` is split on ``/``; element 5 is read as a
     four-digit year and element 6 as ``MMDD``, producing ``YYYY-MM-DD``.
     When those elements are not four ASCII digits each, or the URL cannot
     be read at all, ``self.nowData`` is stored instead.

     Args:
         item: mapping-like object with a ``link_url`` entry; it is mutated
             in place.

     Returns:
         None.
     """
     try:
         segments = item['link_url'].split('/')
         year, month_day = segments[5], segments[6]
         # str.split never returns an empty list and the joined string is
         # never empty, so the date shape has to be validated explicitly;
         # otherwise arbitrary path segments end up as the release time.
         is_date = (
             len(year) == 4 and len(month_day) == 4
             and all(ch in '0123456789' for ch in year + month_day)
         )
         if is_date:
             item['release_time'] = '-'.join(
                 (year, month_day[:2], month_day[2:]))
         else:
             item['release_time'] = self.nowData
     except Exception as e:
         # Best effort: a missing/short URL must not abort item processing.
         item['release_time'] = self.nowData
         print("get_release_time:{}".format(e))
         logger.info("get_release_time:{}".format(e))
Esempio n. 5
0
 def get_release_time(self, item):
     """Set ``item['release_time']`` from the link URL and return its month.

     ``item['link_url']`` is split on ``/``; the third-from-last element is
     read as a four-digit year and the first four characters of the
     second-from-last element as ``MMDD``, producing ``YYYY-MM-DD``.

     On any failure (missing URL, too few segments, segments that are not
     digits) ``item['release_time']`` is set to ``self.nowData``, the item
     is appended to ``error_beauty_detail_url.txt`` and the error is
     printed and logged.

     Args:
         item: mapping-like object with a ``link_url`` entry; it is mutated
             in place.

     Returns:
         The ``YYYY-MM`` prefix of the release time on success, otherwise
         an empty list.
     """
     try:
         segments = item['link_url'].split('/')
         year, month_day = segments[-3], segments[-2][0:4]
         # str.split never returns an empty list, so validate the date
         # shape explicitly instead of storing arbitrary path segments.
         if not (len(year) == 4 and len(month_day) == 4
                 and all(ch in '0123456789' for ch in year + month_day)):
             raise ValueError(
                 'no date found in link_url: {}'.format(item['link_url']))
         release_time = '-'.join((year, month_day[0:2], month_day[2:4]))
         item['release_time'] = release_time
         return release_time[0:7]
     except Exception as e:
         print("item", item)
         item['release_time'] = self.nowData
         # Explicit encoding: items hold non-ASCII text, and the locale
         # default may be unable to encode it.
         with open('error_beauty_detail_url.txt', 'a',
                   encoding='utf-8') as f:
             f.write(str(item) + '\n')
         print("get_release_time:{}".format(e))
         logger.info("get_release_time:{}".format(e))
     return []