def get_room_info(self, checkin):
    """Build per-room occupancy info, supporting both the new ``age_info``
    and the legacy ``room_info`` task formats.

    :param checkin: check-in date string, forwarded to ``calculate_age``
    :return: list of dicts ``{'adult_info': [...], 'child_info': [...]}``
    :raises parser_except.ParserException: code 12 when rooms differ in
        adult/child composition
    """
    room_info = []
    if self.task.ticket_info.get('age_info'):
        for room in self.task.ticket_info.get('age_info', []):
            adult_info = []
            child_info = []
            for people in room:
                age = self.calculate_age(checkin, people)
                # Under 18 counts as a child, otherwise as an adult.
                if age < 18:
                    child_info.append(age)
                else:
                    adult_info.append(age)
            room_info.append(dict(adult_info=adult_info, child_info=child_info))
    else:
        room_info = self.task.ticket_info.get('room_info', [])
    # Every room must have the same number of adults and the same number
    # of children; compare each room with its predecessor and fail fast.
    for prev, room in zip(room_info, room_info[1:]):
        if len(room['adult_info']) != len(prev['adult_info']):
            raise parser_except.ParserException(12, "房间类型不同")
        if len(room['child_info']) != len(prev['child_info']):
            raise parser_except.ParserException(12, "房间类型不同")
    return room_info
def crawl_data(self, request_template, browser, source_name):
    """Perform one crawl attempt and run the user-level retry hook.

    :param request_template: request dict
    :param browser: crawling browser
    :param source_name: source name, used for proxy selection
    :return: the response object on success
    :raises parser_except.ParserException: any error recorded during the
        attempt is re-raised from the ``finally`` block
    """
    try:
        logger.debug(current_log_tag() + 'crawl %s, retry_count: %s',
                     self.__request_func.__name__, self.req_count)
        # Attach a proxy before issuing the request.
        self.browser_set_proxy(browser, source_name)
        resp, self.content_length = self.__crawl_data_str(
            request_template, browser)
        # TODO: rework what user_retry returns
        if self.user_retry:
            try:
                user_check = self.spider.user_retry_err_or_resp(
                    resp, self.req_count, request_template, False)
            except Exception as e:
                # Mark that the failure came from user code so callers
                # re-raise it untouched instead of retrying.
                self.user_exc = True
                raise e
            # The user hook accepted the response.
            if user_check:
                return resp
            else:
                raise parser_except.ParserException(
                    parser_except.PROXY_INVALID, '代理异常')
        else:
            return resp
    except parser_except.ParserException as e:
        # Fix: the original listed PROXY_FORBIDDEN twice in this tuple;
        # the duplicate is removed (membership semantics unchanged).
        self.is_forbidden = e.code in (parser_except.PROXY_FORBIDDEN,
                                       parser_except.REQ_ERROR)
        self.req_exception = e
    except Exception as e:
        self.req_exception = parser_except.ParserException(
            parser_except.REQ_ERROR, 'req exception:{0}'.format(e))
        # If the error originated from user code, keep the user's
        # ParserException instead of the generic wrapper.
        if self.user_exc:
            if isinstance(e, parser_except.ParserException):
                self.req_exception = e
    finally:
        # (The original also computed an unused local ``code`` here.)
        if self.req_exception:
            raise self.req_exception
def response_error(self, req, resp, error):
    """Translate an HTTP-level failure into a ParserException.

    :param req: request template dict
    :param resp: response object the failure relates to
    :param error: original exception (kept for interface compatibility)
    :raises parser_except.ParserException: 29 for HTTP 400 carrying the
        server-provided <Message>, 89 for every other status code
    """
    if resp.status_code == 400:
        import re
        try:
            message = re.search(r'<Message>(.*)</Message>',
                                resp.text).group(1)
        except Exception:
            message = ""
        raise parser_except.ParserException(29, message)
    # Bug fix: the original formatted req['resp'].status_code, but 'resp'
    # is only written into the template after a successful crawl, so this
    # path raised KeyError instead of the intended code-89 error.
    raise parser_except.ParserException(
        89, "服务出错啦啊啊~~!http code: {}".format(resp.status_code))
def task_parser(self):
    """Parse the incoming task content into a request_info dict.

    Task content layout: ``city_id, hotel_id, days, ..., checkin(YYYYMMDD)``.

    :return: dict with hotel/date/occupancy/auth fields for the request
    :raises parser_except.ParserException: 12 for bad room data, 121 for
        bad auth info, ``ErrNumber.E__TASK`` for any other task error
    """
    task = self.task
    try:
        contentlist = self.split_content(task.content)
        mj_city_id, hotel_id = contentlist[:2]
        days = int(contentlist[2])
        checkin = datetime.datetime.strptime(contentlist[-1], "%Y%m%d")
        checkout = checkin + datetime.timedelta(days=days)
        checkin_str = checkin.strftime("%Y-%m-%d")
        checkout_str = checkout.strftime("%Y-%m-%d")
        self.check_in = checkin_str
        ticket_info = task.ticket_info
        env_name = ticket_info.get("env_name")
        try:
            room_info = self.get_room_info(checkin_str)
            child_age = [
                age for age in room_info[0]['child_info'] if age < 18
            ]
            # Occupants aged 18+ listed under child_info still count as adults.
            adult = len(room_info[0]['adult_info']) + len(
                [age for age in room_info[0]['child_info'] if age >= 18])
            child = len(child_age)
            self.user_datas['adult_num'] = adult
            self.user_datas['child_num'] = child
            room_count = len(room_info)
        except Exception:
            raise parser_except.ParserException(12, '任务错误')
    except parser_except.ParserException:
        # Bug fix: previously the generic handler below re-wrapped these,
        # losing the specific error code (e.g. 12). Let them pass through.
        raise
    except Exception as e:
        raise parser_except.ParserException(ErrNumber.E__TASK, str(e))
    redis_key = getattr(task, 'redis_key', 'Null')
    try:
        auth = json.loads(task.ticket_info["auth"])
    except Exception:
        raise parser_except.ParserException(121, msg='API认证信息错误')
    request_info = dict(HotelID=hotel_id,
                        CheckInDate=checkin_str,
                        CheckOutDate=checkout_str,
                        city=mj_city_id,
                        RoomCount=room_count,
                        Adult=adult,
                        Children=child,
                        ChildrenAge=child_age,
                        Nationality='CN',
                        env_name=env_name,
                        redis_key=redis_key,
                        auth=auth)
    return request_info
def w_get_proxy(debug, source, task, verify_info):
    """Resolve a proxy through the slave hook; raise when none is available."""
    hook_missing = debug and not slave_get_proxy
    if hook_missing:
        # Debug runs without a registered hook simply go proxy-less.
        print('debug,and not define get_proxy,so can’t get proxy ')
        return None
    proxy = slave_get_proxy(source=source, task=task, verify_info=verify_info)
    if proxy:
        return proxy
    raise parser_except.ParserException(parser_except.PROXY_NONE,
                                        f'get {source} proxy None')
def __crawl_by_chain(self, chains):
    """Run each request parser in ``chains``, dispatching on the type of
    request object it produces (dict, list, or generator).

    :param chains: iterable of reqParse objects
    :return: accumulated status code from list results (0 means OK)
    """
    code = 0
    browser = None  # defined before the loop so except-handlers can see it
    try:
        for reqParse in chains:
            # gevent.sleep(0)
            browser = self.__create_browser(reqParse.new_session)
            reqParse.spider = self
            # Bug fix: reset per parser so an empty request list can no
            # longer leak the previous parser's result (or hit an unbound
            # local on the first iteration) in __spider_append_result.
            new_result = None
            t_req = reqParse.request()
            if isinstance(t_req, dict):
                # Single request.
                new_result = self.__single_crawl(reqParse, browser, t_req, 0)
            elif isinstance(t_req, list):
                # A spider may legitimately return an empty list!
                if t_req:
                    if reqParse.asynchronous:
                        # Parallel crawl.
                        list_result = self.__async_crawl_list(
                            reqParse, browser, t_req)
                    else:
                        # Serial crawl.
                        list_result = self.__crawl_list(
                            reqParse, browser, t_req)
                    new_result, code = self.check_list_result(
                        list_result, code)
            elif isinstance(t_req, types.GeneratorType):
                # Generator-producing request methods (yield style).
                list_result = self.__crawl_list(reqParse, browser, t_req)
                new_result, code = self.check_list_result(list_result, code)
            if new_result is not None:
                self.__spider_append_result(new_result)
            if self.use_selenium and browser.br:
                browser.close()
    except parser_except.ParserException as e:
        if self.use_selenium and browser and browser.br:
            browser.close()
        logger.error(e)
        raise e
    except Exception:
        if self.use_selenium and browser and browser.br:
            browser.close()
        logger.exception(current_log_tag() +
                         '[新框架 持续请求 未知问题][ {0} ]'.format(
                             traceback.format_exc().replace('\n', '\t')))
        raise parser_except.ParserException(
            parser_except.UNKNOWN_ERROR,
            'e:{0}'.format(traceback.format_exc()))
    return code
def convert(self, request_template, data):
    """Convert a raw response body according to the template's content_type.

    ``content_type`` may be 'html', 'json', 'string' (default: return the
    data unchanged) or a bound method acting as a custom converter.

    :param request_template: request dict; ``data.content_type`` selects
        the conversion
    :param data: raw response text/bytes
    :return: converted payload
    :raises parser_except.ParserException: -1 when a custom converter fails
    """
    data_con = request_template.get('data', {})
    c_type = data_con.get('content_type', 'string')
    logger.debug(current_log_tag() + 'Converter got content_type: %s',
                 c_type)
    # Bug fix: the original compared strings with ``is``, which only works
    # by accident of CPython string interning; use equality instead.
    if c_type == 'html':
        return HTML.fromstring(data)
    elif c_type == 'json':
        return json.loads(data)
    elif isinstance(c_type, types.MethodType):
        try:
            return c_type(request_template, data)
        except Exception:
            raise parser_except.ParserException(
                -1, 'convert func muset error{0} ,func:{1}'.format(
                    traceback.format_exc(), c_type))
    else:
        return data
def accor_parser(content, url, other_info):
    """Parse an AccorHotels detail page into a HotelBase object.

    :param content: raw page bytes (UTF-8)
    :param url: page URL; the hotel code is derived from it
    :param other_info: dict providing 'source_id' and 'city_id'
    :return: populated HotelBase instance
    :raises parser_except.ParserException: 29 when the site shows its
        maintenance page
    """
    hotel = HotelBase()
    data = content.decode('utf-8')
    if '<title>Book a hotel online with Accor Hotels</title>' in data:
        raise parser_except.ParserException(29, '网站暂时维护中')
    hotel_code = re.findall(
        'https://www.accorhotels.com/zh/hotel-(.*?)-.*?/index.shtml',
        url)[0].lower()
    hotel_url = url
    source = 'accorHotel'
    source_city_id = 'NULL'
    brand_name = "NULL"
    _star = re.findall(
        r'<div class="main-rating stars stars--(\d+)"\s*data-halfstars=',
        data)
    star = _star[0] if _star else -1
    postal_code = get_blank(
        re.findall('<meta content="(.*?)" property="og:postal-code">', data))
    hotel_name = re.findall('<meta name="twitter:title" content="(.*?)">',
                            data)[0]
    hotel_name_en = "NULL"
    # geo.position is "lat;lng"; store it reversed as "lng,lat".
    map_info = ",".join(
        re.findall('<meta content="(.*?)" name="geo.position"/>',
                   data)[0].split(';')[::-1])
    street = re.findall('<span itemprop="streetAddress">(.*?)</span><br>',
                        data)[0]
    location = re.findall(
        '<span itemprop="addressLocality">(.*?)</span><br>', data)[0]
    _country = re.findall('<span itemprop="addressCountry">(.*?)</span>',
                          data)[0]
    address = _country + location + street
    country = re.findall(
        '<meta content="(.*?)" property="og:country-name">', data)[0]
    city = get_blank(
        re.findall('<meta content="(.*?)" property="og:city">', data))
    _grade = re.findall(
        r'<span class="rating"><span itemprop="ratingValue">\s*(.*?)</span>/<span itemprop="bestRating">5</span>\s*</span>',
        data)
    grade = _grade[0] if _grade else -1.0
    review = get_blank(
        re.findall('<span class="rating-baseline">(.*?)</span>', data))
    review_num = "".join(re.findall(r'\d+', review)) or -1
    has_wifi = 'Yes' if re.findall(r'<i\s*class="icon icon_wifi"></i>',
                                   data) else 'No'
    if has_wifi == 'Yes':
        is_wifi_free = 'Yes' if re.findall(
            r'<li\s*class="service-item "\s*data-servicename="wifi">',
            data) else 'No'
    else:
        is_wifi_free = 'NULL'
    has_parking = 'Yes' if re.findall(
        r'<i\s*class="icon icon_parking"></i>', data) else 'No'
    # Bug fix: this gate previously tested has_wifi instead of has_parking,
    # so hotels with parking but no wifi reported is_parking_free='NULL'.
    if has_parking == 'Yes':
        is_parking_free = 'No' if re.findall(
            r'<li\s*class="service-item\s*payable"\s*data-servicename="parking">',
            data) else 'Yes'
    else:
        is_parking_free = 'NULL'
    img_items = "|".join(
        re.findall(
            'www.ahstatic.com/photos/' + hotel_code +
            r'_\w+_\d+_p_2048x1536.jpg', data))
    source_id = other_info['source_id']
    city_id = other_info['city_id']
    first_img = img_items.split('|')[0] if img_items else None
    others_info = {
        "city": city,
        "country": country,
        "first_img": first_img,
        "source_city_id": source_city_id
    }
    service = pq(data)('div.expandable-content').find('li').text().replace(
        "\t", "").replace("\n", "").replace(" ", "|")
    description = HTML(data).xpath("//p[@itemprop='description']/text()")[0]
    accepted_cards = "NULL"
    check_in_time = get_blank(
        re.findall('<i class="icon icon_times"></i>(.*?)</div>', data))
    check_out_time = get_blank(
        re.findall('<div class="col col-checkout">(.*?)</div>', data))
    hotel.hotel_name = hotel_name
    hotel.hotel_name_en = hotel_name_en
    hotel.source = source
    hotel.source_id = source_id
    hotel.source_city_id = source_city_id
    hotel.brand_name = brand_name
    hotel.map_info = map_info
    hotel.address = address
    hotel.city = city
    hotel.country = country
    hotel.city_id = city_id
    hotel.postal_code = postal_code
    hotel.star = star
    hotel.grade = grade
    hotel.review_num = review_num
    hotel.has_wifi = has_wifi
    hotel.is_wifi_free = is_wifi_free
    hotel.has_parking = has_parking
    hotel.is_parking_free = is_parking_free
    hotel.service = service
    hotel.img_items = img_items
    hotel.description = description
    hotel.accepted_cards = accepted_cards
    hotel.check_in_time = check_in_time
    hotel.check_out_time = check_out_time
    hotel.hotel_url = hotel_url
    hotel.others_info = json.dumps(others_info)
    return hotel
def get_proxy(
        source=None,
        allow_ports=None,
        forbid_ports=None,
        allow_regions=None,
        forbid_regions=None,
        user='******',
        passwd='realtime',
        proxy_info=None,
        verify_info="verify",
        ip_num=1,
        ip_type="internal",
        task=None,
):
    """Fetch a proxy IP from the proxy service, retrying up to 3 times.

    Bug fix: every container default (and the shared ``Task()`` instance,
    created once at import time) was mutable; they are now built per call.

    :return: ``[proxy_ip, [raw_response, elapsed_seconds, request_path]]``
    :raises parser_except.ParserException: code 21 when no usable proxy
        can be obtained
    """
    allow_ports = [] if allow_ports is None else allow_ports
    forbid_ports = [] if forbid_ports is None else forbid_ports
    allow_regions = [] if allow_regions is None else allow_regions
    forbid_regions = [] if forbid_regions is None else forbid_regions
    proxy_info = {} if proxy_info is None else proxy_info
    if task is None:
        task = Task()
    qid = str(task.ticket_info.get('qid', int(time.time() * 1000)))
    msg = {
        "req": [{
            "source": source,
            "type": verify_info,
            "num": ip_num,
            "ip_type": ip_type,
        }]
    }
    msg = json.dumps(msg)
    ptid = task.ticket_info.get('ptid', "")
    time_st = time.time()
    get_info = '/?type=px001&qid={0}&query={1}&ptid={2}&tid=tid&ccy=AUD'.format(
        qid, msg, ptid)
    logger.info("get proxy info :http://{1}{0}".format(
        get_info, g_config.proxy_host))
    count = 1
    while 1:
        try:
            p = requests.get("http://{0}".format(g_config.proxy_host) +
                             get_info,
                             timeout=(6, 6),
                             stream=False)
            p_time = p.elapsed.total_seconds()
            p = p.content
            logger.info("代理返回内容为{0}".format(p))
            proxy_ip = json.loads(p)['resp'][0]['ips'][0]['inner_ip']
            break
        except Exception:
            exstr = traceback.format_exc()
            msg = '取代理请求时报错,错误信息为:' + exstr
            # NOTE(review): ``ip`` is not defined in this function; it is
            # presumably a module-level global — confirm before relying on
            # these warn() calls.
            info = warn(qid, 'ex_GetProxyFail', ip, msg)
            logger.debug("\n" + info)
            if count == 3:
                raise parser_except.ParserException(21, "取代理时失败")
            time.sleep(3)
            logger.debug("取代理失败,进行第{}次重试,".format(count))
            count += 1
    time_end = time.time() - time_st
    # The proxy service occasionally returns a bare ":"; reject anything
    # too short to be a plausible "a.b.c.d:port".
    if len(proxy_ip) < 9:
        msg = "获取到的代理不可用!"
        info = warn(qid, 'ex_GetProxyFail', ip, msg)
        logger.debug("\n" + info)
        raise parser_except.ParserException(21, "获取到的代理有误:{}".format(p))
    if not proxy_ip:
        msg = '未获取到代理,请求信息为:' + get_info
        info = warn(qid, 'ex_GetProxyFail', ip, msg)
        logger.debug("\n" + info)
        raise parser_except.ParserException(21, "未获取到代理")
    if p_time > 1.5:
        # Slow success — warn but still return the proxy.
        msg = '获取代理成功耗时, 耗时:{0}, requests 记录超时时间:{1}'.format(time_end, p_time)
        info = warn(qid, 'ex_GetProxyFail', ip, msg)
        logger.debug("\n" + info)
    p = [proxy_ip, [p, time_end, get_info]]
    return p
def __crawl_data_str(self, request_template, browser):
    """Issue one HTTP request via ``browser`` and validate the response body.

    :param request_template: request dict; ``req`` holds the kwargs for
        ``browser.req``
    :param browser: crawling browser wrapper
    :return: tuple of (response, content_length)
    :raises parser_except.ParserException: every transport/HTTP failure is
        mapped to a proxy-related error code after notifying the spider via
        ``response_error`` (note: ``resp`` may still be None there)
    """
    resp = None
    try:
        # The hook mutates request_template in place (user customization).
        self.spider.prepare_request(request_template)
        # Extract the actual request kwargs.
        req = request_template['req']
        # Used for QPS throttling.
        if hasattr(self.spider, 'queue_info'):
            browser.queue_info = self.spider.queue_info
        if hasattr(self.spider.task, 'req_qid'):
            browser.qid = self.spider.task.req_qid
        else:
            browser.qid = ""
        # Propagate task identity onto the browser for logging/tracking.
        browser.task_id = self.spider.task.task_id
        browser.source = self.spider.task.source
        browser.tid = self.spider.task.tid
        browser.ori_type = self.spider.task.ori_type
        resp = browser.req(**req)
        # Raise on HTTP error status (handled by HTTPError below).
        resp.raise_for_status()
        content_length = len(resp.content)
        # When need_content_length is an int, bodies at or below that size
        # are treated as a broken proxy response.
        if isinstance(self.need_content_length, int):
            logger.debug(current_log_tag() +
                         '[爬虫 content_length={1} 检测][页面长度需要大于 {0}]'.format(
                             self.need_content_length, content_length))
            if content_length <= self.need_content_length:
                raise parser_except.ParserException(
                    parser_except.PROXY_INVALID, msg='data is empty')
        elif self.need_content_length is None:
            logger.debug(current_log_tag() + '[爬虫无需 content_length 检测]')
        else:
            logger.debug(current_log_tag() +
                         '[未知 content_length 检测类型][type: {0}]'.format(
                             str(type(self.need_content_length))))
        return resp, content_length
    # Ordering matters below: specific requests exceptions are handled
    # before their base classes (Timeout, RequestException, Exception).
    except requests.exceptions.SSLError as e:
        self.spider.response_error(request_template, resp, e)
        raise parser_except.ParserException(parser_except.PROXY_SSL,
                                            msg=str(e),
                                            error=e)
    except requests.exceptions.ProxyError as e:
        # Proxy no longer usable.
        self.spider.response_error(request_template, resp, e)
        raise parser_except.ParserException(parser_except.PROXY_INVALID,
                                            msg='Proxy Error',
                                            error=e)
    except requests.exceptions.ConnectTimeout as e:
        self.spider.response_error(request_template, resp, e)
        raise parser_except.ParserException(parser_except.PROXY_FORBIDDEN,
                                            msg='Request connect Timeout',
                                            error=e)
    except requests.exceptions.ReadTimeout as e:
        self.spider.response_error(request_template, resp, e)
        raise parser_except.ParserException(parser_except.PROXY_FORBIDDEN,
                                            msg='Request read Timeout',
                                            error=e)
    except requests.exceptions.Timeout as e:
        # Generic timeout (base of the two above).
        self.spider.response_error(request_template, resp, e)
        raise parser_except.ParserException(parser_except.PROXY_FORBIDDEN,
                                            msg='Request Timeout',
                                            error=e)
    except requests.exceptions.ConnectionError as err:
        self.spider.response_error(request_template, resp, err)
        raise parser_except.ParserException(parser_except.PROXY_INVALID,
                                            msg=str(err))
    except requests.exceptions.HTTPError as err:
        # 4xx/5xx status codes from raise_for_status land here.
        self.spider.response_error(request_template, resp, err)
        raise parser_except.ParserException(parser_except.PROXY_INVALID,
                                            msg=str(err),
                                            error=err)
    except requests.exceptions.RequestException as err:
        # Catch-all for any other requests-level error.
        self.spider.response_error(request_template, resp, err)
        raise parser_except.ParserException(parser_except.PROXY_INVALID,
                                            msg=str(err),
                                            error=err)
    except Exception as e:
        # Final fallback for non-requests errors.
        self.spider.response_error(request_template, resp, e)
        raise parser_except.ParserException(parser_except.PROXY_INVALID,
                                            msg=traceback.format_exc())
def parse_detail(self, req, resp):
    """Parse a Shangri-La hotel detail page and fill ``self.item``.

    The branch is selected from the URL: the 'about' pages carry either a
    service list or a map section; 'reviews' pages expose the review iframe.

    :param req: request template; ``req['req']['url']`` selects the branch
    :param resp: raw HTML text of the page
    :raises parser_except.ParserException: 22 when the page looks like a
        proxy failure
    """
    tree = etree.HTML(resp)
    req_url = req['req']['url']
    self.item['source'] = 'shangrila'
    self.item['brand_name'] = '香格里拉'
    if 'about' in req_url:
        if 'service' in req_url:
            hotel2 = Hotel_New()
            try:
                service_all = tree.xpath(
                    "//div[@class='control2_1column']/ul/li/text()")
                # Canonical facility key -> list of Chinese alias strings.
                facilities_dict = {
                    'Swimming_Pool': ['游泳池'],
                    'gym': ['健身房'],
                    'SPA': ['SPA'],
                    'Bar': ['酒吧'],
                    'Coffee_house': ['咖啡厅'],
                    'Tennis_court': ['网球场'],
                    'Golf_Course': ['高尔夫球场'],
                    'Sauna': ['桑拿'],
                    'Mandara_Spa': ['水疗中心'],
                    'Recreation': ['儿童娱乐场', '儿童游乐场'],
                    'Business_Centre': ['商务中心'],
                    'Lounge': ['行政酒廊'],
                    'Wedding_hall': ['婚礼礼堂'],
                    'Restaurant': ['餐厅'],
                    'Airport_bus': ['机场班车', '班车服务', '班车服务(收费)'],
                    'Valet_Parking': ['代客泊车'],
                    'Call_service': ['叫车服务'],
                    'Rental_service': ['租车服务'],
                    'Room_wifi': ['客房无线网络'],
                    'Room_wired': ['客房有线网络'],
                    'Public_wifi': ['公共区域无线上网'],
                    'Public_wired': ['公共区域有线网络']
                }
                service_dict = {
                    'Luggage_Deposit': '行李寄存',
                    'front_desk': '24小时前台',
                    'Lobby_Manager': '24小时大堂经理',
                    '24Check_in': '24小时办理入住',
                    'Security': '24小时安保',
                    'Protocol': '礼宾服务',
                    'wake': '叫醒服务',
                    'Chinese_front': '中文前台',
                    'Postal_Service': '邮政服务',
                    'Fax_copy': '传真/复印',
                    'Laundry': '洗衣',
                    'polish_shoes': '擦鞋服务',
                    'Frontdesk_safe': '保险',
                    'fast_checkin': '快捷入住及退房服务',
                    'ATM': '自动柜员机(ATM)/银行服务',
                    'child_care': '儿',
                    'Food_delivery': '送餐服务'
                }
                reverse_sevice_dict = {
                    v: k
                    for k, v in service_dict.items()
                }
                for service in service_all:
                    for keys, fac_aliases in facilities_dict.items():
                        # Bug fix: the dict values are LISTS of aliases; the
                        # original tested ``list in str``, which raises
                        # TypeError and made the except below always null
                        # out service/facility. Match any alias instead.
                        if any(alias in service for alias in fac_aliases):
                            service = self.clean_data(service)
                            if keys in hotel2.facility:
                                hotel2.facility[keys] = hotel2.facility[
                                    keys] + ',' + service
                            else:
                                hotel2.facility[keys] = service
                    for sev_value in service_dict.values():
                        if sev_value in service:
                            service = self.clean_data(service)
                            hotel2.service[
                                reverse_sevice_dict[sev_value]] = service
                self.item['service'] = hotel2.service
                self.item['facility'] = hotel2.facility
            except Exception:
                # Best effort: any parsing problem leaves placeholders.
                self.item['service'] = "NULL"
                self.item['facility'] = "NULL"
        elif 'map' in req_url:
            try:
                latitude = re.compile(r'"Lat":"(.*?)"',
                                      re.S).findall(resp)[0]
                longitude = re.compile(r'"Lng":"(.*?)"',
                                       re.S).findall(resp)[0]
            except Exception:
                raise parser_except.ParserException(22, '代理失效')
            self.item['latitude'] = latitude
            self.item['longitude'] = longitude
            map_list = tree.xpath(
                "//div[@class='control2_1column']/div[@class='map-list']/div/h4/text()"
            )
            self.item['traffic'] = "NULL"
            traffic_str_all = ""
            # Each h4 heading is paired with the nth map-list paragraph.
            for index, tra_str in enumerate(map_list, 1):
                traffic_str_l = tree.xpath(
                    "//div[@class='control2_1column']/div[@class='map-list'][{}]/div/p/text()"
                    .format(index))
                traffic_str = " ".join(traffic_str_l).strip().replace(
                    " ", "")
                traffic_str_all += tra_str + ":" + traffic_str
                self.item['traffic'] = traffic_str_all
            return
        elif 'reviews' in req_url:
            self.flag = True
            try:
                link = tree.xpath(
                    '//iframe[contains(@id, "ChildFrame")]/@src')[0]
            except Exception:
                raise parser_except.ParserException(22, 'proxy error')
            self.review_url = link
            # Scheme-relative iframe src needs an explicit scheme.
            if 'http' not in link:
                self.review_url = 'http:' + link
            self.review_url = self.review_url.strip()
def parse_index(self, req, resp):
    """Parse Shangri-La index/navigation responses and the main hotel page.

    Navigation JSON responses feed ``self.info_list``; the HTML page fills
    name/address/star/check-in-out/card fields of ``self.item``.

    :param req: request template; the URL selects the branch
    :param resp: parsed JSON dict (navigation) or raw HTML text (page)
    :raises parser_except.ParserException: 22 on missing navigation keys
        or a page that looks like a proxy failure
    """
    req_url = req['req']['url']
    if 'NavigationMainMenuJson' in req_url:
        node_list = resp['MainMenu']
        self.info_list.extend([
            'http://www.shangri-la.com{}'.format(node['Url'])
            for node in node_list
            if 'about' in node['Url'] or 'reviews' in node['Url']
        ])
    elif 'NavigationJson' in req_url:
        try:
            node_list = resp['NaviMenu']
            self.info_list.extend([
                'http://www.shangri-la.com{}'.format(node['Url'])
                for node in node_list
                if 'map' in node['Url'] or 'service' in node['Url']
            ])
        except KeyError:
            raise parser_except.ParserException(
                22, '请求失效,失效url为:{}'.format(req_url))
    else:
        tree = etree.HTML(resp)
        description = re.compile(r'<p>(.*?)</p>').findall(resp)
        description_info = ''
        # Skip the credit-card boilerplate paragraphs.
        for des in description:
            if u'本酒店可接受以下信用卡付款' in des or u'退房时用信用卡结账需' in des:
                pass
            else:
                description_info += des
        self.item['description'] = description_info
        try:
            hotel_phone = tree.xpath(
                "//span[@id='ctl00_ContentPlaceHolder1_ltrPhone']/text()"
            )[0]
            self.item['hotel_phone'] = hotel_phone
        except Exception:
            self.item['hotel_phone'] = 'NULL'
        self.item['source_id'] = self.hotel_code
        self.img_url = 'http://www.shangri-la.com/HotelPhotoVideoJson.json?hotel_code={}&lang=cn'.format(
            self.hotel_code)
        try:
            hotel_name_info = tree.xpath(
                '//meta[@property="og:title"]/@content')[0]
        except Exception:
            raise parser_except.ParserException(22, 'proxy error')
        # Star rating is embedded in the page title text.
        if '五星级' in hotel_name_info:
            self.item['star'] = 5
        elif '四星级' in hotel_name_info:
            self.item['star'] = 4
        elif '三星级' in hotel_name_info:
            self.item['star'] = 3
        else:
            self.item['star'] = ''
        self.item['hotel_name_en'] = self.hotel_name
        try:
            post_code = tree.xpath(
                '//div[@class="widget-mid"]//span[@id="ctl00_ContentPlaceHolder1_ltrAddress"]/text()'
            )
            if not post_code:
                post_code = tree.xpath(
                    '//span[@id="ctl00_ContentPlaceHolder1_ltrAddress"]/text()'
                )
            if len(post_code) <= 1:
                # Single line: postal code is the 2nd-to-last CSV field.
                p_codes = post_code[0].split(',')
                try:
                    p_code = p_codes[-2]
                    n = re.compile(r'(\d+)').findall(p_code)
                    if not n:
                        p_code = ''
                except Exception:
                    p_code = ''
                address = post_code[0]
            else:
                address = post_code[0]
                p_code = re.compile(r'\d+').findall(post_code[1])[0]
        except Exception:
            post_code = ''
            p_code = ''
            address = ''
        self.item['city'] = self.city
        self.item['postal_code'] = p_code
        self.item['address'] = address
        # Bug fix: removed a leftover Python-2 ``print req['req']['url']``
        # statement that is a syntax error under Python 3.
        try:
            self.img_first = 'http://www.shangri-la.com{}'.format(
                tree.xpath('//div[@id="background"]/img/@src')[0])
        except Exception:
            self.img_first = ''
        try:
            check_time = tree.xpath(
                '//span[@id="ctl00_ContentPlaceHolder1_ltrChkInOut"]/text()'
            )
            if not check_time:
                check_time = tree.xpath(
                    '//span[@id="ctl00_ContentPlaceHolder1_ltrChkInOut"]/p/text()'
                )
            # A non-breaking space means the times sit in nested spans.
            if check_time[1] == u'\xa0':
                check_time1 = tree.xpath(
                    '//span[@id="ctl00_ContentPlaceHolder1_ltrChkInOut"]/p/text()'
                )[0]
                check_time2 = tree.xpath(
                    '//span[@id="ctl00_ContentPlaceHolder1_ltrChkInOut"]/p/span/text()'
                )[0]
                check_time = [check_time1, check_time2]
            try:
                self.item['check_in_time'] = check_time[0].split(':')[1]
                self.item['check_out_time'] = check_time[1].split(':')[1]
            except Exception:
                self.item['check_in_time'] = ''
                self.item['check_out_time'] = ''
        except Exception:
            self.item['check_in_time'] = ''
            self.item['check_out_time'] = ''
        try:
            accepted_card_info = tree.xpath(
                '//span[contains(@id, "ctl00_ContentPlaceHolder1_ltrPayment")]/p/text()'
            )
            if not accepted_card_info:
                accepted_card_info = tree.xpath(
                    '//span[contains(@id, "ctl00_ContentPlaceHolder1_ltrPayment")]/text()'
                )
            if len(accepted_card_info) <= 1:
                accepted_card_infos = accepted_card_info[0].replace(
                    ':', ':')
                accepted_cards = accepted_card_infos.split(':')[-1].replace(
                    '、', '|').replace(',', '|'). \
                    replace(u'及', "|").replace('。', '')
            else:
                accepted_cards = accepted_card_info[-1].replace(
                    '、', '|').replace(',', '|'). \
                    replace(u'及', "|").replace('。', '')
        except Exception:
            accepted_cards = ''
        self.item['accepted_cards'] = accepted_cards
def parse_English_hotel(self, req, resp):
    """Parse the English-language Hyatt hotel page into ``self.hotel_test``.

    :param req: request template (unused here)
    :param resp: raw HTML text of the page
    :raises parser_except.ParserException: 22 when the hotel name cannot
        be found (treated as proxy failure)
    """
    data = etree.HTML(resp)
    try:
        self.hotel_test['hotel_name_en'] = data.xpath(
            "//p[@class='homePropertyName']//text()")[0]
    except Exception:
        raise parser_except.ParserException(22, '代理失效,重试')
    self.hotel_test['source'] = 'hyatt'
    self.hotel_test['brand_name'] = '凯悦'
    self.hotel_test['source_id'] = re.findall(r"var spiritCode='(.*?)'",
                                              resp, re.S)[0].encode('utf8')
    latitude = re.findall(r'"latitude" : "(.*?)"', resp,
                          re.S)[0].encode('utf8')
    # Bug fix: the longitude regex previously matched "latitude" as well,
    # so map_info contained the latitude twice.
    longitude = re.findall(r'"longitude" : "(.*?)"', resp,
                           re.S)[0].encode('utf8')
    self.hotel_test['map_info'] = longitude + ',' + latitude
    self.hotel_test['address'] = data.xpath(
        "//p[@class='address']//text()")[0]
    self.hotel_test['hotel_city'] = re.findall(r'hotel_city:"(.*?)"', resp,
                                               re.S)[0].encode('utf-8')
    self.hotel_test['hotel_country'] = data.xpath(
        "//p[@class='address']/span[1]/text()")[0]
    try:
        self.hotel_test['hotel_postal_code'] = data.xpath(
            "//p[@class='address']/span[2]/text()")[0]
    except Exception:
        self.hotel_test['hotel_postal_code'] = 'NULL'
    self.hotel_test['star'] = 5
    # grade/review_num are filled elsewhere; keep whatever is there.
    self.hotel_test['grade'] = self.hotel_test['grade']
    self.hotel_test['review_num'] = self.hotel_test['review_num']
    wifi = data.xpath(
        "//img[@src='/content/dam/PropertyWebsites/andaz/nycaw/Media/All/xBHSG_RightRail_Four_101216.png.pagespeed.ic.1FAby01L8_.png']"
    )
    if len(wifi):
        self.hotel_test['has_wifi'] = 'Free WiFi'
        self.hotel_test['is_wifi_free'] = 'YES'
    else:
        self.hotel_test['has_wifi'] = ''
        self.hotel_test['is_wifi_free'] = 'NULL'
    self.hotel_test['has_parking'] = 'NULL'
    self.hotel_test['is_parking_free'] = 'NULL'
    self.hotel_test['services'] = ''
    imgs = data.xpath(
        "//div[@class='carousel fullWidth floatL']//a/img/@src")[0]
    url = self.url_en.split('/en')[0]
    self.hotel_test['img_items'] = url + imgs
    # Bug fix: description is a string; the original ran
    # ``while '\n' in s: s.remove('\n')`` which raises AttributeError
    # (str has no .remove) whenever a newline was present. Strip newlines
    # with replace instead.
    self.hotel_test['description'] = data.xpath(
        "//div[@class='readMoreContent']//text()")[0].replace('\n', '')
    self.hotel_test['accepted_cards'] = 'NULL'
    self.hotel_test['check_in_time'] = self.hotel_test['check_in_time']
    self.hotel_test['check_out_time'] = self.hotel_test['check_out_time']
    self.hotel_test['hotel_url'] = self.url_en
    try:
        self.hotel_test['Img_first'] = self.url + data.xpath(
            "//div[@class='carousel fullWidth floatL']//@src")[0]
    except Exception:
        self.hotel_test['Img_first'] = ''
    phone = data.xpath("//p[@class='phnNo']/text()")[0].split('+')[1]
    self.hotel_test['hotel_phone'] = phone.replace(' ', '')
def parse_room_util(self, req, resp):
    """Shared parsing utility used by parse_Room and parse_verifyRoom.

    Parses the PriceSearchRequest XML response into room tuples, recording
    API outcomes via ``use_record_api`` when running against the online
    auth environment.

    :param req: request template; ``req['resp'].status_code`` is recorded
    :param resp: raw XML response text
    :return: list of room tuples (one per RatePlan)
    :raises parser_except.ParserException: 122 bad auth, 99 unknown
        location, 291 no rooms, ``ErrNumber.E_UNKNOWN`` on parse errors
    """
    # Auth failure sentinel in the raw body — record (online env only)
    # and abort with code 122.
    if "Invalid Auth" in resp:
        if json.loads(self.task.ticket_info['auth']).get(
                'apienv', 'test') == 'online':
            use_record_api(task=self.task,
                           api_name='PriceSearchRequest',
                           unionkey='daolvApi',
                           record_tuple=1,
                           error_id=122,
                           api_info={},
                           msg='',
                           httpcode=req['resp'].status_code,
                           resp='',
                           is_success=1)
        raise parser_except.ParserException(122, '认证信息失败')
    # Unknown-location sentinel — record and abort with code 99.
    if "Location not found for given descriptor and type" in resp:
        if json.loads(self.task.ticket_info['auth']).get(
                'apienv', 'test') == 'online':
            use_record_api(task=self.task,
                           api_name='PriceSearchRequest',
                           unionkey='daolvApi',
                           record_tuple=1,
                           error_id=99,
                           api_info={},
                           msg='',
                           httpcode=req['resp'].status_code,
                           resp='',
                           is_success=1)
        raise parser_except.ParserException(
            99, 'Location not found for given descriptor and type')
    request_info = self.task_parser()
    task = self.task
    redis_key = request_info['redis_key']
    result = []
    try:
        doc = etree.XML(resp)
        RatePlans = doc.xpath('.//RatePlan')
        hotel_name = self.get_first_str(
            doc.xpath('.//Hotel/HotelName/text()'))
        hotel_id = self.get_first_str(doc.xpath('.//Hotel/HotelID/text()'))
        CityID = str(
            self.get_first_str(
                doc.xpath('.//Hotel/Destination/@CityCode')))
        for offer_count, each in enumerate(RatePlans):
            try:
                room = Room()
                room.hotel_name = hotel_name
                room.city = request_info.get('city')
                room.source = 'daolvApi'
                room.source_hotelid = hotel_id
                room.real_source = room.source
                room.room_type = self.get_first_str(
                    each.xpath('./RatePlanName/text()'))
                room.occupancy = int(
                    self.get_first_str(
                        each.xpath('./MaxOccupancy/text()')))
                bed_type = self.get_first_str(
                    each.xpath('./BedType/text()'))
                """
                返回的信息中给出的是一个床型id,需要在api中查询具体的信息,因为更新频率低,写成了dict信息存放,更新时
                更新同目录下的bed_type_json.py即可
                """
                # BedType is an id; resolved via the static bed_type_dict
                # (maintained in bed_type_json.py alongside this module).
                bed_type = bed_type_dict.get(str(bed_type), '暂时没有对应信息')
                if isinstance(bed_type, list):
                    room.bed_type = '{0}; {1}'.format(
                        bed_type[0], bed_type[1])
                else:
                    room.bed_type = bed_type
                room.check_in = request_info.get('CheckInDate')
                room.check_out = request_info.get('CheckOutDate')
                try:
                    room.rest = int(
                        self.get_first_str(
                            each.xpath('./InventoryCount/text()')))
                except Exception as e:
                    # Missing inventory count: fall back to the number of
                    # rooms requested in the task.
                    room.rest = len(self.task.ticket_info['room_info'])
                room.price = float(
                    self.get_first_str(each.xpath('./TotalPrice/text()')))
                room.currency = each.find('Currency').text
                room.has_breakfast = self.has_breakfast(
                    self.get_first_str(
                        each.xpath('./BreakfastType/text()')))
                room.is_breakfast_free = room.has_breakfast
                return_rule = each.xpath(
                    './RatePlanCancellationPolicyList/CancellationPolicy')
                room.return_rule = self.return_rule_str(return_rule)
                # 'NULL' means no cancellation policy — not free to cancel.
                if room.return_rule == 'NULL':
                    room.is_cancel_free = 'No'
                    room.return_rule = ''
                room.others_info = json.dumps({
                    "rate_key":
                    self.get_first_str(each.xpath('./RatePlanID/text()')),
                    "room_num":
                    len(self.task.ticket_info["room_info"]),
                    "payment_info": "",
                    "rating": "",
                    "payKey": {
                        "redis_key": redis_key,
                        "uid": getuid(),
                        "id": offer_count,
                    },
                    'extra': {
                        'breakfast':
                        self.get_first_str(
                            each.xpath("./BreakfastType/text()")),
                        'payment': '',
                        'size_info': '',
                        'return_rule': room.return_rule,
                        'occ_des': str(room.occupancy),
                        'occ_num': {
                            'adult_num': room.occupancy,
                            'child_num': 0,
                        },
                        'size_info_extra': 0
                    }
                })
                room.pay_method = 'mioji'
                self.room_obj_list.append(room)
                room_tuple = (room.hotel_name, room.city, room.source,
                              room.source_hotelid, room.source_roomid,
                              room.real_source, room.room_type,
                              room.occupancy, room.bed_type, room.size,
                              room.floor, room.check_in, room.check_out,
                              room.rest, room.price, room.tax,
                              room.currency, room.pay_method,
                              room.is_extrabed, room.is_extrabed_free,
                              room.has_breakfast, room.is_breakfast_free,
                              room.is_cancel_free, room.room_desc,
                              room.return_rule, room.extrabed_rule,
                              room.change_rule, room.others_info,
                              room.guest_info)
                result.append(room_tuple)
            except Exception as e:
                # A single bad RatePlan aborts the whole parse (re-raised
                # and re-wrapped by the outer handler below).
                logger.error('field not comple %s\n', str(e))
                raise parser_except.ParserException(
                    ErrNumber.E_UNKNOWN, str(e))
    except Exception as e:
        if json.loads(self.task.ticket_info['auth']).get(
                'apienv', 'test') == 'online':
            use_record_api(task=self.task,
                           api_name='PriceSearchRequest',
                           unionkey='daolvApi',
                           record_tuple=1,
                           error_id=25,
                           api_info={},
                           msg='',
                           httpcode=req['resp'].status_code,
                           resp='',
                           is_success=0)
        raise parser_except.ParserException(ErrNumber.E_UNKNOWN, str(e))
    if result == []:
        # No rooms parsed — record and raise "no availability".
        if json.loads(self.task.ticket_info['auth']).get(
                'apienv', 'test') == 'online':
            use_record_api(task=self.task,
                           api_name='PriceSearchRequest',
                           unionkey='daolvApi',
                           record_tuple=1,
                           error_id=291,
                           api_info={},
                           msg='',
                           httpcode=req['resp'].status_code,
                           resp='',
                           is_success=0)
        raise parser_except.ParserException(291, '无房')
    elif result != []:
        if json.loads(self.task.ticket_info['auth']).get(
                'apienv', 'test') == 'online':
            # NOTE(review): this success path records is_success=0 while
            # the error paths above record is_success=1 — looks inverted;
            # confirm the intended semantics of use_record_api.
            use_record_api(task=self.task,
                           api_name='PriceSearchRequest',
                           unionkey='daolvApi',
                           record_tuple=1,
                           error_id=0,
                           api_info={},
                           msg='',
                           httpcode=req['resp'].status_code,
                           resp='',
                           is_success=0)
        return result
def __single_crawl(self, reqParse, browser, request_template, page_count):
    """Crawl a single request with retries, then convert and parse it.

    :param reqParse: request parser holding retry settings and callbacks
    :param browser: crawling browser
    :param request_template: request dict to execute
    :param page_count: page index forwarded to the parse step
    :return: parse result dict (``defaultdict(list)`` if never parsed)
    """
    # Headers from earlier links in the request chain may be reused.
    headers = request_template['req'].get('headers', None)
    use_headers = request_template['req'].get('use_headers', False)
    if headers:
        browser.add_header(headers, use_headers)
    # Default result if every attempt fails before parsing.
    res = defaultdict(list)
    # Reset per-request crawl state.
    local_req_count = 0
    reqParse.req_count = 0
    reqParse.is_forbidden = False
    reqParse.req_exception = None
    reqParse.proxy = None
    reqParse.content_length = 0
    # NOTE(review): this adds an absolute timestamp with no paired
    # subtraction (unlike the point_time blocks below) — confirm whether
    # a ``-=`` was intended here.
    self.__cpu_time += time.time() * 1000
    while local_req_count < reqParse.retry_count:
        # Count this attempt.
        local_req_count += 1
        logger.debug(
            current_log_tag() +
            '[开始抓取][ {0} ]'.format(request_template['req'].get('url', '')))
        # Retry loop: proxy errors raised during parsing also re-enter here.
        try:
            resp = reqParse.crawl_data(request_template, browser,
                                       self.task.source)
        except parser_except.ParserException as e:
            traceback.print_exc()
            if reqParse.user_exc:
                # Errors thrown by user code are propagated untouched.
                raise e
            # Error codes 21/22/23 (or an explicit retry request) retry.
            if e.code in (parser_except.PROXY_FORBIDDEN,
                          parser_except.PROXY_INVALID,
                          parser_except.REQ_ERROR,
                          parser_except.PROXY_SSL) or e.need_retry:
                reqParse.is_forbidden = True
                if local_req_count >= reqParse.retry_count or e.retry_from_first:
                    raise e
                else:
                    logger.debug(current_log_tag() + traceback.format_exc())
                    logger.debug(current_log_tag() +
                                 '[准备重试][错误由框架抛出][错误码:{0}][count:{1}]'.
                                 format(e.code, reqParse.req_count))
                    continue
            else:
                raise e
        except Exception as e:
            if reqParse.user_exc:
                # Errors thrown by user code are propagated untouched.
                raise e
            if local_req_count >= reqParse.retry_count:
                raise e
            else:
                continue
        # Store the response on the template for downstream consumers.
        request_template['resp'] = resp
        # Log/store the crawl result.
        self.response_callback(request_template, resp)
        if reqParse.res_text == 'text':
            res = resp.text
        else:
            res = resp.content
        try:
            logger.debug(current_log_tag() + '[抓取结果][ {2} ][ {0} ... ... {1} ]'.format(
                res[:100], res[-100:], request_template['req']
                ['url']).replace('\n', '').replace('\t', ''))
        except Exception:
            pass
        # Local runs skip the upload step.
        if not self.debug and self.env != "local":
            md5_key = get_md5(res)
            verify_task_info = {
                'func_name': reqParse.request_func.__name__,
                'page_index': page_count,
                'retry_count': local_req_count - 1,
                'md5_key': md5_key
            }
            # Queue the crawled page for upload to ucloud.
            self.task_post_process_queue.put((res, self.task, md5_key))
            self.verify_data['data'].append(verify_task_info)
        point_time = time.time() * 1000
        try:
            convert_data = reqParse.convert(request_template, res)
        except Exception:
            if local_req_count >= reqParse.retry_count:
                logger.debug(current_log_tag() + traceback.format_exc())
                raise parser_except.ParserException(
                    parser_except.DATA_FORMAT_ERROR,
                    '[traceback: {0}]'.format(traceback.format_exc()))
            else:
                continue
        finally:
            # Account conversion time against cpu_time.
            self.__cpu_time += time.time() * 1000 - point_time
        # Parsing stage.
        point_time = time.time() * 1000
        try:
            res = reqParse.parse(request_template,
                                 self.__targets_parser_func_dict,
                                 convert_data, page_count,
                                 self._crawl_targets_required)
            break
        except parser_except.ParserException as e:
            # Proxy errors raised by the spider's parser also trigger retry.
            if e.code in (parser_except.PROXY_FORBIDDEN,
                          parser_except.PROXY_INVALID):
                reqParse.is_forbidden = True
                if local_req_count >= reqParse.retry_count or e.retry_from_first:
                    raise e
                else:
                    logger.debug(current_log_tag() +
                                 '[准备重试][错误由爬虫抛出][错误码:{0}]'.format(e.code))
                    convert_data = None
                    continue
            else:
                raise e
        except Exception:
            raise parser_except.ParserException(
                parser_except.PARSE_ERROR,
                '[traceback:{0}]'.format(traceback.format_exc()))
        finally:
            # Account parsing time against cpu_time.
            self.__cpu_time += time.time() * 1000 - point_time
    self.response_callback(request_template, resp)
    have_ticket = False
    for k, v in res.items():
        if not v:
            continue
        self._asy_temp_result[k] += v
        have_ticket = True
    # Only slave-invoked spiders with results fire the async callback.
    if have_ticket and self.process_callback and not self.debug and self.env != "local":
        self.process_callback(task=self.task,
                              spider=self,
                              result_type="RUNNING")
    return res