def hotel_list_database(source, url, required, old_spider_name, need_cache=True):
    """Crawl a daodao list-info page and return the parsed result.

    :param source: source name; lower-cased then capitalized to build the
        task source tag (e.g. 'daodao' -> 'DaodaoListInfo').
    :param url: path joined onto the module-level URL base via urljoin.
    :param required: single crawl key (wrapped in a list for spider.crawl).
    :param old_spider_name: suffix appended to 'daodao' to pick the spider.
    :param need_cache: use cache_config when True, none_cache_config otherwise.
    :returns: (code, result-for-required, others_info dict, page_store_key_list)
    :raises: re-raises any crawl exception after logging the traceback.
    """
    try:
        task = Task()
        task.content = urljoin(URL, url)
        logger.info('%s %s' % (task.content, required))
        task.source = source.lower().capitalize() + 'ListInfo'
        spider = factory.get_spider_by_old_source('daodao' + old_spider_name)
        spider.task = task
        config = cache_config if need_cache else none_cache_config
        code = spider.crawl(required=[required], cache_config=config)
        others_info = {
            'result': spider.result,
            'save_page': json.dumps(spider.save_page),
            'view_page_info': spider.view_page_info,
            'restaurant_page_info': spider.restaurant_page_info
        }
        return code, spider.result.get(
            required, {}), others_info, spider.page_store_key_list
    except Exception:
        # BUG FIX: traceback.format_exc(e) passed the exception object as the
        # `limit` argument; call it with no args. Bare `raise` preserves the
        # original traceback (the old `raise e` truncated it in Python 2).
        logger.error(traceback.format_exc())
        raise
def hotel_list_database(source, city_id): task = Task() task.content = str(city_id) + '&' + '2&{nights}&{check_in}'.format( **hotel_rooms) spider = factory.get_spider_by_old_source(source + 'ListHotel') spider.task = task print spider.crawl(required=['hotel']) return spider.result
def hotel_list_database(source, city_id, check_in):
    """Crawl the <source>ListHotel list for a city/date; return (code, result)."""
    crawl_task = Task()
    crawl_task.content = '%s&2&1&%s' % (str(city_id), check_in)
    crawl_task.source = source + 'ListHotel'
    list_spider = factory.get_spider_by_old_source(crawl_task.source)
    list_spider.task = crawl_task
    status = list_spider.crawl(required=['hotel'])
    return status, list_spider.result
def hotel_list_database(source, url):
    """Crawl a daodao restaurant-list page; return (code, restaurant dict)."""
    info_task = Task()
    info_task.content = URL + url
    info_task.source = source.lower().capitalize() + 'ListInfo'
    info_spider = factory.get_spider('daodao', info_task.source)
    info_spider.task = info_task
    status = info_spider.crawl(required=['restaurant'])
    return status, info_spider.result.get('restaurant', {})
def hilton_to_database(tid, used_times, source, keyword, extra, spider_tag, need_cache=True):
    """Run a suggest crawl for *keyword*; return (error_code, suggest list).

    ``tid``/``used_times``/``source`` are accepted for interface parity but
    are not used in this function.
    """
    suggest_task = Task()
    suggest_task.content = keyword
    suggest_task.extra = extra
    suggest_spider = factory.get_spider_by_old_source(spider_tag)
    suggest_spider.task = suggest_task
    chosen_config = cache_config if need_cache else none_cache_config
    error_code = suggest_spider.crawl(required=['suggest'],
                                      cache_config=chosen_config)
    logger.info(
        str(len(suggest_spider.result['suggest'])) + ' -- ' + keyword)
    return error_code, suggest_spider.result['suggest']
def hotel_tax_detail(self, task_content, city_id, **kwargs):
    """Celery-style task: fetch hotel tax info and persist the latest record.

    :param task_content: raw content string handed to ``hotel_tax``.
    :param city_id: city identifier stored alongside the parsed data.
    :param kwargs: may carry ``task_id`` to mark the queue entry done.
    Retries the task (``self.retry``) on any error.
    """
    try:
        task = Task()
        task.content = task_content
        result = hotel_tax(task, city_id)
        # Last entry of the last value list (py2 dict.values() is a list).
        # NOTE(review): dict order is arbitrary — presumably a single-key
        # result; confirm against hotel_tax.
        data = result.values()[-1][-1]
        data['task_content'] = task_content
        data['city_id'] = city_id
        table.insert(data)
        if kwargs.get('task_id'):
            update_task(kwargs['task_id'])
    except Exception as exc:
        # BUG FIX: traceback.format_exc(exc) passed the exception object as
        # the `limit` argument; call it with no args for the formatted trace.
        self.retry(exc=traceback.format_exc())
def hotel_url_to_database(source, keyword, need_cache=False):
    """Look up a hotel by name/url keyword via <source>ListHotel.

    Returns (error_code, hotel result); the error code is also printed.
    """
    lookup_task = Task()
    lookup_task.ticket_info['url'] = keyword
    lookup_task.ticket_info['hotel_name'] = keyword
    lookup_spider = factory.get_spider_by_old_source(source + 'ListHotel')
    lookup_spider.task = lookup_task
    chosen_config = cache_config if need_cache else none_cache_config
    error_code = lookup_spider.crawl(required=['hotel'],
                                     cache_config=chosen_config)
    print(error_code)
    return error_code, lookup_spider.result['hotel']
def hotel_detail_database(url, source, need_cache=True):
    """Crawl a <source>DetailHotel page; return (code, result, page keys)."""
    detail_task = Task()
    detail_task.content = url
    detail_spider = factory.get_spider_by_old_source(source + 'DetailHotel')
    detail_spider.task = detail_task
    detail_spider.task.source = source
    chosen_config = cache_config if need_cache else none_cache_config
    error_code = detail_spider.crawl(required=['hotel'],
                                     cache_config=chosen_config)
    logger.info(
        str(detail_task.ticket_info) + ' -- ' + detail_task.content + '--' + str(error_code))
    return error_code, detail_spider.result, detail_spider.page_store_key_list
def hilton_to_database(tid, used_times, source, source_id, city_id, check_in, need_cache=True):
    """Crawl hilton list+room data; return (code, room result, page keys).

    ``source`` is unused — the spider tag is hard-wired to 'hiltonHotel2'.
    """
    room_task = Task()
    room_task.content = 'NULL&%s&%s&2&%s' % (str(city_id), str(source_id), check_in)
    room_task.ticket_info = {
        'tid': tid,
        'used_times': used_times,
        'room_info': [{"occ": 2, "num": 1}]
    }
    hilton_spider = factory.get_spider_by_old_source('hiltonHotel2')
    hilton_spider.task = room_task
    chosen_config = cache_config if need_cache else none_cache_config
    error_code = hilton_spider.crawl(required=['list', 'room'],
                                     cache_config=chosen_config)
    print(error_code)
    logger.info(str(hilton_spider.result['room']) + ' -- ' + room_task.content)
    return error_code, hilton_spider.result['room'], hilton_spider.page_store_key_list
def poidetail_to_database(tid, used_times, source, url, need_cache=True):
    """Crawl a POI detail page; return (code, POIdetail result, page keys)."""
    poi_task = Task()
    poi_task.content = url
    poi_task.ticket_info = {
        'tid': tid,
        'used_times': used_times
    }
    print (source + '_detail')
    poi_spider = factory.get_spider_by_old_source(source + '_detail')
    poi_spider.task = poi_task
    chosen_config = cache_config if need_cache else none_cache_config
    error_code = poi_spider.crawl(required=['POIdetail'],
                                  cache_config=chosen_config)
    print(error_code)
    logger.info(str(poi_spider.result['POIdetail']) + ' -- ' + poi_task.content)
    return error_code, poi_spider.result['POIdetail'], poi_spider.page_store_key_list
def GTdetail_to_database(tid, used_times, source, ticket, need_cache=True):
    """Crawl a vacation-product detail page.

    Returns (error_code, vacation result, page_store_key_list); the spider
    tag is '<source>|vacation_detail'.
    """
    vacation_task = Task()
    vacation_task.ticket_info = {
        'tid': tid,
        'vacation_info': ticket,
        'source': source,
        'used_times': used_times
    }
    vacation_spider = factory.get_spider_by_old_source(
        '{}|vacation_detail'.format(source))
    vacation_spider.task = vacation_task
    chosen_config = cache_config if need_cache else none_cache_config
    error_code = vacation_spider.crawl(required=['vacation'],
                                       cache_config=chosen_config)
    print(error_code)
    return error_code, vacation_spider.result['vacation'], vacation_spider.page_store_key_list
def qyer_list_to_database(tid, used_times, source, city_id, check_in, city_url, need_cache=True):
    """Crawl the qyer POI list for *city_url*.

    Returns (code, list result, page keys, types_result_num). ``source``,
    ``city_id`` and ``check_in`` are accepted but unused here.
    """
    qyer_task = Task()
    qyer_task.content = city_url
    qyer_task.ticket_info = {'tid': tid, 'used_times': used_times}
    qyer_spider = factory.get_spider_by_old_source('qyerList')
    qyer_spider.task = qyer_task
    chosen_config = cache_config if need_cache else none_cache_config
    error_code = qyer_spider.crawl(required=['list'],
                                   cache_config=chosen_config)
    print(error_code)
    logger.info(str(qyer_spider.result['list']) + ' -- ' + qyer_task.content)
    return (error_code, qyer_spider.result['list'],
            qyer_spider.page_store_key_list, qyer_spider.types_result_num)
def hotel_list_database(source, check_in, suggest_type='1', suggest=''):
    """New-style list crawl driven by suggest info, never cached.

    The spider tag and required keys come from the module-level OLD_SOURCE /
    REQUIRED. Returns (error_code, result, page_store_key_list). ``source``
    is accepted for interface parity but unused.
    """
    # Build the task: new-type tasks carry everything in ticket_info.
    new_task = Task()
    new_task.ticket_info = {
        "is_new_type": True,
        "suggest_type": int(suggest_type),
        "suggest": suggest,
        "check_in": str(check_in),
        "stay_nights": '1',
        "occ": '2',
        'is_service_platform': True,
        'tid': uuid.uuid4(),
        'used_times': random.randint(1, 6),
    }
    new_task.content = ''
    # Build the spider and crawl with caching disabled.
    list_spider = factory.get_spider_by_old_source(OLD_SOURCE)
    list_spider.task = new_task
    error_code = list_spider.crawl(required=REQUIRED, cache_config=False)
    return error_code, list_spider.result, list_spider.page_store_key_list
def hotel_url_to_database(tid, used_times, source, keyword, spider_tag, need_cache=False):
    """Search a hotel by name; retry once when the first pass finds <= 2 hits.

    Returns the FIRST crawl's error_code, the merged result dict and the
    first spider's ``user_datas['search_result']``. ``tid``/``used_times``/
    ``source``/``need_cache`` are accepted but unused.
    """
    first_task = Task()
    first_task.ticket_info['hotel_name'] = keyword
    first_spider = factory.get_spider_by_old_source(spider_tag)
    first_spider.task = first_task
    error_code = first_spider.crawl(required=['hotel'],
                                    cache_config=none_cache_config)
    merged = first_spider.result
    if len(merged['hotel']) <= 2:
        # Too few hits — run the identical search again and merge new hits in.
        retry_task = Task()
        retry_task.ticket_info['hotel_name'] = keyword
        retry_spider = factory.get_spider_by_old_source(spider_tag)
        retry_spider.task = retry_task
        retry_spider.crawl(required=['hotel'], cache_config=none_cache_config)
        for hit in retry_spider.result['hotel']:
            merged['hotel'].append(hit)
    return error_code, merged, first_spider.user_datas['search_result']
    # NOTE(review): tail of a function whose `def` line is outside this view
    # (the __main__ block below calls it as `task_change_sass`). Original
    # indentation was lost in extraction; reconstructed — the first
    # `return tasks` presumably closes a guard branch for missing/empty
    # room_info. TODO confirm against the full file.
        return tasks
    room = room_info[0]
    if 'occ' not in room:
        # No occupancy info — leave the task unchanged.
        return tasks
    # Build `num` identical rooms, each holding `occ` adults (age 25 used as
    # a placeholder adult age).
    occ = int(room["occ"])
    num = int(room["num"])
    adults = []
    new_room = []
    for i in range(occ):
        adults.append(25)
    for i in range(num):
        new_room.append({"adult_info": adults, "child_info": []})
    # Rewrite room_info into the new (sass) per-room format.
    tasks.ticket_info["room_info"] = new_room
    return tasks


if __name__ == '__main__':
    from mioji.common.task_info import Task
    task = Task()
    task.ticket_info['room_info'] = [{
        "adult_info": [
            33,
        ],
        "child_info": [7, 2]
    }, {
        "adult_info": [33, 22],
        "child_info": [2, 3]
    }]
    print task_change_sass(task)
        # NOTE(review): trailing retry line of a preceding Celery task whose
        # body lies outside this view; indentation reconstructed.
        self.retry(exc=traceback.format_exc(exc))


@app.task(bind=True, base=BaseTask, max_retries=3, rate_limit='120/s')
def hotel_tax_detail(self, task_content, city_id, **kwargs):
    """Celery task: fetch hotel tax data and insert the latest record.

    ``kwargs`` may carry ``task_id`` to mark the queue entry done; any error
    triggers a retry (max_retries=3).
    """
    try:
        task = Task()
        task.content = task_content
        result = hotel_tax(task, city_id)
        # Last entry of the last value list (py2 dict.values() is a list);
        # presumably hotel_tax returns a single-key mapping — TODO confirm.
        data = result.values()[-1][-1]
        data['task_content'] = task_content
        data['city_id'] = city_id
        table.insert(data)
        if kwargs.get('task_id'):
            update_task(kwargs['task_id'])
    except Exception as exc:
        # NOTE(review): format_exc(exc) passes the exception as the `limit`
        # argument, and celery's retry expects an exception rather than a
        # string — left untouched here.
        self.retry(exc=traceback.format_exc(exc))


if __name__ == '__main__':
    # print hotel_list_database('booking', '51211')
    # print hotel_list_database('expedia', '10001')
    # raise Exception()
    task = Task()
    # hotel_url
    hotel_url = "https://www.expedia.com.hk/cn/Hotels-Hotel-Romance-Malesherbes-By-Patrick-Hayat.h1753932.Hotel-Information?chkin=2017%2F5%2F20&chkout=2017%2F5%2F21&rm1=a2®ionId=0&hwrqCacheKey=95ac5f10-6c82-4163-9959-901ddc9c674aHWRQ1493094040336&vip=false&c=1993f64d-88df-4719-a274-c3cf51ad721f&&exp_dp=885.37&exp_ts=1493094041525&exp_curr=HKD&exp_pg=HSR"
    # Strip the query string and append the "?&<occ>&<date>" suffix hotel_tax expects.
    task.content = hotel_url.split('?')[0] + "?&1&20170910"
    print task.content
    print hotel_tax(task, '10001')
def get_proxy(
        source=None,
        allow_ports=None,
        forbid_ports=None,
        allow_regions=None,
        forbid_regions=None,
        user='******',
        passwd='realtime',
        proxy_info=None,
        verify_info="verify",
        ip_num=1,
        ip_type="internal",
        task=None,
):
    """Fetch one proxy from the proxy service.

    Retries the HTTP request up to 3 times, then raises
    ``parser_except.ParserException(21, ...)``; also raises when the returned
    proxy string is empty or obviously malformed (< 9 chars).

    BUG FIX: the list/dict defaults and ``task=Task()`` were mutable defaults
    evaluated once at import time and shared across all calls; they are now
    created per call. The port/region filter params are unused but kept for
    interface compatibility.

    :returns: ``[proxy_ip, [raw_response, elapsed_seconds, query_string]]``
    """
    allow_ports = [] if allow_ports is None else allow_ports
    forbid_ports = [] if forbid_ports is None else forbid_ports
    allow_regions = [] if allow_regions is None else allow_regions
    forbid_regions = [] if forbid_regions is None else forbid_regions
    proxy_info = {} if proxy_info is None else proxy_info
    task = Task() if task is None else task
    qid = str(task.ticket_info.get('qid', int(time.time() * 1000)))
    msg = {
        "req": [{
            "source": source,
            "type": verify_info,
            "num": ip_num,
            "ip_type": ip_type,
        }]
    }
    msg = json.dumps(msg)
    ptid = task.ticket_info.get('ptid', "")
    time_st = time.time()
    get_info = '/?type=px001&qid={0}&query={1}&ptid={2}&tid=tid&ccy=AUD'.format(
        qid, msg, ptid)
    logger.info("get proxy info :http://{1}{0}".format(get_info,
                                                       g_config.proxy_host))
    count = 1
    while 1:
        try:
            p = requests.get("http://{0}".format(g_config.proxy_host) + get_info,
                             timeout=(6, 6),
                             stream=False)
            p_time = p.elapsed.total_seconds()
            p = p.content
            logger.info("代理返回内容为{0}".format(p))
            proxy_ip = json.loads(p)['resp'][0]['ips'][0]['inner_ip']
            break
        except Exception:
            # Narrowed from a bare `except:` so Ctrl-C / SystemExit still
            # propagate. `ip` is presumably a module-level local address used
            # for warn() reporting — TODO confirm.
            exstr = traceback.format_exc()
            msg = '取代理请求时报错,错误信息为:' + exstr
            info = warn(qid, 'ex_GetProxyFail', ip, msg)
            logger.debug("\n" + info)
            if count == 3:
                raise parser_except.ParserException(21, "取代理时失败")
            time.sleep(3)
            logger.debug("取代理失败,进行第{}次重试,".format(count))
            count += 1
    time_end = time.time() - time_st
    # The proxy service occasionally returns a junk entry like ":".
    if len(proxy_ip) < 9:
        msg = "获取到的代理不可用!"
        info = warn(qid, 'ex_GetProxyFail', ip, msg)
        logger.debug("\n" + info)
        raise parser_except.ParserException(21, "获取到的代理有误:{}".format(p))
    if not proxy_ip:
        msg = '未获取到代理,请求信息为:' + get_info
        info = warn(qid, 'ex_GetProxyFail', ip, msg)
        logger.debug("\n" + info)
        raise parser_except.ParserException(21, "未获取到代理")
    if p_time > 1.5:
        # Warn when the proxy service itself is slow.
        msg = '获取代理成功耗时, 耗时:{0}, requests 记录超时时间:{1}'.format(time_end, p_time)
        info = warn(qid, 'ex_GetProxyFail', ip, msg)
        logger.debug("\n" + info)
    p = [proxy_ip, [p, time_end, get_info]]
    return p
def hotel_list_database(tid, used_times, source, city_id, check_in,
                        is_new_type=False, suggest_type='1', suggest='',
                        need_cache=True, flag=False):
    """Dispatch a hotel list (or filter) crawl across many sources.

    Old-type tasks (is_new_type=False) encode their parameters into
    ``task.content`` with a per-source layout; new-type tasks carry
    everything in ``ticket_info``. With ``flag`` True the <source>FilterHotel
    spider is used (required=['filter']) instead of <source>ListHotel
    (required=['hotel']).

    :returns: (error_code, spider.result, spider.page_store_key_list)
    """
    task = Task()
    task.source = source
    if not is_new_type:
        # Old-style content layout differs per source.
        if source == 'hilton':
            task.content = check_in
        elif source == 'starwood':
            task.content = suggest + '&'
        elif source in ['hyatt']:
            task.content = ''
        elif source == 'gha':
            task.content = suggest
        else:
            # Generic layout: "<city>&2&1&<check_in>" (2 adults, 1 night).
            task.content = str(city_id) + '&' + '2&1&{0}'.format(check_in)
        task.ticket_info = {
            "is_new_type": False,
            'is_service_platform': True,
            'tid': tid,
            'used_times': used_times
        }
        if source == 'bestwest':
            # For bestwest, `suggest` is "<description>&<lng>,<lat>".
            description = suggest.split('&')[0]
            map_info = suggest.split('&')[1]
            map_info = map_info.split(',')
            task.content = '&{}&{}&2'.format(description, check_in)
            # NOTE(review): this REPLACES the ticket_info built above,
            # dropping tid/used_times/is_new_type — looks intentional for
            # bestwest, but confirm against the spider.
            task.ticket_info = {
                'locationLng': float(map_info[0]),
                'locationLat': float(map_info[1])
            }
            # task.content = '&印度喀拉拉邦恰拉库德伊&20180525&2'
            # task.ticket_info = {'locationLng': '13.404954', 'locationLat': '52.5200066'}
    else:
        # New-type: all parameters travel inside ticket_info; content empty.
        task.ticket_info = {
            "is_new_type": True,
            "suggest_type": int(suggest_type),
            "suggest": suggest,
            "check_in": str(check_in),
            "stay_nights": '1',
            "city_id": city_id,
            "occ": '2',
            'is_service_platform': True,
            'tid': tid,
            'used_times': used_times,
        }
        task.content = ''
    print(task.ticket_info)
    if flag:
        old_spider_tag = source + 'FilterHotel'
        required = ['filter']
    else:
        old_spider_tag = source + 'ListHotel'
        required = ['hotel']
    spider = factory.get_spider_by_old_source(old_spider_tag)
    spider.task = task
    if need_cache:
        error_code = spider.crawl(required=required, cache_config=cache_config)
    else:
        error_code = spider.crawl(required=required,
                                  cache_config=none_cache_config)
    # logger.info(str(task.ticket_info) + ' -- ' + '-'+str(error_code)+'-' +task.content)
    # logger.info(str(spider.result['hotel'][:100]))
    return error_code, spider.result, spider.page_store_key_list
        # NOTE(review): tail of a detail-parse method whose `def` (and the
        # matching `if` of this `else`) lie outside this view; indentation
        # reconstructed. Sets the first image then returns the hotel as a
        # plain dict (to_dict presumably yields a JSON string — confirm).
        else:
            self.hotel.Img_first = self.img_first
        res = self.hotel.to_dict()
        res = json.loads(res)
        return res


if __name__ == '__main__':
    from mioji.common.task_info import Task
    from mioji.common import spider
    from mioji.common.utils import simple_get_socks_proxy_new
    spider.slave_get_proxy = simple_get_socks_proxy_new
    task = Task()
    spider = ShangRiLaDetailSpider()
    spider.task = task
    task.content = 'http://www.shangri-la.com/cn/jinan/shangrila/&济南香格里拉大酒店&SLJI&中国大陆&'
    spider.crawl()
    print spider.code
    res = json.dumps(spider.result, ensure_ascii=False)
    print res
    # v_list = []
    # k_list = []
    # for k, v in res.items():
    #     pass
def parse_task(self):
    """Yield one Task per entry in the request's ``req`` JSON list.

    Reads the request-wide arguments (qid/tid/uid/...) once, then copies
    them onto each task together with the per-req fields and the
    master/redis routing info.
    """
    result = list()  # NOTE(review): never used below — dead local
    qid = self.get_argument('qid')
    tid = self.get_argument('tid')
    uid = self.get_argument('uid')
    type = self.get_argument('type')  # NOTE(review): shadows builtin; unused below
    ptid = self.get_argument('ptid')  # NOTE(review): unused below
    role = self.get_argument('role')  # NOTE(review): unused below
    csuid = self.get_argument('csuid')  # NOTE(review): unused below
    ori_type = self.get_argument('ori_type')
    req_list = json.loads(self.get_argument('req'))
    client_ip = self.request.remote_ip
    for req in req_list:
        task = Task()
        task.req_qid = qid
        task.req_uid = uid
        task.order_no = req.get('order_no', '')
        task.source = req['source']
        task.content = req['content']
        task.deadline = req.get('deadline', 0)
        task.debug = req.get('debug', False)
        task.tid = tid
        task.client_ip = client_ip
        task.ori_type = ori_type
        task.ticket_info = req['ticket_info']
        task.verify = req.get('verify', {'type': 'pre', 'set_type': 'E'})
        task.req_md5 = task.ticket_info.get('md5', 'default_md5')
        task.master_info = req.get('master_info', 'default_host')
        # NOTE(review): the default above is the STRING 'default_host', on
        # which the .get()/.split() calls below would fail — presumably
        # callers always send a dict; confirm.
        task.host = task.master_info.get('master_addr')
        task.redis_host = task.master_info.get('redis_addr').split(':')[0]
        task.redis_port = task.master_info.get('redis_addr').split(':')[-1]
        task.redis_db = task.master_info.get('redis_db')
        task.redis_passwd = task.master_info.get('redis_passwd')
        task.req_qid_md5 = task.req_qid + '-' + task.req_md5
        task.other_info = req.get('other_info', {})
        callback_type = 'scv100'
        if 'callback_type' in task.other_info:
            callback_type = task.other_info['callback_type']
        task.callback_type = callback_type
        redis_key_list = task.other_info.get('redis_key', [])
        # redis_key used to arrive as several keys; now only one is sent but
        # the list format is kept — the last element wins.
        for each in redis_key_list:
            task.redis_key = each
            task.other_info['redis_key'] = each
        # logger.info('s[{0}] id[{1}]new verify task:{2}'.format(task.source, task.new_task_id, task))
        yield task
# return hotels res = hotel.to_dict() res = json.loads(res) # print json.dumps(res,ensure_ascii=False) return res if __name__ == "__main__": from mioji.common.task_info import Task from mioji.common.utils import simple_get_socks_proxy_new, simple_get_socks_proxy from mioji.common import spider # # spider.slave_get_proxy = simple_get_socks_proxy task = Task() task.ticket_info = {} # task.content = 'https://highlandsinn.hyatt.com/en/hotel/home.html' task.content = 'https://kochibolgatty.grand.hyatt.com/en/hotel/home.html' task.content = 'https://albuquerqueairport.place.hyatt.com/en/hotel/home.html' # task.content = 'https://newyork.park.hyatt.com/en/hotel/home.html' # task.content = 'https://macae.place.hyatt.com/en/hotel/home.html' # task.content = 'https://parisvendome.park.hyatt.com/en/hotel/home.html' # task.content = 'https://saigon.park.hyatt.com/en/hotel/home.html' # task.content = 'https://toronto.park.hyatt.com/en/hotel/home.html' # task.content = 'https://toronto.park.hyatt.com/en/hotel/home.html' # task.content = 'https://seattledowntown.place.hyatt.com/en/hotel/home.html' # task.content = 'https://www.hyatt.com/en-US/hotel/italy/park-hyatt-milan/milph' task.content = 'https://www.hyatt.com/en-US/hotel/china/park-hyatt-shanghai/shaph' # task.content = 'https://www.hyatt.com/en-US/hotel/france/park-hyatt-paris-vendome/parph' # task.content = 'https://www.hyatt.com/en-US/hotel/cambodia/park-hyatt-siem-reap/repph'
            # NOTE(review): tail of an HTTP error handler whose enclosing
            # try/function lie outside this view; indentation reconstructed.
            # Extract the service's <Message> text, else raise with an empty
            # message.
            message = re.search(r'<Message>(.*)</Message>', resp.text).group(1)
        except Exception:
            message = ""
        raise parser_except.ParserException(29, message)
    raise parser_except.ParserException(
        89, "服务出错啦啊啊~~!http code: {}".format(req['resp'].status_code))


def utc_to_local(utc_time_str="2019-01-11", utc_format='%Y-%m-%d %H:%M:%S'):
    """Convert a 'YYYY-MM-DD[T...]' date string to a local epoch timestamp.

    Despite the name, time.mktime interprets the struct in LOCAL time —
    presumably intended; confirm against callers.
    """
    utc_time = utc_time_str.split("T")[0] + " 00:00:00"
    return time.mktime(time.strptime(utc_time, utc_format))


if __name__ == '__main__':
    # NOTE(review): this __main__ block is truncated at the end of the
    # visible chunk (the last commented dict is cut off mid-literal).
    task = Task()
    task.source = 'daolv hotel'
    auth = json.dumps({
        "acc_mj_uid": "daolv_001",
        "ClientID": "Mioji",
        "LicenseKey": "Mioji",
        "url": "http://api.didatravel.com",
        "apienv": "test"
    })
    # auth = json.dumps(auth)
    # task.ticket_info = {'env_name': 'test', "room_info": {"num": 2, "occ": 2}, "auth": auth, 'room_count': 1}
    # task.ticket_info = {
    #     'env_name': 'test',
    #     'room_info': [{"adult_info": [33, 44], "child_info": [9, 5]}],
    #     "auth": auth,
    #     'room_count': 1,