def hotel_list_database(source, url, required, old_spider_name, need_cache=True):
    try:
        task = Task()
        task.content = urljoin(URL, url)
        logger.info('%s %s' % (task.content, required))
        task.source = source.lower().capitalize() + 'ListInfo'
        # spider = factory.get_spider('daodao', task.source)
        spider = factory.get_spider_by_old_source('daodao' + old_spider_name)
        spider.task = task
        if need_cache:
            code = spider.crawl(required=[required], cache_config=cache_config)
        else:
            code = spider.crawl(required=[required], cache_config=none_cache_config)
        others_info = {
            'result': spider.result,
            'save_page': json.dumps(spider.save_page),
            'view_page_info': spider.view_page_info,
            'restaurant_page_info': spider.restaurant_page_info
        }
        return code, spider.result.get(required, {}), others_info, spider.page_store_key_list
    except Exception:
        # format_exc() takes no exception argument; re-raise with traceback intact
        logger.error(traceback.format_exc())
        raise
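# --- usage sketch (not from the original source) ---
# How this helper might be called; the relative URL, the 'ViewSpider'
# suffix, and the success-code check are illustrative assumptions.
code, view_info, others, page_keys = hotel_list_database(
    source='daodao',
    url='/Attraction_Review-g294212-d325811.html',  # illustrative path joined onto URL
    required='view',
    old_spider_name='ViewSpider',                   # illustrative spider suffix
    need_cache=False,
)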
def hotel_list_database(source, city_id):
    task = Task()
    task.content = str(city_id) + '&' + '2&{nights}&{check_in}'.format(**hotel_rooms)
    spider = factory.get_spider_by_old_source(source + 'ListHotel')
    spider.task = task
    print spider.crawl(required=['hotel'])
    return spider.result
def hotel_list_database(source, city_id, check_in):
    task = Task()
    task.content = str(city_id) + '&' + '2&1&{0}'.format(check_in)
    task.source = source + 'ListHotel'
    spider = factory.get_spider_by_old_source(task.source)
    spider.task = task
    code = spider.crawl(required=['hotel'])
    return code, spider.result
def hotel_list_database(source, url):
    task = Task()
    task.content = URL + url
    task.source = source.lower().capitalize() + 'ListInfo'
    spider = factory.get_spider('daodao', task.source)
    # spider = factory.get_spider_by_old_source(task.source)
    # spider = DaodaoViewSpider()
    spider.task = task
    code = spider.crawl(required=['restaurant'])
    return code, spider.result.get('restaurant', {})
def parse_task(self):
    qid = self.get_argument('qid')
    tid = self.get_argument('tid')
    uid = self.get_argument('uid')
    type = self.get_argument('type')
    ptid = self.get_argument('ptid')
    role = self.get_argument('role')
    csuid = self.get_argument('csuid')
    ori_type = self.get_argument('ori_type')
    req_list = json.loads(self.get_argument('req'))
    client_ip = self.request.remote_ip
    for req in req_list:
        task = Task()
        task.req_qid = qid
        task.req_uid = uid
        task.order_no = req.get('order_no', '')
        task.source = req['source']
        task.content = req['content']
        task.deadline = req.get('deadline', 0)
        task.debug = req.get('debug', False)
        task.tid = tid
        task.client_ip = client_ip
        task.ori_type = ori_type
        task.ticket_info = req['ticket_info']
        task.verify = req.get('verify', {'type': 'pre', 'set_type': 'E'})
        task.req_md5 = task.ticket_info.get('md5', 'default_md5')
        # the default must be a dict (not the string 'default_host'),
        # otherwise the .get() calls below raise AttributeError
        task.master_info = req.get('master_info', {})
        task.host = task.master_info.get('master_addr')
        redis_addr = task.master_info.get('redis_addr', '')
        task.redis_host = redis_addr.split(':')[0]
        task.redis_port = redis_addr.split(':')[-1]
        task.redis_db = task.master_info.get('redis_db')
        task.redis_passwd = task.master_info.get('redis_passwd')
        task.req_qid_md5 = task.req_qid + '-' + task.req_md5
        task.other_info = req.get('other_info', {})
        task.callback_type = task.other_info.get('callback_type', 'scv100')
        # redis_key used to carry several keys; now only one is passed,
        # but the list format is kept
        redis_key_list = task.other_info.get('redis_key', [])
        for each in redis_key_list:
            task.redis_key = each
            task.other_info['redis_key'] = each
        # logger.info('s[{0}] id[{1}]new verify task:{2}'.format(task.source, task.new_task_id, task))
        yield task
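# --- payload sketch (not from the original source) ---
# Shape of one element of the 'req' argument that parse_task() decodes;
# every value below is an illustrative assumption. The handler receives
# this list as a JSON string via self.get_argument('req').
example_req = [{
    'source': 'hiltonHotel2',
    'content': 'NULL&10001&12345&2&20190531',
    'ticket_info': {'md5': 'abc123'},
    'verify': {'type': 'pre', 'set_type': 'E'},
    'master_info': {
        'master_addr': '10.0.0.1:8080',
        'redis_addr': '10.0.0.2:6379',
        'redis_db': 0,
        'redis_passwd': '',
    },
    'other_info': {'callback_type': 'scv100', 'redis_key': ['verify:q1']},
}]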
def hilton_to_database(tid, used_times, source, keyword, extra, spider_tag, need_cache=True):
    # tid and used_times are accepted for signature parity with the other
    # *_to_database helpers but are not used here
    task = Task()
    task.content = keyword
    task.extra = extra
    spider = factory.get_spider_by_old_source(spider_tag)
    spider.task = task
    if need_cache:
        error_code = spider.crawl(required=['suggest'], cache_config=cache_config)
    else:
        error_code = spider.crawl(required=['suggest'], cache_config=none_cache_config)
    logger.info(str(len(spider.result['suggest'])) + ' -- ' + keyword)
    return error_code, spider.result['suggest']
def hotel_tax_detail(self, task_content, city_id, **kwargs):
    try:
        task = Task()
        task.content = task_content
        result = hotel_tax(task, city_id)
        # Python 2: dict.values() returns a list; take the last entry's last row
        data = result.values()[-1][-1]
        data['task_content'] = task_content
        data['city_id'] = city_id
        table.insert(data)
        if kwargs.get('task_id'):
            update_task(kwargs['task_id'])
    except Exception:
        # format_exc() takes no exception argument
        self.retry(exc=traceback.format_exc())
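# --- invocation sketch (not from the original source) ---
# Assumes the @app.task(bind=True, ...) decorator shown later in this file;
# the URL, city_id and task_id are illustrative.
hotel_tax_detail.delay(
    'https://www.expedia.com.hk/cn/Example-Hotel.h1234.Hotel-Information?&1&20170910',
    '10001',
    task_id=42,
)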
def hotel_detail_database(url, source, need_cache=True):
    task = Task()
    task.content = url
    spider = factory.get_spider_by_old_source(source + 'DetailHotel')
    spider.task = task
    spider.task.source = source
    if need_cache:
        error_code = spider.crawl(required=['hotel'], cache_config=cache_config)
    else:
        error_code = spider.crawl(required=['hotel'], cache_config=none_cache_config)
    logger.info(str(task.ticket_info) + ' -- ' + task.content + '--' + str(error_code))
    return error_code, spider.result, spider.page_store_key_list
def poidetail_to_database(tid, used_times, source, url, need_cache=True):
    task = Task()
    task.content = url
    task.ticket_info = {
        'tid': tid,
        'used_times': used_times
    }
    print (source + '_detail')
    spider = factory.get_spider_by_old_source(source + '_detail')
    spider.task = task
    if need_cache:
        error_code = spider.crawl(required=['POIdetail'], cache_config=cache_config)
    else:
        error_code = spider.crawl(required=['POIdetail'], cache_config=none_cache_config)
    print(error_code)
    logger.info(str(spider.result['POIdetail']) + ' -- ' + task.content)
    return error_code, spider.result['POIdetail'], spider.page_store_key_list
def hilton_to_database(tid, used_times, source, source_id, city_id, check_in, need_cache=True):
    task = Task()
    # content format: NULL&<city_id>&<source_id>&<occ>&<check_in>
    task.content = 'NULL&' + str(city_id) + '&' + str(source_id) + '&' + '2&{0}'.format(check_in)
    task.ticket_info = {
        'tid': tid,
        'used_times': used_times,
        'room_info': [{"occ": 2, "num": 1}]
    }
    spider = factory.get_spider_by_old_source('hiltonHotel2')
    spider.task = task
    if need_cache:
        error_code = spider.crawl(required=['list', 'room'], cache_config=cache_config)
    else:
        error_code = spider.crawl(required=['list', 'room'], cache_config=none_cache_config)
    print(error_code)
    logger.info(str(spider.result['room']) + ' -- ' + task.content)
    return error_code, spider.result['room'], spider.page_store_key_list
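# --- usage sketch (not from the original source) ---
# The ids, ticket values and date below are illustrative assumptions.
error_code, rooms, page_keys = hilton_to_database(
    tid='t-1', used_times=1, source='hilton',
    source_id=12345, city_id=10001, check_in='20190531',
    need_cache=False)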
def qyer_list_to_database(tid, used_times, source, city_id, check_in, city_url, need_cache=True):
    task = Task()
    task.content = city_url
    task.ticket_info = {'tid': tid, 'used_times': used_times}
    spider = factory.get_spider_by_old_source('qyerList')
    spider.task = task
    if need_cache:
        error_code = spider.crawl(required=['list'], cache_config=cache_config)
    else:
        error_code = spider.crawl(required=['list'], cache_config=none_cache_config)
    print(error_code)
    logger.info(str(spider.result['list']) + ' -- ' + task.content)
    return error_code, spider.result['list'], spider.page_store_key_list, spider.types_result_num
def hotel_list_database(source, check_in, suggest_type='1', suggest=''):
    # initialize the task
    task = Task()
    task.ticket_info = {
        "is_new_type": True,
        "suggest_type": int(suggest_type),
        "suggest": suggest,
        "check_in": str(check_in),
        "stay_nights": '1',
        "occ": '2',
        'is_service_platform': True,
        'tid': uuid.uuid4(),
        'used_times': random.randint(1, 6),
    }
    task.content = ''
    # initialize the spider
    spider = factory.get_spider_by_old_source(OLD_SOURCE)
    spider.task = task
    # issue the request
    error_code = spider.crawl(required=REQUIRED, cache_config=False)
    return error_code, spider.result, spider.page_store_key_list
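# --- context sketch (not from the original source) ---
# The function reads the module-level constants OLD_SOURCE and REQUIRED;
# the names appear in the source, but the values below are assumptions.
OLD_SOURCE = 'hiltonListHotel'
REQUIRED = ['hotel']
error_code, result, page_keys = hotel_list_database(
    'hilton', '20190531', suggest_type='2', suggest='Shanghai')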
        self.retry(exc=traceback.format_exc())


@app.task(bind=True, base=BaseTask, max_retries=3, rate_limit='120/s')
def hotel_tax_detail(self, task_content, city_id, **kwargs):
    try:
        task = Task()
        task.content = task_content
        result = hotel_tax(task, city_id)
        data = result.values()[-1][-1]
        data['task_content'] = task_content
        data['city_id'] = city_id
        table.insert(data)
        if kwargs.get('task_id'):
            update_task(kwargs['task_id'])
    except Exception:
        self.retry(exc=traceback.format_exc())


if __name__ == '__main__':
    # print hotel_list_database('booking', '51211')
    # print hotel_list_database('expedia', '10001')
    # raise Exception()
    task = Task()
    # hotel_url
    hotel_url = "https://www.expedia.com.hk/cn/Hotels-Hotel-Romance-Malesherbes-By-Patrick-Hayat.h1753932.Hotel-Information?chkin=2017%2F5%2F20&chkout=2017%2F5%2F21&rm1=a2&regionId=0&hwrqCacheKey=95ac5f10-6c82-4163-9959-901ddc9c674aHWRQ1493094040336&vip=false&c=1993f64d-88df-4719-a274-c3cf51ad721f&&exp_dp=885.37&exp_ts=1493094041525&exp_curr=HKD&exp_pg=HSR"
    task.content = hotel_url.split('?')[0] + "?&1&20170910"
    print task.content
    print hotel_tax(task, '10001')
    # print json.dumps(res, ensure_ascii=False)
    return res


if __name__ == "__main__":
    from mioji.common.task_info import Task
    from mioji.common.utils import simple_get_socks_proxy_new, simple_get_socks_proxy
    from mioji.common import spider
    # # spider.slave_get_proxy = simple_get_socks_proxy
    task = Task()
    task.ticket_info = {}
    # task.content = 'https://highlandsinn.hyatt.com/en/hotel/home.html'
    task.content = 'https://kochibolgatty.grand.hyatt.com/en/hotel/home.html'
    task.content = 'https://albuquerqueairport.place.hyatt.com/en/hotel/home.html'
    # task.content = 'https://newyork.park.hyatt.com/en/hotel/home.html'
    # task.content = 'https://macae.place.hyatt.com/en/hotel/home.html'
    # task.content = 'https://parisvendome.park.hyatt.com/en/hotel/home.html'
    # task.content = 'https://saigon.park.hyatt.com/en/hotel/home.html'
    # task.content = 'https://toronto.park.hyatt.com/en/hotel/home.html'
    # task.content = 'https://seattledowntown.place.hyatt.com/en/hotel/home.html'
    # task.content = 'https://www.hyatt.com/en-US/hotel/italy/park-hyatt-milan/milph'
    task.content = 'https://www.hyatt.com/en-US/hotel/china/park-hyatt-shanghai/shaph'
    # task.content = 'https://www.hyatt.com/en-US/hotel/france/park-hyatt-paris-vendome/parph'
    # task.content = 'https://www.hyatt.com/en-US/hotel/cambodia/park-hyatt-siem-reap/repph'
    # task.content = 'https://www.hyatt.com/en-US/hotel/vietnam/park-hyatt-saigon/saiph'
    # task.content = 'https://www.hyatt.com/en-US/hotel/china/park-hyatt-shanghai/shaph'
    # task.content = 'https://www.hyatt.com/en-US/hotel/saint-kitts-and-nevis/park-hyatt-st-kitts/skbph'
# 'room_info': [{"adult_info": [33, 44], "child_info": [9, 5]}],
# "auth": auth,
# 'room_count': 1,
# # "verify_room": ["DOUBLE CITY VIEW TWO QUEEN BEDS"]
# }
task.redis_key = 'asdfasdfasdf'
# test data: waterfront boutique hotel in Oakland, California, USA
# task.content = '13000&28333&3&20170809'
# task.content = '13000&28333&3&20180610'
# task.content = "30095&64958&1&20180905"
for content in [
    '20977&39773&1&20190531',
    '20150&18131&2&20190508',
    '20070&3965&2&20190510',
    'NULL&218279&3&20190531'
]:
    task.content = content
    task.ticket_info = {
        "room_info": [{
            "adult_info": [24],
            "child_info": [5]
        }, {
            "adult_info": [24],
            "child_info": [5]
        }],
        "auth": auth,
        "age_info": [["19960815", "20060815"]]
    }
    spider = DaolvSpider()
    spider.task = task
    res = json.loads(res)
    return res


if __name__ == '__main__':
    from mioji.common.task_info import Task
    from mioji.common import spider
    from mioji.common.utils import simple_get_socks_proxy_new
    spider.slave_get_proxy = simple_get_socks_proxy_new
    task = Task()
    spider = ShangRiLaDetailSpider()
    spider.task = task
    # content: hotel URL & hotel name (Shangri-La Hotel, Jinan) & code & region (Mainland China)
    task.content = 'http://www.shangri-la.com/cn/jinan/shangrila/&济南香格里拉大酒店&SLJI&中国大陆&'
    spider.crawl()
    print spider.code
    res = json.dumps(spider.result, ensure_ascii=False)
    print res
    # v_list = []
    # k_list = []
    # for k, v in res.items():
    #     pass
    # dateframe = pd.
    # import codecs
    # f = codecs.open('a.csv', 'a+', encoding='utf-8')
def hotel_list_database(tid, used_times, source, city_id, check_in,
                        is_new_type=False, suggest_type='1', suggest='',
                        need_cache=True, flag=False):
    task = Task()
    task.source = source
    if not is_new_type:
        # old-style tasks: each source encodes its content differently
        if source == 'hilton':
            task.content = check_in
        elif source == 'starwood':
            task.content = suggest + '&'
        elif source in ['hyatt']:
            task.content = ''
        elif source == 'gha':
            task.content = suggest
        else:
            task.content = str(city_id) + '&' + '2&1&{0}'.format(check_in)
        task.ticket_info = {
            "is_new_type": False,
            'is_service_platform': True,
            'tid': tid,
            'used_times': used_times
        }
        if source == 'bestwest':
            description = suggest.split('&')[0]
            map_info = suggest.split('&')[1]
            map_info = map_info.split(',')
            task.content = '&{}&{}&2'.format(description, check_in)
            task.ticket_info = {
                'locationLng': float(map_info[0]),
                'locationLat': float(map_info[1])
            }
            # task.content = '&Chalakudy, Kerala, India&20180525&2'
            # task.ticket_info = {'locationLng': '13.404954', 'locationLat': '52.5200066'}
    else:
        task.ticket_info = {
            "is_new_type": True,
            "suggest_type": int(suggest_type),
            "suggest": suggest,
            "check_in": str(check_in),
            "stay_nights": '1',
            "city_id": city_id,
            "occ": '2',
            'is_service_platform': True,
            'tid': tid,
            'used_times': used_times,
        }
        task.content = ''
    print(task.ticket_info)
    if flag:
        old_spider_tag = source + 'FilterHotel'
        required = ['filter']
    else:
        old_spider_tag = source + 'ListHotel'
        required = ['hotel']
    spider = factory.get_spider_by_old_source(old_spider_tag)
    spider.task = task
    if need_cache:
        error_code = spider.crawl(required=required, cache_config=cache_config)
    else:
        error_code = spider.crawl(required=required, cache_config=none_cache_config)
    # logger.info(str(task.ticket_info) + ' -- ' + '-' + str(error_code) + '-' + task.content)
    # logger.info(str(spider.result['hotel'][:100]))
    return error_code, spider.result, spider.page_store_key_list
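# --- usage sketch (not from the original source); ids and dates illustrative ---
# old-style task: content is built from city_id and check_in
code, result, keys = hotel_list_database('t-1', 1, 'booking', '51211', '20190531')
# new-style task: everything travels in ticket_info and content stays empty
code, result, keys = hotel_list_database(
    't-1', 1, 'hilton', '10001', '20190531',
    is_new_type=True, suggest_type='2', suggest='Paris', need_cache=False)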