Example #1
0
def hotel_list_database(source,
                        url,
                        required,
                        old_spider_name,
                        need_cache=True):
    """Crawl a daodao list page and return its parsed data.

    Args:
        source: data-source name; normalized to ``<Source>ListInfo``.
        url: path joined onto the module-level ``URL`` base via urljoin.
        required: the single result key the spider must extract.
        old_spider_name: suffix appended to ``'daodao'`` to select the spider.
        need_cache: use ``cache_config`` when True, else ``none_cache_config``.

    Returns:
        Tuple ``(code, result_for_required, others_info, page_store_key_list)``.

    Raises:
        Re-raises any exception after logging the full traceback.
    """
    try:
        task = Task()
        task.content = urljoin(URL, url)
        logger.info('%s  %s' % (task.content, required))
        task.source = source.lower().capitalize() + 'ListInfo'
        spider = factory.get_spider_by_old_source('daodao' + old_spider_name)
        spider.task = task
        if need_cache:
            code = spider.crawl(required=[required], cache_config=cache_config)
        else:
            code = spider.crawl(required=[required],
                                cache_config=none_cache_config)

        # Bundle auxiliary spider output alongside the main result.
        others_info = {
            'result': spider.result,
            'save_page': json.dumps(spider.save_page),
            'view_page_info': spider.view_page_info,
            'restaurant_page_info': spider.restaurant_page_info
        }

        return code, spider.result.get(
            required, {}), others_info, spider.page_store_key_list
    except Exception:
        # BUG FIX: traceback.format_exc() takes no exception argument —
        # passing the exception abused the ``limit`` parameter.
        logger.error(traceback.format_exc())
        # BUG FIX: bare ``raise`` preserves the original traceback
        # (``raise e`` rebuilds it at this line in Python 2).
        raise
def hotel_list_database(source, city_id):
    task = Task()
    task.content = str(city_id) + '&' + '2&{nights}&{check_in}'.format(
        **hotel_rooms)
    spider = factory.get_spider_by_old_source(source + 'ListHotel')
    spider.task = task
    print spider.crawl(required=['hotel'])
    return spider.result
Example #3
0
def hotel_list_database(source, city_id, check_in):
    """Crawl the hotel list for one city and check-in date.

    Returns a tuple of (crawl error code, spider result dict).
    """
    list_task = Task()
    list_task.content = '{0}&2&1&{1}'.format(city_id, check_in)
    list_task.source = source + 'ListHotel'
    list_spider = factory.get_spider_by_old_source(list_task.source)
    list_spider.task = list_task
    status = list_spider.crawl(required=['hotel'])
    return status, list_spider.result
Example #4
0
def hotel_list_database(source, url):
    """Crawl a daodao restaurant-list page and return the parsed entries.

    Returns a tuple of (crawl error code, the ``'restaurant'`` result
    dict, or ``{}`` when the spider produced none).
    """
    info_task = Task()
    info_task.content = URL + url
    info_task.source = source.lower().capitalize() + 'ListInfo'
    info_spider = factory.get_spider('daodao', info_task.source)
    info_spider.task = info_task
    status = info_spider.crawl(required=['restaurant'])
    return status, info_spider.result.get('restaurant', {})
Example #5
0
    def parse_task(self):
        """Yield one ``Task`` per redis_key of every entry in the ``req`` argument.

        Reads the request-wide arguments (qid, tid, uid, ...) once, then
        builds a Task for each element of the JSON-encoded ``req`` list.
        Generator: a req with an empty ``other_info['redis_key']`` list
        yields nothing.
        """
        result = list()  # NOTE(review): never used below — dead local?

        # Shared (request-level) arguments.
        qid = self.get_argument('qid')
        tid = self.get_argument('tid')
        uid = self.get_argument('uid')
        type = self.get_argument('type')  # NOTE(review): read but unused; shadows builtin
        ptid = self.get_argument('ptid')  # NOTE(review): read but unused
        role = self.get_argument('role')  # NOTE(review): read but unused
        csuid = self.get_argument('csuid')  # NOTE(review): read but unused
        ori_type = self.get_argument('ori_type')
        req_list = json.loads(self.get_argument('req'))
        client_ip = self.request.remote_ip

        for req in req_list:
            task = Task()
            task.req_qid = qid
            task.req_uid = uid
            task.order_no = req.get('order_no', '')
            task.source = req['source']
            task.content = req['content']
            task.deadline = req.get('deadline', 0)
            task.debug = req.get('debug', False)
            task.tid = tid
            task.client_ip = client_ip
            task.ori_type = ori_type
            task.ticket_info = req['ticket_info']
            task.verify = req.get('verify', {'type': 'pre', 'set_type': 'E'})
            task.req_md5 = task.ticket_info.get('md5', 'default_md5')

            # NOTE(review): the string default 'default_host' has no .get();
            # every line below assumes master_info is a dict carrying
            # master_addr / redis_addr — confirm callers always send it.
            task.master_info = req.get('master_info', 'default_host')
            task.host = task.master_info.get('master_addr')

            # redis_addr is "host:port".
            task.redis_host = task.master_info.get('redis_addr').split(':')[0]
            task.redis_port = task.master_info.get('redis_addr').split(':')[-1]

            task.redis_db = task.master_info.get('redis_db')
            task.redis_passwd = task.master_info.get('redis_passwd')

            task.req_qid_md5 = task.req_qid + '-' + task.req_md5
            task.other_info = req.get('other_info', {})

            # Callback protocol defaults to 'scv100' unless overridden per req.
            callback_type = 'scv100'
            if 'callback_type' in task.other_info:
                callback_type = task.other_info['callback_type']

            task.callback_type = callback_type
            redis_key_list = task.other_info.get('redis_key', [])
            # redis_key used to carry multiple values; now only one is sent,
            # but the list format is kept for compatibility.
            for each in redis_key_list:
                task.redis_key = each
                task.other_info['redis_key'] = each
                yield task
Example #6
0
def hilton_to_database(tid, used_times, source, keyword, extra, spider_tag, need_cache=True):
    """Run the suggest spider identified by *spider_tag* for *keyword*.

    Returns a tuple of (crawl error code, the ``'suggest'`` result list).
    """
    suggest_task = Task()
    suggest_task.content = keyword
    suggest_task.extra = extra
    suggest_spider = factory.get_spider_by_old_source(spider_tag)
    suggest_spider.task = suggest_task
    config = cache_config if need_cache else none_cache_config
    status = suggest_spider.crawl(required=['suggest'], cache_config=config)
    logger.info(
        str(len(suggest_spider.result['suggest'])) + '  --  ' + keyword)
    return status, suggest_spider.result['suggest']
def hotel_tax_detail(self, task_content, city_id, **kwargs):
    """Fetch hotel tax data for one task content and persist it.

    Args:
        task_content: raw task content string handed to ``hotel_tax``.
        city_id: city identifier stored alongside the result row.
        **kwargs: may carry ``task_id`` to mark the originating task done.

    Retries (via celery ``self.retry``) on any exception.
    """
    try:
        task = Task()
        task.content = task_content
        result = hotel_tax(task, city_id)
        # Py2: dict.values() is a list — take the last row of the last group.
        data = result.values()[-1][-1]
        data['task_content'] = task_content
        data['city_id'] = city_id
        table.insert(data)
        if kwargs.get('task_id'):
            update_task(kwargs['task_id'])
    except Exception as exc:
        # BUG FIX: celery retry(exc=...) expects the exception itself;
        # traceback.format_exc(exc) also misused format_exc's ``limit`` arg.
        self.retry(exc=exc)
Example #8
0
def hotel_detail_database(url, source, need_cache=True):
    """Crawl one hotel detail page with the ``<source>DetailHotel`` spider.

    Returns a tuple of (crawl error code, spider result, page store keys).
    """
    detail_task = Task()
    detail_task.content = url
    detail_spider = factory.get_spider_by_old_source(source + 'DetailHotel')
    detail_spider.task = detail_task
    detail_spider.task.source = source
    config = cache_config if need_cache else none_cache_config
    status = detail_spider.crawl(required=['hotel'], cache_config=config)
    logger.info(
        str(detail_task.ticket_info) + '  --  ' + detail_task.content + '--' +
        str(status))
    return status, detail_spider.result, detail_spider.page_store_key_list
def poidetail_to_database(tid, used_times, source, url, need_cache=True):
    """Crawl a POI detail page via the ``<source>_detail`` spider.

    Returns a tuple of (crawl error code, the ``'POIdetail'`` result,
    page store key list).
    """
    poi_task = Task()
    poi_task.content = url
    poi_task.ticket_info = {'tid': tid, 'used_times': used_times}
    print (source + '_detail')
    poi_spider = factory.get_spider_by_old_source(source + '_detail')
    poi_spider.task = poi_task
    config = cache_config if need_cache else none_cache_config
    status = poi_spider.crawl(required=['POIdetail'], cache_config=config)
    print(status)
    logger.info(str(poi_spider.result['POIdetail']) + '  --  ' + poi_task.content)
    return status, poi_spider.result['POIdetail'], poi_spider.page_store_key_list
Example #10
0
def hilton_to_database(tid, used_times, source, source_id, city_id, check_in, need_cache=True):
    """Crawl hilton list + room data for one hotel and check-in date.

    Returns a tuple of (crawl error code, the ``'room'`` result,
    page store key list).
    """
    room_task = Task()
    # Content format: "NULL&<city_id>&<source_id>&2&<check_in>".
    room_task.content = 'NULL&{0}&{1}&2&{2}'.format(city_id, source_id, check_in)
    room_task.ticket_info = {
        'tid': tid,
        'used_times': used_times,
        'room_info': [{"occ": 2, "num": 1}]
    }
    room_spider = factory.get_spider_by_old_source('hiltonHotel2')
    room_spider.task = room_task
    config = cache_config if need_cache else none_cache_config
    status = room_spider.crawl(required=['list', 'room'], cache_config=config)
    print(status)
    logger.info(str(room_spider.result['room']) + '  --  ' + room_task.content)
    return status, room_spider.result['room'], room_spider.page_store_key_list
Example #11
0
def qyer_list_to_database(tid,
                          used_times,
                          source,
                          city_id,
                          check_in,
                          city_url,
                          need_cache=True):
    """Crawl the qyer POI list page at *city_url*.

    Returns a tuple of (crawl error code, the ``'list'`` result,
    page store key list, per-type result counts).
    """
    qyer_task = Task()
    qyer_task.content = city_url
    qyer_task.ticket_info = {'tid': tid, 'used_times': used_times}
    qyer_spider = factory.get_spider_by_old_source('qyerList')
    qyer_spider.task = qyer_task
    config = cache_config if need_cache else none_cache_config
    status = qyer_spider.crawl(required=['list'], cache_config=config)
    print(status)
    logger.info(str(qyer_spider.result['list']) + '  --  ' + qyer_task.content)
    return (status, qyer_spider.result['list'],
            qyer_spider.page_store_key_list, qyer_spider.types_result_num)
def hotel_list_database(source, check_in, suggest_type='1', suggest=''):
    """Run a new-style list crawl for OLD_SOURCE with a synthetic ticket.

    Note: *source* is accepted for signature compatibility but the spider
    is always resolved from the module-level ``OLD_SOURCE``.
    Returns (error code, spider result, page store key list).
    """
    # All request parameters travel in ticket_info; content stays empty.
    list_task = Task()
    list_task.ticket_info = {
        "is_new_type": True,
        "suggest_type": int(suggest_type),
        "suggest": suggest,
        "check_in": str(check_in),
        "stay_nights": '1',
        "occ": '2',
        'is_service_platform': True,
        'tid': uuid.uuid4(),
        'used_times': random.randint(1, 6),
    }
    list_task.content = ''

    # Resolve the spider and crawl with caching disabled.
    list_spider = factory.get_spider_by_old_source(OLD_SOURCE)
    list_spider.task = list_task
    status = list_spider.crawl(required=REQUIRED, cache_config=False)

    return status, list_spider.result, list_spider.page_store_key_list
        self.retry(exc=traceback.format_exc(exc))


@app.task(bind=True, base=BaseTask, max_retries=3, rate_limit='120/s')
def hotel_tax_detail(self, task_content, city_id, **kwargs):
    """Celery task: fetch hotel tax data for one task content and persist it.

    Args:
        task_content: raw task content string handed to ``hotel_tax``.
        city_id: city identifier stored alongside the result row.
        **kwargs: may carry ``task_id`` to mark the originating task done.

    Retries up to ``max_retries`` times on any exception.
    """
    try:
        task = Task()
        task.content = task_content
        result = hotel_tax(task, city_id)
        # Py2: dict.values() is a list — take the last row of the last group.
        data = result.values()[-1][-1]
        data['task_content'] = task_content
        data['city_id'] = city_id
        table.insert(data)
        if kwargs.get('task_id'):
            update_task(kwargs['task_id'])
    except Exception as exc:
        # BUG FIX: celery retry(exc=...) expects the exception itself;
        # traceback.format_exc(exc) also misused format_exc's ``limit`` arg.
        self.retry(exc=exc)


if __name__ == '__main__':
    # Ad-hoc manual check: run hotel_tax against one expedia hotel page.
    task = Task()
    hotel_url = "https://www.expedia.com.hk/cn/Hotels-Hotel-Romance-Malesherbes-By-Patrick-Hayat.h1753932.Hotel-Information?chkin=2017%2F5%2F20&chkout=2017%2F5%2F21&rm1=a2&regionId=0&hwrqCacheKey=95ac5f10-6c82-4163-9959-901ddc9c674aHWRQ1493094040336&vip=false&c=1993f64d-88df-4719-a274-c3cf51ad721f&&exp_dp=885.37&exp_ts=1493094041525&exp_curr=HKD&exp_pg=HSR"
    # Strip the query string and append the "?&<occ>&<check_in>" suffix.
    task.content = hotel_url.split('?')[0] + "?&1&20170910"
    print(task.content)

    print(hotel_tax(task, '10001'))
Example #14
0
        # print json.dumps(res,ensure_ascii=False)
        return res


if __name__ == "__main__":
    from mioji.common.task_info import Task
    from mioji.common.utils import simple_get_socks_proxy_new, simple_get_socks_proxy
    from mioji.common import spider
    #
    # spider.slave_get_proxy = simple_get_socks_proxy

    task = Task()
    task.ticket_info = {}
    # task.content = 'https://highlandsinn.hyatt.com/en/hotel/home.html'
    task.content = 'https://kochibolgatty.grand.hyatt.com/en/hotel/home.html'
    task.content = 'https://albuquerqueairport.place.hyatt.com/en/hotel/home.html'
    # task.content = 'https://newyork.park.hyatt.com/en/hotel/home.html'
    # task.content = 'https://macae.place.hyatt.com/en/hotel/home.html'
    # task.content = 'https://parisvendome.park.hyatt.com/en/hotel/home.html'
    # task.content = 'https://saigon.park.hyatt.com/en/hotel/home.html'
    # task.content = 'https://toronto.park.hyatt.com/en/hotel/home.html'
    # task.content = 'https://toronto.park.hyatt.com/en/hotel/home.html'
    # task.content = 'https://seattledowntown.place.hyatt.com/en/hotel/home.html'
    # task.content = 'https://www.hyatt.com/en-US/hotel/italy/park-hyatt-milan/milph'
    task.content = 'https://www.hyatt.com/en-US/hotel/china/park-hyatt-shanghai/shaph'
    # task.content = 'https://www.hyatt.com/en-US/hotel/france/park-hyatt-paris-vendome/parph'
    # task.content = 'https://www.hyatt.com/en-US/hotel/cambodia/park-hyatt-siem-reap/repph'
    # task.content = 'https://www.hyatt.com/en-US/hotel/vietnam/park-hyatt-saigon/saiph'
    # task.content = 'https://www.hyatt.com/en-US/hotel/china/park-hyatt-shanghai/shaph'
    # task.content = 'https://www.hyatt.com/en-US/hotel/saint-kitts-and-nevis/park-hyatt-st-kitts/skbph'
Example #15
0
    #     'room_info': [{"adult_info": [33, 44], "child_info": [9, 5]}],
    #     "auth": auth,
    #     'room_count': 1,
    #     # "verify_room": ["DOUBLE CITY VIEW TWO QUEEN BEDS"]
    # }
    task.redis_key = 'asdfasdfasdf'
    # 测试数据,美国 加州 奥克兰 滨水杰德微精品酒店
    # task.content = '13000&28333&3&20170809'
    # task.content = '13000&28333&3&20180610'
    # task.content = "30095&64958&1&20180905"

    for content in [
            '20977&39773&1&20190531', '20150&18131&2&20190508',
            '20070&3965&2&20190510', 'NULL&218279&3&20190531'
    ]:
        task.content = content
        task.ticket_info = {
            "room_info": [{
                "adult_info": [24],
                "child_info": [5]
            }, {
                "adult_info": [24],
                "child_info": [5]
            }],
            "auth":
            auth,
            "age_info": [["19960815", "20060815"]]
        }

        spider = DaolvSpider()
        spider.task = task
            res = json.loads(res)
            return res


if __name__ == '__main__':
    # Manual smoke-test harness for ShangRiLaDetailSpider (Python 2 prints).
    from mioji.common.task_info import Task
    from mioji.common import spider
    from mioji.common.utils import simple_get_socks_proxy_new
    spider.slave_get_proxy = simple_get_socks_proxy_new

    task = Task()
    spider = ShangRiLaDetailSpider()
    spider.task = task

    # NOTE(review): content appears to be '<url>&<name>&<code>&<region>&' — confirm.
    task.content = 'http://www.shangri-la.com/cn/jinan/shangrila/&济南香格里拉大酒店&SLJI&中国大陆&'

    spider.crawl()
    print spider.code
    res = json.dumps(spider.result, ensure_ascii=False)

    print res

    # v_list = []
    # k_list = []
    # for k, v in res.items():
    #     pass

    # dateframe = pd.
    # import codecs
    # f = codecs.open('a.csv', 'a+', encoding='utf-8')
Example #17
0
def hotel_list_database(tid,
                        used_times,
                        source,
                        city_id,
                        check_in,
                        is_new_type=False,
                        suggest_type='1',
                        suggest='',
                        need_cache=True,
                        flag=False):
    """Crawl a hotel list (or filter) for one source/city/check-in.

    Old-style sources encode their parameters in ``task.content`` with a
    source-specific format; new-style sources carry everything in
    ``task.ticket_info``.  ``flag`` selects the filter spider over the
    list spider.  Returns (error code, spider result, page store keys).
    """
    task = Task()
    task.source = source
    if is_new_type:
        # New-style request: all parameters travel in ticket_info.
        task.ticket_info = {
            "is_new_type": True,
            "suggest_type": int(suggest_type),
            "suggest": suggest,
            "check_in": str(check_in),
            "stay_nights": '1',
            "city_id": city_id,
            "occ": '2',
            'is_service_platform': True,
            'tid': tid,
            'used_times': used_times,
        }
        task.content = ''
    else:
        # Old-style request: content format depends on the source.
        if source == 'hilton':
            task.content = check_in
        elif source == 'starwood':
            task.content = suggest + '&'
        elif source in ['hyatt']:
            task.content = ''
        elif source == 'gha':
            task.content = suggest
        else:
            task.content = '{0}&2&1&{1}'.format(city_id, check_in)

        task.ticket_info = {
            "is_new_type": False,
            'is_service_platform': True,
            'tid': tid,
            'used_times': used_times
        }
        if source == 'bestwest':
            # bestwest packs "<description>&<lng>,<lat>" into *suggest*.
            pieces = suggest.split('&')
            description = pieces[0]
            coords = pieces[1].split(',')
            task.content = '&{}&{}&2'.format(description, check_in)
            task.ticket_info = {
                'locationLng': float(coords[0]),
                'locationLat': float(coords[1])
            }
    print(task.ticket_info)
    if flag:
        spider_tag, required = source + 'FilterHotel', ['filter']
    else:
        spider_tag, required = source + 'ListHotel', ['hotel']
    spider = factory.get_spider_by_old_source(spider_tag)
    spider.task = task
    config = cache_config if need_cache else none_cache_config
    error_code = spider.crawl(required=required, cache_config=config)
    return error_code, spider.result, spider.page_store_key_list