def crawl(self):
    # Resolve the address to a Meituan waimai area page before crawling.
    location = get_area_page(self.address, self.lat, self.lng)
    if not location or '404' in location:
        logger.error(u'no location page found for {}.'.format(self.address))
        return
    logger.info(u'{} location page: {}'.format(self.address, location))
    headers = [
        'host="waimai.meituan.com"',
        'referer="http://waimai.meituan.com/"',
        'user-agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"'
    ]
    browser = get_headless_chrome(headers)
    browser.get(location)
    # Replace whatever cookies the page set with our own session cookie.
    browser.delete_all_cookies()
    new_cookies = {'name': '_lxsdk_s', 'value': _LXSDK_S}
    browser.add_cookie(new_cookies)
    # Re-open the page in a fresh tab so it loads with the new cookie, close
    # the original tab, then switch to the remaining window handle.
    browser.execute_script('window.open("{}")'.format(location))
    browser.close()
    for handle in browser.window_handles:
        browser.switch_to.window(handle)
    if '403' in browser.page_source:
        logger.error('got 403 {}'.format(location))
        return
    # Wait up to 10s (polling every 0.5s) for the restaurant list to render.
    WebDriverWait(browser, 10, 0.5).until(
        expected_conditions.presence_of_element_located(
            (By.CLASS_NAME, 'rest-li')))
    restaurant_list_page = browser.page_source
    self.get_restaurant_data(restaurant_list_page)
    restaurant_list = browser.find_elements_by_css_selector(
        'div.restaurant')

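# get_headless_chrome is not shown in this section. A minimal sketch of what
# it might look like, assuming Selenium 3 with ChromeDriver: plain Selenium
# cannot inject arbitrary request headers, so this sketch only honours the
# user-agent entry via a Chrome switch and ignores the other header strings.
from selenium import webdriver

def get_headless_chrome(headers=None):
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    for header in headers or []:
        # e.g. 'user-agent="Mozilla/5.0 ..."' -> '--user-agent=Mozilla/5.0 ...'
        name, _, value = header.partition('=')
        if name == 'user-agent':
            options.add_argument('--user-agent={}'.format(value.strip('"')))
    return webdriver.Chrome(chrome_options=options)
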
def get_ele_restaurants(geohash, latitude, longitude, cookies, offset=0, limit=24):
    url = 'https://www.ele.me/restapi/shopping/restaurants'
    params = {
        'geohash': geohash,
        'latitude': latitude,
        'longitude': longitude,
        'offset': offset,
        'limit': limit,
        'extras[]': 'activities'
    }
    try:
        resp = requests.get(url, timeout=5, params=params, cookies=cookies)
        logger.info(resp.headers)
        if resp.status_code == 200:
            data = resp.json()
            for item in data:
                # image_path is a 32-char hash followed by the image format
                # (e.g. 'jpeg'); the CDN path splits it as /X/YY/rest.format.
                image_path = item['image_path']
                save_ele_restaurants.put(
                    source=SOURCE.ELE,
                    restaurant_id=item['id'],
                    name=item['name'],
                    sales=item['recent_order_num'],
                    arrive_time=item['order_lead_time'],
                    send_fee=item['float_delivery_fee'],
                    score=item['rating'],
                    latitude=item['latitude'],
                    longitude=item['longitude'],
                    image='https://fuss10.elemecdn.com/{}/{}/{}.{}'.format(
                        image_path[0:1], image_path[1:3],
                        image_path[3:], image_path[32:])
                )
            return data
    except Exception as e:
        logger.error(e, exc_info=True)

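# A hypothetical pagination loop over get_ele_restaurants (the helper name and
# stop condition are assumptions): keep advancing the offset until the API
# returns an empty page or an error (None).
def iter_ele_restaurants(geohash, latitude, longitude, cookies):
    offset = 0
    while True:
        page = get_ele_restaurants(geohash, latitude, longitude,
                                   cookies, offset=offset)
        if not page:
            break
        for restaurant in page:
            yield restaurant
        offset += len(page)
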
def get_area_page(key, lat, lng):
    url = 'http://waimai.meituan.com/geo/geohash'
    query = {'lat': lat, 'lng': lng, 'addr': key, 'from': 'm'}
    headers = {
        'host': 'waimai.meituan.com',
        'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
    }
    cookies = {'_lxsdk_s': _LXSDK_S}
    location = None
    try:
        # Redirects are disabled so the 302 Location header can be read directly.
        resp = requests.get(url, params=query, timeout=5, headers=headers,
                            allow_redirects=False, cookies=cookies)
        logger.info('get home page resp: {} {} {}'.format(
            resp.status_code, resp.content, resp.headers))
        if resp.status_code == 200:
            resp.encoding = 'utf-8'
            location = resp.json()
        elif resp.status_code == 302:
            location = resp.headers.get('location')
        else:
            logger.error(resp.content)
    except Exception as e:
        logger.error(e, exc_info=True)
    return location

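# Illustrative call (address and coordinates are placeholders): returns the
# JSON body on 200, the redirect target on 302, or None on failure.
location = get_area_page(u'Sanlitun', 39.93, 116.45)
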
def run(self):
    self.subscribe()
    while True:
        if self.signal_shutdown:
            logger.info("graceful shutdown")
            break
        # Block until a job arrives, waiting at most reserve_timeout seconds.
        job = self.beanstalk.reserve(timeout=self.reserve_timeout)
        if not job:
            continue
        try:
            self.on_job(job)
            self.delete_job(job)
        except beanstalkc.CommandFailed as e:
            logger.error(e, exc_info=True)
        except Exception as e:
            logger.error(e, exc_info=True)
            # Bury the failed job so it can be kicked and retried; after
            # three kicks, give up and delete it.
            kicks = job.stats()['kicks']
            if kicks < 3:
                self.bury_job(job)
            else:
                message = json.loads(job.body)
                logger.error("kicks reached max, deleting the job",
                             extra={'body': message})
                self.delete_job(job)

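# delete_job and bury_job are not shown in this section; minimal sketches on
# the same worker class, assuming the beanstalkc Job API (job.delete() and
# job.bury()). Any CommandFailed they raise is handled by the caller above.
def delete_job(self, job):
    job.delete()

def bury_job(self, job):
    job.bury()
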
def wrapper(*args, **kwargs):
    u_id = request.cookies.get('u_id')
    # Authorized only if the cookie exists and maps to a live session entry.
    authorized = u_id and session.get(u_id)
    logger.info(u'authorize user_id:{}, authorized:{}'.format(
        u_id, authorized))
    if not authorized:
        return jsonify(message=u'需要登录'), 401  # "login required"
    return func(*args, **kwargs)

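# The decorator that produces this wrapper is not shown; a plausible shell
# (the name login_required is an assumption), using functools.wraps so the
# wrapped view keeps its name for Flask's routing:
from functools import wraps

def login_required(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
        # ... auth check as shown above ...
        return func(*args, **kwargs)
    return wrapper
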
def on_job(cls, job):
    start = time.time()
    msg = json.loads(job.body)
    tube = msg.get('tube')
    func_name = msg.get('func_name')
    try:
        # Look up the handler registered for this tube/function pair.
        func = Subscriber.FUN_MAP[tube][func_name]
        kwargs = msg.get('kwargs')
        logger.info(u'run {} args:{}'.format(func_name, kwargs))
        func(**kwargs)
    except Exception as e:
        logger.error(e, exc_info=True)
    cost = time.time() - start
    logger.info('{} cost {} s'.format(func_name, cost))

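# Illustrative publisher side for on_job (the tube name, handler name, and the
# open beanstalkc.Connection called `beanstalk` are assumptions): the body must
# carry tube, func_name, and kwargs in the shape consumed above.
import json

beanstalk.use('crawler')
beanstalk.put(json.dumps({
    'tube': 'crawler',
    'func_name': 'crawl_restaurants',
    'kwargs': {'address': u'Sanlitun', 'lat': 39.93, 'lng': 116.45},
}))
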
def search_ele_address(key, latitude, longitude):
    url = 'https://www.ele.me/restapi/v2/pois'
    # The API keys nearby searches off a geohash of the given coordinates.
    _geohash = geohash.encode(latitude=float(latitude),
                              longitude=float(longitude))
    logger.info('geohash: {}'.format(_geohash))
    params = {
        'extras[]': 'count',
        'geohash': _geohash,
        'keyword': key,
        'limit': 20,
        'type': 'nearby'
    }
    try:
        resp = requests.get(url, timeout=5, params=params)
        if resp.status_code == 200:
            data = resp.json()
            return data
    except Exception as e:
        logger.error(e, exc_info=True)

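# Illustrative call (keyword and coordinates are placeholders); geohash.encode
# above presumably comes from the python-geohash package. Returns the raw POI
# JSON, or None on error.
pois = search_ele_address(u'Sanlitun', 39.93, 116.45)
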
def before_request():
    logger.info(request.path)

def get_crawler_status(crawler_id):
    u_id = request.cookies.get('u_id')
    crawler = CrawlerDao.get_by_id(crawler_id, u_id)
    logger.info(crawler['count'])
    return jsonify(crawler)

def __init__(self, func, tube):
    logger.info('register func:{} to tube:{}.'.format(func.__name__, tube))
    # Register the function under its tube so on_job can dispatch by name.
    Subscriber.FUN_MAP.setdefault(tube, {})
    Subscriber.FUN_MAP[tube][func.__name__] = func

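# Illustrative registration (the handler name crawl_restaurants is an
# assumption; FUN_MAP is presumably declared as a class-level dict): after
# this, a job with tube "crawler" and func_name "crawl_restaurants"
# dispatches to the function via on_job.
Subscriber(crawl_restaurants, 'crawler')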