Beispiel #1
0
 def crawl(self):
     """Open the area page for this crawler's address in headless Chrome.

     Looks up the area page via ``get_area_page``, loads it in a headless
     Chrome instance with the ``_lxsdk_s`` cookie set, waits for the
     restaurant list to render and passes the page source to
     ``self.get_restaurant_data``.

     Returns ``None`` early when no area page is found or when the loaded
     page contains the text ``403``.
     """
     location = get_area_page(self.address, self.lat, self.lng)
     # Bail out when the lookup returned nothing or the result contains "404".
     if not location or '404' in location:
         logger.error(u'{} not find location page.'.format(self.address))
         return
     logger.info(u'{} location page: {}'.format(self.address, location))
     # Header strings handed to get_headless_chrome; how they are applied is
     # up to that helper (not visible here).
     headers = [
         'host="waimai.meituan.com"',
         'referer="http://waimai.meituan.com/"',
         'user-agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"'
     ]
     browser = get_headless_chrome(headers)
     # First visit the page, then replace all of its cookies with the single
     # ``_lxsdk_s`` cookie.
     browser.get(location)
     browser.delete_all_cookies()
     new_cookies = {'name': '_lxsdk_s', 'value': _LXSDK_S}
     browser.add_cookie(new_cookies)
     # Re-open the same URL in a new window (now with the cookie in place),
     # close the original window, then switch through the remaining handles;
     # the driver ends up focused on the last handle in the list.
     browser.execute_script('window.open("{}")'.format(location))
     browser.close()
     for handle in browser.window_handles:
         browser.switch_to.window(handle)
     # Plain substring test on the whole page source.
     # NOTE(review): any "403" in the markup matches, not only an HTTP 403
     # error page -- confirm this is intended.
     # NOTE(review): the browser is not quit on this early return within the
     # visible code -- check for a leaked Chrome process.
     if '403' in browser.page_source:
         logger.error('got 403 {}'.format(location))
         return
     # Wait up to 10 s (polling every 0.5 s) for an element with class
     # ``rest-li`` to be present in the DOM.
     WebDriverWait(browser, 10, 0.5).until(
         expected_conditions.presence_of_element_located(
             (By.CLASS_NAME, 'rest-li')))
     restaurant_list_page = browser.page_source
     self.get_restaurant_data(restaurant_list_page)
     # NOTE(review): ``restaurant_list`` is not used within the visible code;
     # the snippet may be truncated here.
     restaurant_list = browser.find_elements_by_css_selector(
         'div.restaurant')
Beispiel #2
0
def get_ele_restaurants(geohash, latitude, longitude, cookies, offset=0, limit=24):
    """Fetch one page of ele.me restaurants and hand each one to the saver.

    Args:
        geohash: Geohash sent as the ``geohash`` query parameter.
        latitude: Latitude sent as the ``latitude`` query parameter.
        longitude: Longitude sent as the ``longitude`` query parameter.
        cookies: Cookies forwarded with the request.
        offset: Paging offset sent to the API.
        limit: Page size sent to the API.

    Returns:
        The decoded JSON body when the API answers with HTTP 200; ``None``
        for any other status or when any exception is raised (the exception
        is logged, not propagated).
    """
    endpoint = 'https://www.ele.me/restapi/shopping/restaurants'
    query = {
        'geohash': geohash,
        'latitude': latitude,
        'longitude': longitude,
        'offset': offset,
        'limit': limit,
        'extras[]': 'activities'
    }
    image_url_template = 'https://fuss10.elemecdn.com/{}/{}/{}.{}'
    try:
        response = requests.get(endpoint, timeout=5, params=query, cookies=cookies)
        logger.info(response.headers)
        if response.status_code != 200:
            return None
        restaurants = response.json()
        for entry in restaurants:
            path = entry['image_path']
            save_ele_restaurants.put(
                source=SOURCE.ELE,
                restaurant_id=entry['id'],
                name=entry['name'],
                sales=entry['recent_order_num'],
                arrive_time=entry['order_lead_time'],
                send_fee=entry['float_delivery_fee'],
                score=entry['rating'],
                latitude=entry['latitude'],
                longitude=entry['longitude'],
                # URL pieces are slices of image_path: first character,
                # next two characters, the remainder, and everything from
                # index 32 onward as the suffix after the dot.
                image=image_url_template.format(
                    path[0:1], path[1:3], path[3:], path[32:]),
            )
        return restaurants
    except Exception as err:
        logger.error(err, exc_info=True)
Beispiel #3
0
def get_area_page(key, lat, lng):
    """Look up the Meituan Waimai area page for an address.

    Sends a GET request to the ``/geo/geohash`` endpoint without following
    redirects and with the ``_lxsdk_s`` cookie attached.

    Args:
        key: Address text, sent as the ``addr`` query parameter.
        lat: Latitude, sent as the ``lat`` query parameter.
        lng: Longitude, sent as the ``lng`` query parameter.

    Returns:
        The decoded JSON body on HTTP 200, the ``location`` response header
        on HTTP 302, or ``None`` for any other status or when the request
        raises an ``Exception`` (which is logged, not propagated).
    """
    url = 'http://waimai.meituan.com/geo/geohash'
    query = {'lat': lat, 'lng': lng, 'addr': key, 'from': 'm'}
    headers = {
        'host':
        'waimai.meituan.com',
        'user-agent':
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
    }
    cookies = {'_lxsdk_s': _LXSDK_S}
    location = None
    try:
        resp = requests.get(url,
                            params=query,
                            timeout=5,
                            headers=headers,
                            allow_redirects=False,
                            cookies=cookies)
        logger.info('get home page resp: {} {} {}'.format(
            resp.status_code, resp.content, resp.headers))
        if resp.status_code == 200:
            resp.encoding = 'utf-8'
            location = resp.json()
        elif resp.status_code == 302:
            # Redirects are not followed, so the target is read from the header.
            location = resp.headers.get('location')
        else:
            logger.error(resp.content)
    except Exception as e:
        logger.error(e, exc_info=True)
    # Return outside of any ``finally`` clause: a ``return`` inside
    # ``finally`` discards an in-flight KeyboardInterrupt/SystemExit, which
    # would make the process impossible to interrupt during this call.
    return location
Beispiel #4
0
 def run(self):
     """Consume jobs from beanstalk until ``self.signal_shutdown`` is set.

     After subscribing, repeatedly reserves a job (waiting at most
     ``self.reserve_timeout``), processes it with ``self.on_job`` and
     deletes it on success.

     Failure handling:
         * ``beanstalkc.CommandFailed`` is logged and the loop continues.
         * Any other ``Exception`` is logged; the job is buried while its
           ``kicks`` stat is below 3, otherwise it is deleted.

     The shutdown flag is checked once per iteration, before reserving the
     next job, and "graceful shutdown" is logged when the loop exits.
     """
     self.subscribe()
     while not self.signal_shutdown:
         # Block waiting for a job, at most ``reserve_timeout``.
         job = self.beanstalk.reserve(timeout=self.reserve_timeout)
         if not job:
             continue
         try:
             self.on_job(job)
             self.delete_job(job)
         except beanstalkc.CommandFailed as e:
             logger.error(e, exc_info=True)
         except Exception as e:
             logger.error(e, exc_info=True)
             kicks = job.stats()['kicks']
             if kicks < 3:
                 self.bury_job(job)
             else:
                 # The body may be what caused the failure in the first
                 # place; fall back to the raw body so a malformed payload
                 # cannot crash the worker before the job is deleted.
                 try:
                     message = json.loads(job.body)
                 except (TypeError, ValueError):
                     message = job.body
                 logger.error("Kicks reach max. Delete the job",
                              extra={'body': message})
                 self.delete_job(job)
     logger.info("graceful shutdown")
 def wrapper(*args, **kwargs):
     """Call the wrapped ``func`` only for a logged-in user.

     The ``u_id`` cookie must be present and must map to a truthy value in
     ``session``; otherwise a JSON message with HTTP status 401 is returned.
     """
     user_id = request.cookies.get('u_id')
     is_authorized = user_id and session.get(user_id)
     logger.info(
         u'authorize user_id:{}, authorized:{}'.format(user_id, is_authorized))
     if is_authorized:
         return func(*args, **kwargs)
     return jsonify(message=u"需要登录"), 401
Beispiel #6
0
 def on_job(cls, job):
     """Dispatch a reserved job to the function registered for it.

     The job body is JSON with the keys ``tube``, ``func_name`` and
     ``kwargs``. The target is looked up in ``Subscriber.FUN_MAP`` and
     called with ``kwargs`` as keyword arguments; a missing or empty
     ``kwargs`` is treated as no arguments.

     Any ``Exception`` raised by the lookup or by the function is logged
     with its traceback and not propagated. The elapsed time is logged in
     both cases.

     Raises:
         ValueError: If ``job.body`` is not valid JSON (decoding happens
             outside the guarded section).
     """
     start = time.time()
     msg = json.loads(job.body)
     tube = msg.get('tube')
     func_name = msg.get('func_name')
     try:
         func = Subscriber.FUN_MAP[tube][func_name]
         # ``**None`` raises TypeError, so default to an empty mapping.
         kwargs = msg.get('kwargs') or {}
         logger.info(u'run {} args:{}'.format(func_name, kwargs))
         func(**kwargs)
     except Exception as e:
         # Log the exception object itself: ``e.message`` does not exist on
         # Python 3 exceptions and would raise AttributeError here.
         logger.error(e, exc_info=True)
     cost = time.time() - start
     logger.info('{} cost {} s'.format(func_name, cost))
Beispiel #7
0
def search_ele_address(key, latitude, longitude):
    """Search ele.me points of interest near a coordinate.

    Args:
        key: Search keyword, sent as the ``keyword`` query parameter.
        latitude: Latitude; converted with ``float`` before geohash encoding.
        longitude: Longitude; converted with ``float`` before geohash encoding.

    Returns:
        The decoded JSON body on HTTP 200; ``None`` for any other status or
        when the request raises an exception (logged, not propagated).
    """
    endpoint = 'https://www.ele.me/restapi/v2/pois'
    position_hash = geohash.encode(
        latitude=float(latitude), longitude=float(longitude))
    logger.info('geohash: {}'.format(position_hash))
    query = {
        'extras[]': 'count',
        'geohash': position_hash,
        'keyword': key,
        'limit': 20,
        'type': 'nearby'
    }
    try:
        response = requests.get(endpoint, timeout=5, params=query)
        if response.status_code != 200:
            return None
        return response.json()
    except Exception as err:
        logger.error(err, exc_info=True)
def before_request():
    """Log the path of the current request."""
    requested_path = request.path
    logger.info(requested_path)
def get_crawler_status(crawler_id):
    """Return the crawler record for the current user as JSON.

    The user is identified by the ``u_id`` cookie; the record is fetched
    with ``CrawlerDao.get_by_id`` and its ``count`` value is logged before
    the record is serialized.
    """
    user_id = request.cookies.get('u_id')
    record = CrawlerDao.get_by_id(crawler_id, user_id)
    count = record['count']
    logger.info(count)
    return jsonify(record)
Beispiel #10
0
 def __init__(self, func, tube):
     """Register ``func`` under ``tube`` in ``Subscriber.FUN_MAP``.

     The function is stored by its ``__name__``; registering a second
     function with the same name on the same tube replaces the first.
     """
     name = func.__name__
     logger.info('register func:{} to tube:{}.'.format(name, tube))
     # Create the per-tube mapping on first use, then record the function.
     Subscriber.FUN_MAP.setdefault(tube, {})[name] = func