Example #1
0
    def item(self):
        """Scrape one shop detail page into a flat dict of fields.

        The page at ``self.task['url']`` is downloaded with ``get_html`` and
        parsed with lxml; every field is read through an XPath query.

        Returns:
            A dict with the keys ``url``, ``category1``..``category3``,
            ``name``, ``avg_star``, ``avg_price``, ``address``, ``phone``,
            ``open_time``, ``extra_info`` and ``belong``; or ``None`` when the
            download yields nothing or any parsing step raises.
        """
        url = self.task['url']
        page = get_html(url, cookies={"_lxsdk_s": "%7C%7C0"})
        if not page:
            return None

        try:
            tree = etree.HTML(page)

            def joined_text(query):
                # Concatenate every text node matched by the query.
                return "".join(tree.xpath(query))

            # At least two breadcrumb entries are required; the unpacking
            # raises (and is logged below) otherwise. Entries past the second
            # are merged into a single category3 string.
            crumbs = tree.xpath('//span[@class="bread-name"]/text()')
            first_crumb, second_crumb, *other_crumbs = crumbs

            return {
                "url": url,
                # The last two characters of the first breadcrumb are dropped.
                "category1": first_crumb[:-2],
                "category2": second_crumb,
                "category3": "".join(other_crumbs),
                "name": joined_text('//div[@class="shop-name"]/h1/text()'),
                # Text of the "count" span in the comment link plus a fixed suffix.
                "avg_star": joined_text(
                    '//a[@href="#t-comment"]/span[@itemprop="count"]/text()'
                ) + "封点评",
                "avg_price": joined_text('//em[@class="average"]/text()'),
                "address": joined_text('//div[@class="fl"]/span[@class="fl"]/text()'),
                "phone": joined_text('//span[@class="icon-phone"]/text()'),
                "open_time": "",
                "extra_info": tree.xpath('//div[@class="recommend"]/span/text()'),
                "belong": self.belong,
            }
        except Exception as err:
            logger.error("url:{}".format(url))
            logger.exception(err)
            return None
Example #2
0
 def save_info(cls, **payload):
     """Build a ``cls`` instance from keyword arguments and save it.

     Each keyword in ``payload`` is assigned as an attribute on a freshly
     created ``cls()`` instance, then ``save()`` is called on it.

     Returns:
         Always ``None``. A failing ``save()`` is logged and swallowed.
     """
     record = cls()
     for field, content in payload.items():
         setattr(record, field, content)

     try:
         record.save()
     except Exception as err:
         # Best effort: a failed insert is logged and does not propagate.
         logger.error("insert db error:{}".format(str(err)))
     return None
    def item(self):
        """Extract shop details from the page's ``__LEGO_WIDGETS_FALLBACK__`` state.

        The task URL is posted to the JS-render service at
        ``self.render_js_url`` together with a script that returns
        ``window.__LEGO_WIDGETS_FALLBACK__``. The widget named
        ``lego-widget-mtpc-shop-sidebar-widgets`` supplies the data.

        Returns:
            A dict of shop fields (including ``lat`` / ``lng``) for the first
            matching widget; ``None`` when the service answers with
            ``code == 0``, when no widget matches, or when parsing raises.
        """
        url = self.task['url']
        render_request = {
            "url": url,
            "script": "() => {return {state:window.__LEGO_WIDGETS_FALLBACK__}}",
        }
        payload = requests.post(self.render_js_url, json=render_request).json()

        # NOTE(review): code 0 presumably means the render failed - confirm
        # against the render service.
        if payload['code'] == 0:
            return None

        result = payload['result']
        try:
            for widget in result["state"]:
                if widget['name'] != 'lego-widget-mtpc-shop-sidebar-widgets':
                    continue

                params = widget['params']
                map_info = params['mapInfo']
                shop_info = params['shopInfo']
                city = map_info["cityName"]

                return {
                    "url": url,
                    # City name plus a fixed suffix, then the last two
                    # characters are dropped again.
                    "category1": (city + "美团")[:-2],
                    "category2": city + "学习培训",
                    "category3": "",
                    "name": map_info["shopName"],
                    "avg_star": str(shop_info['star']),
                    "avg_price": "",
                    "address": shop_info['address'],
                    "phone": shop_info['phoneNo'],
                    "open_time": "",
                    "extra_info": [],
                    "belong": self.belong,
                    "lat": str(map_info["glat"]),
                    "lng": str(map_info["glng"]),
                }
        except Exception as err:
            logger.error("url:{}, result:{}".format(self.task['url'], payload))
            logger.exception(err)
            return None
Example #4
0
    def item(self):
        """Extract POI details from the ``lego-widget-play-mt-map`` widget.

        The task URL is posted to the JS-render service at
        ``self.render_js_url`` with a script that returns
        ``window.__LEGO_WIDGETS_FALLBACK__``; the widget's
        ``params.poiInfo`` supplies every field.

        Returns:
            A dict of shop fields (including ``lat`` / ``lng``) for the first
            matching widget; ``None`` when the service answers with
            ``code == 0``, when no widget matches, or when parsing raises.
        """
        url = self.task["url"]
        render_request = {
            "url": url,
            "script": "() => {return {state:window.__LEGO_WIDGETS_FALLBACK__}}",
        }
        payload = requests.post(self.render_js_url, json=render_request).json()

        if payload['code'] == 0:
            return None
        result = payload['result']

        try:
            for widget in result["state"]:
                if widget['name'] != 'lego-widget-play-mt-map':
                    continue

                poi = widget['params']['poiInfo']
                # Exactly three breadcrumb entries are expected; any other
                # count raises and is logged below.
                first, second, third = (
                    crumb['title'] for crumb in poi['breadCrumbNavDTOList']
                )

                return {
                    "url": url,
                    "category1": first[:-2],
                    "category2": second,
                    "category3": third,
                    "name": poi["shopName"],
                    # The raw score is divided by 10 before being stringified.
                    "avg_star": str(poi['score'] / 10),
                    "avg_price": str(poi['avgPrice']),
                    "address": poi['address'],
                    "phone": poi['phone'],
                    "open_time": poi['openTime'],
                    "extra_info": [{"wifi": poi['wifi']}],
                    "belong": self.belong,
                    'lat': str(poi['lat']),
                    'lng': str(poi['lng']),
                }
        except Exception as err:
            logger.error("url:{}, result:{}".format(self.task['url'], payload))
            logger.exception(err)
            return None
Example #5
0
    def item(self):
        """Extract shop details from the page's ``window._appState`` object.

        The task URL is posted to the JS-render service at
        ``self.render_js_url`` with a script that returns
        ``window._appState``; ``state.crumbNav`` and ``state.detailInfo``
        supply the fields.

        Returns:
            A dict of shop fields (including ``lat`` / ``lng``), or ``None``
            when the service answers with ``code == 0`` or parsing raises.
        """
        url = self.task["url"]
        render_request = {
            "url": url,
            "script": "() => {return {state:window._appState}}",
        }
        payload = requests.post(self.render_js_url, json=render_request).json()

        if payload["code"] == 0:
            return None
        state = payload["result"]

        try:
            # Breadcrumb example: Beijing Meituan, Beijing food, Beijing buffet.
            # Exactly three entries are expected.
            crumb1, crumb2, crumb3 = state['state']['crumbNav']
            detail = state['state']['detailInfo']

            shop_name = detail['name']
            score = detail['avgScore']
            price = detail['avgPrice']
            address = detail['address']
            phone = detail['phone']
            open_time = detail['openTime']
            extras = detail['extraInfos']
            lat = detail['latitude']
            lng = detail['longitude']

            return {
                "url": url,
                # The last two characters of the first breadcrumb title are dropped.
                "category1": crumb1.get('title', "")[:-2],
                "category2": crumb2.get('title', ""),
                "category3": crumb3.get('title', ""),
                "name": shop_name,
                "avg_star": str(score),
                "avg_price": str(price),
                "address": address,
                "phone": phone,
                "open_time": open_time,
                "extra_info": [entry['text'] for entry in extras],
                "lat": str(lat),
                "lng": str(lng),
                "belong": self.belong,
            }
        except Exception as err:
            logger.error("url:{}, result:{}".format(self.task['url'], payload))
            logger.exception(err)
            return None
Example #6
0
    def item(self):
        """Extract POI details from the page's ``window.AppData`` object.

        The task URL is posted to the JS-render service at
        ``self.render_js_url`` with a script that returns ``window.AppData``;
        ``state.poiInfo`` supplies every field.

        Returns:
            A dict of shop fields (including ``lat`` / ``lng``), or ``None``
            when the service answers with ``code == 0`` or parsing raises.
        """
        render_request = {
            "url": self.task['url'],
            "script": "() => {return {state:window.AppData}}",
        }
        payload = requests.post(self.render_js_url, json=render_request).json()

        if payload['code'] == 0:
            return None

        try:
            poi = payload['result']['state']['poiInfo']
            city = poi["cityName"]
            # Exactly two crumbs are expected; each title is prefixed with
            # the city name.
            second, third = (city + crumb["title"] for crumb in poi["crumbs"])

            shop_name = poi["name"]
            score = poi["score"]
            price = poi["avgPrice"]
            address = poi["address"]
            phone = poi["phone"]
            open_time = poi["openTime"]
            facilities = [{"wifi": poi["wifi"], "park": poi["park"]}]
            lat = poi['lat']
            lng = poi['lng']

            return {
                "url": self.task['url'],
                "category1": city,
                "category2": second,
                "category3": third,
                "name": shop_name,
                "avg_star": str(score),
                "avg_price": str(price),
                "address": address,
                "phone": phone,
                "open_time": open_time,
                "extra_info": facilities,
                "lat": str(lat),
                "lng": str(lng),
                "belong": self.belong,
            }
        except Exception as err:
            logger.error("url:{}, result:{}".format(self.task['url'], payload))
            logger.exception(err)
            return None
Example #7
0
 def get_data(self, task):
     """Parse and persist the detail page described by ``task``.

     The category is taken from group 1 of ``self.data_regex`` matched
     against ``task['url']`` and resolved to a parser class through
     ``CategoryFactory``. The parser's ``item()`` result decides whether the
     task is reported as finished or failed.

     Args:
         task: mapping that contains at least the key ``'url'``.

     Returns:
         Always ``None``; the outcome is reported through ``spider_server``.
     """
     logger.info('start get data task.')
     url = task['url']

     # re.match returns None for a URL the pattern does not cover; calling
     # .group() on that would raise AttributeError. Treat it like an unknown
     # category and stop here.
     matched = re.match(self.data_regex, url)
     if matched is None:
         logger.error("url does not match data_regex:{}".format(url))
         return
     category = matched.group(1)

     obj = CategoryFactory.get(category)
     if not obj:
         return
     ins = obj(task, self)
     item = ins.item()
     # item() currently has three outcomes: True means it came from "cate",
     # None means failure, and a dict means success.
     if item is True:
         # Came from "cate": nothing to save or report.
         return
     if not item:
         logger.error("获取详情失败")
         spider_server.task.failure(task, [url])
     else:
         Meituan.save_info(**item)
         spider_server.task.finish(task, [url])
    def item(self):
        """Scrape one shop detail page that uses the ``breadcrumb-wrapper`` layout.

        The page at ``self.task['url']`` is downloaded with ``get_html`` and
        parsed with lxml; every field is read through an XPath query.

        Returns:
            A dict of shop fields, or ``None`` when the download yields
            nothing or field extraction raises.
        """
        url = self.task['url']
        page = get_html(url, cookies={"_lxsdk_s": "%7C%7C0"})
        if not page:
            return None

        # Parsing happens outside the try block, so a parser error propagates.
        tree = etree.HTML(page)
        try:
            # At least two breadcrumb links are required; further ones are
            # merged into category3.
            crumb_texts = tree.xpath(
                '//div[@class="breadcrumb-wrapper"]/ul/li/a/text()')
            first_crumb, second_crumb, *other_crumbs = crumb_texts

            name_parts = tree.xpath('//div[@class="shop-name"]/h1/text()')
            score_parts = tree.xpath('//a[@href="#t-comment"]/text()')
            address_parts = tree.xpath(
                '//p[@class="shop-contact address"]/text()')
            phone_parts = tree.xpath(
                '//div[@class="shop-contact telAndQQ"]/span/strong/text()')
            hours_parts = tree.xpath('//p[@class="shop-contact"]/text()')
            services = tree.xpath(
                '//div[@class="material-shop__special-services js_dialog-services"]/ul/li/text()'
            )

            return {
                "url": url,
                # The last two characters of the first breadcrumb are dropped.
                "category1": first_crumb[:-2],
                "category2": second_crumb,
                "category3": "".join(other_crumbs),
                "name": "".join(name_parts),
                "avg_star": "".join(score_parts),
                "avg_price": "",
                "address": "".join(address_parts),
                "phone": "".join(phone_parts),
                "open_time": "".join(hours_parts),
                "extra_info": services,
                "belong": self.belong,
            }
        except Exception as err:
            logger.error("url:{}".format(url))
            logger.exception(err)
            return None
Example #9
0
 def deal_forbidden(self, url, headers, proxies):
     """Fetch ``url``, retrying while the response is empty or an error page.

     A response counts as an error page when it contains a
     ``<p class="error-word">`` element. At most three fetches are made and
     each one is inspected. The earlier loop issued a fourth request whose
     result was always thrown away.

     Args:
         url: page address passed to ``get_html``.
         headers: forwarded positionally to ``get_html``.
         proxies: forwarded positionally to ``get_html``.

     Returns:
         The HTML of the first usable response, or ``''`` when every attempt
         failed or an exception was raised, so that the caller can treat the
         task as failed.
     """
     max_attempts = 3
     try:
         for _ in range(max_attempts):
             html = get_html(url, headers, proxies, cookies={"_lxsdk_s": "%7C%7C0"})
             if not html:
                 continue
             root = etree.HTML(html)
             exist = root.xpath('//p[@class="error-word"]')
             logger.info("deal_forbidden:is forbidden:{}".format(exist))
             if not exist:
                 return html
     except Exception as e:
         logger.error("deal_forbidden:{}".format(str(e)))
     # Return an empty string so the task is handled as a failure, both after
     # exhausting the attempts and after an exception.
     return ''
Example #10
0
    def item(self):
        """Follow a redirect and delegate parsing to the target's category parser.

        The task URL is requested with ``basic_request``. When the response
        was redirected, the first hop's ``Location`` header is resolved with
        ``parse_url``, ``self.task['url']`` is overwritten with the resolved
        URL, and the parser class returned by ``CategoryFactory`` for that
        URL's category produces the result.

        Returns:
            ``True`` when the request was not redirected; the delegated
            parser's ``item()`` result when it was; ``None`` when the first
            hop has no ``Location`` header or anything raises.
        """
        # Imported inside the method, as in the original.
        # NOTE(review): presumably avoids a circular import - confirm.
        from meituan.entities.factory import CategoryFactory

        # Handle the redirect case here.
        try:
            response = basic_request(self.task['url'])
            hops = response.history
            if not hops:
                # todo: no redirect happened, so this page is not processed.
                return True

            location = hops[0].headers.get("Location")
            if not location:
                return None

            target_url = parse_url(self.task['url'], location)
            category = re.match(self.spider.data_regex, target_url).group(1)
            parser_cls = CategoryFactory.get(category)
            # Replace the task URL with the redirect target.
            self.task['url'] = target_url
            return parser_cls(self.task, self.spider).item()
        except Exception as err:
            logger.error("url:{}".format(self.task['url']))
            logger.exception(err)
            return None