Esempio n. 1
0
    def jsonparse(self, response):
        """Parse one page of the shop-list JSON and emit an item per shop,
        then request the next page until ``pageCount`` is exhausted."""
        meta = response.meta
        city_name = meta['city_name']
        district_name = meta['district_name']
        page = meta['page']
        city_id = meta['city_id']
        district_id = meta['district_id']

        content = json.loads(response.body)
        for info in content['shopRecordBeanList']:
            record = info['shopRecordBean']
            items = DianpingItem()
            items['city_name'] = city_name
            items['district_name'] = district_name
            items['shop_id'] = info['shopId']
            items['brand_name'] = record['shopName']
            items['shop_name'] = record['shopTotalName']
            items['shop_address'] = info['address']
            yield items

        # Paginate: request the next page with the same meta payload.
        if page < int(content['pageCount']):
            page += 1
            url = ('http://www.dianping.com/search/map/ajax/json?cityId=%s'
                   '&categoryId=182&regionId=%s&page=%s'
                   % (city_id, district_id, page))
            yield scrapy.Request(
                url,
                headers=headers,
                meta={
                    'city_name': city_name,
                    'district_name': district_name,
                    'page': page,
                    'city_id': city_id,
                    'district_id': district_id,
                },
                callback=self.jsonparse,
                dont_filter=True)
Esempio n. 2
0
    def parseShop(self, response):
        """Extract the embedded ``window.shop_config`` payload and the rating
        fields from a shop detail page, returning them on a DianpingItem."""
        shop_info = re.findall('window.shop_config=(.*?)</script>',
                               response.text, re.S)

        # BUG FIX: the original did json.dumps(shop_info[0]) followed by
        # json.loads(...) — a round trip that returns the raw *string*,
        # never a parsed object.  Parse the captured JSON text directly.
        # NOTE(review): assumes the inlined shop_config is valid JSON — if it
        # is a bare JS object literal this will raise ValueError; confirm
        # against the live page source.
        shop_info = json.loads(shop_info[0])
        star = response.xpath(
            '//*[@id="basic-info"]/div[1]/span[1]/@title').extract()
        reviewCount = response.xpath('//*[@id="reviewCount"]/text()').extract()
        price = response.xpath('//*[@id="avgPriceTitle"]/text()').extract()
        flavor = response.xpath(
            '//*[@id="comment_score"]/span[1]/text()').extract()
        environment = response.xpath(
            '//*[@id="comment_score"]/span[2]/text()').extract()
        service = response.xpath(
            '//*[@id="comment_score"]/span[3]/text()').extract()
        comm = response.xpath(
            '//*[@id="summaryfilter-wrapper"]/div/label/span/text()').extract(
            )
        # Debug trace of the scraped aspect scores.
        print(star, reviewCount, price, flavor, environment, service, comm)
        item = DianpingItem()
        item['shop_info'] = shop_info
        return item
Esempio n. 3
0
    def parseDetail(self, response):
        """Build a DianpingItem from the activity-detail JSON response."""
        payload = json.loads(response.body.decode(response.encoding))
        detail = payload['data']['detail']
        shop = detail['activityShopInfoList'][0]

        item = DianpingItem()
        item['id'] = detail['offlineActivityId']
        item['title'] = detail['title']
        item['cost'] = detail['cost']
        item['shopAddress'] = shop['shopAddress']
        item['distanceInfo'] = shop['distanceInfo']
        item['distance'] = shop['distance']
        item['score'] = shop['shopPower']
        item['shopName'] = shop['shopName']
        item['shopType'] = shop['shopType']

        # Tag list may be empty; fall back to neutral defaults.
        tags = detail['offlineActivityTagDTOList']
        if tags:
            item['tagId'] = tags[0]['tagId']
            item['tagName'] = tags[0]['tagName']
        else:
            item['tagId'] = 0
            item['tagName'] = ''

        # Filled in by later pipeline stages.
        item['like'] = ''
        item['apply_result'] = ''

        yield item
Esempio n. 4
0
 def parse(self, response):
     """Scrape the shop-list page: one XPath per item field, all collected
     as raw ``extract()`` lists on a single item."""
     field_xpaths = (
         ('shop_name',
          '//*[@id="shop-all-list"]/ul/li/div[2]/div[1]/a/h4/text()'),
         ('shop_city', '//*[@id="page-header"]/div[1]/a[2]/text()'),
         ('shop_address_1',
          '//*[@id="shop-all-list"]/ul/li/div[2]/div[3]/a[2]/span/text()'),
         ('shop_address_2',
          '//*[@id="shop-all-list"]/ul/li/div[2]/div[3]/span/text()'),
         ('com_num',
          '//*[@id="shop-all-list"]/ul/li/div[2]/div[2]/a[1]/b/text()'),
         ('price_avg',
          '//*[@id="shop-all-list"]/ul/li/div[2]/div[2]/a[2]/b/text()'),
         ('tag_name',
          '//*[@id="shop-all-list"]/ul/li/div[2]/div[3]/a[1]/span/text()'),
         ('kou_wei',
          '//*[@id="shop-all-list"]/ul/li/div[2]/span/span[1]/b/text()'),
         ('huan_jing',
          '//*[@id="shop-all-list"]/ul/li/div[2]/span/span[2]/b/text()'),
         ('fu_wu',
          '//*[@id="shop-all-list"]/ul/li/div[2]/span/span[3]/b/text()'),
     )
     item = DianpingItem()
     for field, xpath in field_xpaths:
         item[field] = response.xpath(xpath).extract()
     return item
Esempio n. 5
0
 def parse_detail_de(self, response):
     """Parse an education-shop detail page (possibly served behind Meituan's
     verification wall) into a DianpingItem keyed by a name/address hash."""
     name_ser = response.meta['name_ser']
     url = response.meta['url']
     print('++++++++++++++++++')
     print(url)
     print(response.url)
     if 'https://verify.meituan.com/v2/web/general_page?' in response.url:
         # Redirected to the captcha page: crack it to recover the real HTML.
         html = Crack_verification_code.Crack(response.url, url)
     else:
         html = response.body
     item = DianpingItem()
     soup = BeautifulSoup(str(html), 'lxml')
     shop_name = soup.find('div', class_="shop-name").find('h1').get_text()
     rank_level = soup.find('div', class_="rank").find('span').get('class')
     rank = soup.find('div', class_="rank").find_all('span', class_="item")
     phone = soup.find('div',
                       class_="phone").find_all('span',
                                                class_="item J-phone-hide")
     address = soup.find('div', class_="address").get_text().replace(
         '地址:', '').replace(' ', '').replace('\n', '')
     # Per-aspect scores (effect / teachers / environment) from the rank row.
     for i in rank:
         if '效果' in i.get_text():
             item['effection'] = i.get_text().replace('效果:', '')
         if '师资' in i.get_text():
             item['teachers'] = i.get_text().replace('师资:', '')
         if '环境' in i.get_text():
             item['environment'] = i.get_text().replace('环境:', '')
     # CSS class like "mid-str45" encodes the star rating * 10.
     item['star'] = str(
         int(rank_level[1].replace('mid-str', '')) / 10) + '星'
     item['name'] = shop_name
     phone_list = []
     for i in phone:
         phone_list.append(i.get('data-phone'))
     item['address'] = address
     item['phone'] = phone_list
     class_shop = soup.find_all('div', class_="item notag")
     class_list = []
     for i in class_shop:
         class_dict = {}
         class_dict['class_name'] = i.find('p', class_="title").get_text()
         class_dict['class_price'] = i.find('div', class_="price").find(
             'span', class_="cur").get_text().replace('\n',
                                                      '').replace(' ', '')
         # BUG FIX: the dict was built but never collected, so every
         # iteration's work was silently discarded.
         class_list.append(class_dict)
     # NOTE(review): class_list is still not stored on the item — confirm
     # whether DianpingItem has a field it should be assigned to.
     shop_info = soup.find('div',
                           id="info").find('ul',
                                           class_="con").find_all('li')
     for i in shop_info:
         if i.find("span", class_="title").get_text() == "商户介绍":
             item['description'] = i.get_text().replace('\r\n', '').replace(
                 ' ', '').replace('\n', '')
         if i.find("span", class_="title").get_text() == "特色服务":
             character = i.get_text().replace(' ', '').split('\n')
             item['characteristic'] = [i for i in character if i != ''][1:]
     item['_id'] = self.hash_distanct(item['name'], item['address'])
     yield item
Esempio n. 6
0
    def parse_coord(self, response):
        """Extract the telephone number and the lat/lng coordinate pair
        embedded in the page's ``window.shop`` script block."""
        item = DianpingItem()
        item.update(response.meta['detail'])
        item['tele'] = response.css(
            'p.expand-info.tel::text').extract()[1].strip()
        try:
            coord_detail = [
                x for x in response.css('script').extract()
                if 'window.shop' in x
            ][0]
            lat = re.findall('(?<=shopGlat: ").*?(?=",)', coord_detail)[0]
            # NOTE(review): lookbehinds differ ('shopGlat: "' has a space,
            # 'shopGlng:"' does not) — verify against the live page source.
            lng = re.findall('(?<=shopGlng:").*?(?=",)', coord_detail)[0]
            coord = lat + ',' + lng
        # BUG FIX: was a bare `except:`, which also swallows SystemExit and
        # KeyboardInterrupt.  Only the [0] lookups can fail here (no script
        # block / no regex match), so catch IndexError specifically.
        except IndexError:
            coord = ''
        item['coord'] = coord

        yield item
    def parse_dir_contents(self, response):
        """Build an item with the shop id, name, good-review summary tags
        and a crawl timestamp."""
        item = DianpingItem()
        # BUG FIX: raw string — '\d' in a plain literal is an invalid escape
        # (DeprecationWarning today, a SyntaxError on newer Pythons).
        # NOTE(review): under Scrapy/Py3 response.body is bytes, so a str
        # pattern would raise TypeError — confirm whether response.text is
        # intended here.
        item['shop_id'] = re.search(r'shopId=(\d+)', response.body).group(1)
        item['shop_name'] = response.xpath(
            '//*[@id="basic-info"]/h1/text()').extract()[0].strip()
        item['good_summary'] = []
        item['last_updated'] = strftime("%Y-%m-%d %H:%M:%S")
        for sel in response.xpath('//span[@class="good J-summary"]'):
            item['good_summary'].append(sel.xpath('a/text()').extract()[0])

        yield item
Esempio n. 8
0
    def parse_page(self, response):
        """Parse one review-list page: yield an item per review with the
        reviewer name, per-aspect scores, optional pre-score, comment text,
        date, and the overall star rating decoded from a CSS class name."""
        # NOTE(review): a single item instance is reused and mutated across
        # all yields — confirm downstream pipelines consume items eagerly.
        item = DianpingItem()
        print(response.url)

        for each in response.xpath(
                "//div[@class='reviews-items']/ul/li/div[@class='main-review']"
        ):
            # Reviewer display name, stripped of newlines and spaces.
            username = each.xpath(
                "./div[@class='dper-info']/a[@class='name']/text()").extract(
                )[0].replace('\n', '').replace(' ', '')
            # Aspect scores: items 1-3 are taste/environment/service.
            taste = each.xpath(
                "./div[@class='review-rank']/span[@class='score']/span[@class='item'][1]/text()"
            ).extract()[0].replace('\n', '').replace(' ', '')
            environment = each.xpath(
                "./div[@class='review-rank']/span[@class='score']/span[@class='item'][2]/text()"
            ).extract()[0].replace('\n', '').replace(' ', '')
            service = each.xpath(
                "./div[@class='review-rank']/span[@class='score']/span[@class='item'][3]/text()"
            ).extract()[0].replace('\n', '').replace(' ', '')
            # A fourth score item is optional; default to the string 'Null'.
            pre_data = each.xpath(
                "./div[@class='review-rank']/span[@class='score']/span[@class='item'][4]/text()"
            ).extract()
            if len(pre_data) > 0:
                pre = pre_data[0].replace('\n', '').replace(' ', '')
            else:
                pre = 'Null'
            # Truncated comment body; may be absent entirely.
            comment_data = each.xpath(
                "./div[@class='review-truncated-words']/text()").extract()
            if len(comment_data) > 0:
                comment = comment_data[0].replace('\n', '').replace(
                    '\t', '').replace(' ', '')
            else:
                comment = 'Null'
            cre_time = each.xpath(
                "./div[@class='misc-info clearfix']/span[@class='time']/text()"
            ).extract()[0].replace('\n', '')
            # Decode the star rating from the CSS class by stripping the
            # known substrings, spaces, and trailing '0'
            # (e.g. "sml-rank-stars sml-str40 star" -> "4" — assumes ratings
            # are multiples of ten; TODO confirm).
            star = each.xpath("./div[@class='review-rank']/span[1]/@class"
                              ).extract()[0].replace(
                                  'sml-rank-stars',
                                  '').replace('sml-str', '').replace(
                                      'star', '').replace(' ',
                                                          '').replace('0', '')
            item['username'] = username
            item['taste'] = taste
            item['environment'] = environment
            item['service'] = service
            item['pre'] = pre
            item['comment'] = comment
            item['cre_time'] = cre_time
            item['star'] = star
            print(item)
            yield item
Esempio n. 9
0
    def parse_info(self, response):
        """Collect name/tags/price/stars for each shop block, cache the
        partial item in ``self.items_buffer`` keyed by shopId, and follow
        each shop link to ``parse_details`` for completion."""
        hxs = HtmlXPathSelector(response)
        sites = hxs.select("//dd[child::ul[@class='remark']]")

        for site in sites:
            item = DianpingItem()
            item['name'] = site.select("descendant::li[@class='shopname']/a/text()").extract()
            shoplink = site.select("descendant::li[@class='shopname']/a[1]/@href").extract()
            shoplink = shoplink[0]
            # Raw string: '\d' in a plain literal is an invalid escape
            # (DeprecationWarning today, an error on newer Pythons).
            shopID = re.search(r"shopId=(\d+)#", shoplink).groups()[0]

            item['tag'] = site.select("descendant::li[@class='tags']/descendant::text()").extract()
            item['avgPrice'] = site.select("descendant::strong[@class='average']/text()").extract()
            item['stars'] = site.select("descendant::span[contains(@class,'item-rank-rst')]/@title").extract()

            # Buffer the partial item; parse_details fills in the rest later.
            self.items_buffer[shopID] = item
            # FIX (clarity): `%` previously bound only base_url and shoplink
            # was concatenated afterwards; parenthesize so the full URL goes
            # through the format in one step (same resulting string).
            log.msg("ken: yield link:%s" % (self.base_url + shoplink))
            yield Request(url=self.base_url + shoplink, callback=self.parse_details)
Esempio n. 10
0
    def parse(self, response):
        """Yield one item per ``p.desc.J-desc`` element, joining all of its
        direct text nodes into the title; link and desc are left blank."""
        for node in response.css('p.desc.J-desc'):
            item = DianpingItem()
            item['title'] = ''.join(node.xpath('./text()').extract())
            # link/desc are intentionally left empty here.
            item['link'] = ''
            item['desc'] = ''
            yield item
Esempio n. 11
0
    def parseResponse(self, response, N):
        """Yield one item per review block with its star title, cleaned
        description text, and airline heading."""
        for block in Selector(response).xpath('//li/div[2]'):
            item = DianpingItem()
            item['estar'] = block.xpath('div[1]/span/@title').extract()
            raw_desc = block.xpath('div[2]/div/text()').extract()
            # Strip markup, then drop all whitespace/escape characters.
            item['desc'] = replace_escape_chars(
                remove_tags(raw_desc[0]),
                which_ones=('\n', '\t', '\r', ' '))
            item['airline'] = block.xpath('div[3]/h2/text()').extract()
            yield item
Esempio n. 12
0
    def parse_dianping(self, response):
        item = DianpingItem()
        item['shop_name'] = response.xpath(
            '//h1[@class="shop-name"]//text()').extract()[0].strip()
        item['shop_address'] = response.xpath(
            '//div[@class="expand-info address"]//'
            'span[@itemprop="street-address"]/@title').extract()[0]
        lng_atr = response.xpath('//div[@id="aside"]/script/text()')\
            .re(r"lng:(\d*.\d*),lat:(\d*.\d*)")
        try:
            item['shop_longitude'], item['shop_latitude'] = lng_atr
        except ValueError as error:
            item['shop_longitude'], item['shop_latitude'] = 0, 0
            print "There is no longitude nor latitude of the shop!"
        item['shop_city'] = response.xpath(
            '//a[@class="city J-city"]//text()').extract()[0].strip()
        item['shop_region'] = response.xpath(
            '//span[@itemprop="locality region"]//text()').extract()[0].strip(
            )

        self.shops_count += 1
        print "%d shops are crawled." % self.shops_count
        yield item
Esempio n. 13
0
    def parseDoApply(self, response):
        """Copy the cached activityInfo onto a fresh item and attach the
        apply-result description from the JSON response."""
        payload = json.loads(response.body.decode(response.encoding))
        data = payload['data']

        activityInfo = response.meta['activityInfo']

        item = DianpingItem()
        # Mirror every cached activity field onto the item one-to-one.
        for key in ('id', 'title', 'cost', 'shopAddress', 'distanceInfo',
                    'distance', 'score', 'shopName', 'shopType', 'tagId',
                    'tagName', 'like'):
            item[key] = activityInfo[key]
        item['apply_result'] = data['desc']

        yield item
Esempio n. 14
0
    def parse_comment(self, response):
        """Parse a restaurant's comment page.

        Resolves area/category names from the breadcrumb, then either yields
        a single placeholder item (no comments) or one item per customer
        comment, and finally follows the next-page link recursively.
        Restaurant-level fields arrive via response.meta.
        """
        item = DianpingItem()
        # Whitelist of level-A cuisine categories, used below to decide which
        # breadcrumb slot holds the level-A vs level-B category.
        categoryLevelA_range = ['面包甜点', '自助餐', '咖啡厅', '西餐', '台湾菜', '贵州菜', '江西菜', '东南亚菜', '其他', '俄罗斯菜', '新疆菜', '粤菜', '素菜', '日本料理', '日本菜', '云贵菜', '小吃快餐', '家常菜', '私房菜', '串串香', '本帮江浙菜', '江浙菜', '苏州江浙', '烧烤', '烤鱼', '鲁菜', '客家菜', '南京/江浙菜', '蟹宴', '茶馆', '创意菜', '面馆', '酒吧', '北京菜', '快餐简餐', '小吃', '海鲜', '火锅', '湘菜', '川菜', '兔头/兔丁', '西北菜', '粥粉面', '云南菜', '粤菜/潮州菜', '东北菜', '农家菜', '小龙虾', '大闸蟹', '粉面馆', '湖北菜', '杭帮/江浙菜', '茶餐厅', '徽菜', '闽菜', '韩国料理']
        customers = response.xpath('//div[@class = "comment-list"]/ul/li')
        crumb = response.xpath('//div[@class = "crumb"]//li')
        # Breadcrumb layout varies with its length; pick slots accordingly.
        if len(crumb) == 7:
            areaName = crumb[2].xpath('strong//span/text()').extract()[0]
            categoryLevelA = crumb[3].xpath('strong//span/text()').extract()[0]
            categoryLevelB = crumb[4].xpath('strong//span/text()').extract()[0]
        elif len(crumb) == 6:
            # Six crumbs: slot -3 may be either level A or level B; use the
            # whitelist above to disambiguate.
            categoryLevelA = crumb[-3].xpath('strong//span/text()').extract()[0]
            if categoryLevelA in categoryLevelA_range:
                categoryLevelA = categoryLevelA
                categoryLevelB = categoryLevelA
                areaName = crumb[-4].xpath('strong//span/text()').extract()[0]
            else:
                categoryLevelB = categoryLevelA
                categoryLevelA = crumb[-4].xpath('strong//span/text()').extract()[0]
                areaName = crumb[-5].xpath('strong//span/text()').extract()[0]
        else:
            areaName = crumb[1].xpath('strong//span/text()').extract()[0]
            categoryLevelA = crumb[-3].xpath('strong//span/text()').extract()[0]
            categoryLevelB = categoryLevelA
        if len(customers) == 0:
            # No comments on this page: emit one placeholder row with empty
            # comment-level fields so the restaurant is still recorded.
            item['distinctName'] = response.meta['distinctName']
            item['cityName'] = response.meta['cityName']
            item['areaName'] = areaName
            item['categoryLevelA'] = categoryLevelA
            item['categoryLevelB'] = categoryLevelB
            item['restaurantName'] = response.meta['restaurantName']
            item['restaurantStar'] = response.meta['restaurantStar']
            item['scoreOfTaste'] = response.meta['scoreOfTaste']
            item['scoreOfEnvironment'] = response.meta['scoreOfEnvironment']
            item['scoreOfService'] = response.meta['scoreOfService']
            item['averageCost'] = response.meta['averageCost']
            item['restaurantAddress'] = response.meta['restaurantAddress']
            item['restaurantTel'] = response.meta['restaurantTel']
            item['commentCount'] = response.meta['commentCount']
            item['commentSum'] = response.meta['commentSum']
            item['rankTotal_5_Count'] = ''
            item['rankTotal_4_Count'] = ''
            item['rankTotal_3_Count'] = ''
            item['rankTotal_2_Count'] = ''
            item['rankTotal_1_Count'] = ''
            item['customerName'] = ''
            item['customerLevel'] = ''
            item['customerVIP'] = ''
            item['commRankTotal'] = ''
            item['commRankTaste'] = ''
            item['commRankEnvironment'] = ''
            item['commRankService'] = ''
            item['commCostPer'] = ''
            item['commentContent'] = ''
            item['commentDate'] = ''
            item['commentLiked'] = ''
            yield item
        else:
            # Star-distribution counts: dd[2]..dd[6] are 5-star down to
            # 1-star; the "(N)" text is stripped of parentheses and parsed.
            rankTotal_5_Count = response.xpath('//div[@class = "comment-star"]/dl/dd[2]//text()').extract()[1]
            rankTotal_5_Count = int(rankTotal_5_Count.replace('(','').replace(')',''))
            rankTotal_4_Count = response.xpath('//div[@class = "comment-star"]/dl/dd[3]//text()').extract()[1]
            rankTotal_4_Count = int(rankTotal_4_Count.replace('(','').replace(')',''))
            rankTotal_3_Count = response.xpath('//div[@class = "comment-star"]/dl/dd[4]//text()').extract()[1]
            rankTotal_3_Count = int(rankTotal_3_Count.replace('(','').replace(')',''))
            rankTotal_2_Count = response.xpath('//div[@class = "comment-star"]/dl/dd[5]//text()').extract()[1]
            rankTotal_2_Count = int(rankTotal_2_Count.replace('(','').replace(')',''))
            rankTotal_1_Count = response.xpath('//div[@class = "comment-star"]/dl/dd[6]//text()').extract()[1]
            rankTotal_1_Count = int(rankTotal_1_Count.replace('(','').replace(')',''))
            for customer in customers:
                distinctName = response.meta['distinctName']
                cityName = response.meta['cityName']
                areaName = areaName
                categoryLevelA = categoryLevelA
                categoryLevelB = categoryLevelB
                restaurantName = response.meta['restaurantName']
                restaurantStar = response.meta['restaurantStar']
                scoreOfTaste = response.meta['scoreOfTaste']
                scoreOfEnvironment = response.meta['scoreOfEnvironment']
                scoreOfService = response.meta['scoreOfService']
                averageCost = response.meta['averageCost']
                restaurantAddress = response.meta['restaurantAddress']
                restaurantTel = response.meta['restaurantTel']
                commentCount = response.meta['commentCount']
                commentSum = response.meta['commentSum']

                customerName = customer.xpath('div[1]//p[@class = "name"]/a/text()').extract()[0]
                # Map the contribution-title text onto a 1-6 level by the
                # threshold number it contains; unknown titles become ''.
                customerLevel = customer.xpath('div[1]//p[@class = "contribution"]/span/@title').extract()[0]
                if customerLevel == '':
                    customerLevel = 1
                elif '200' in customerLevel:
                    customerLevel = 2
                elif '400' in customerLevel:
                    customerLevel = 3
                elif '1000' in customerLevel:
                    customerLevel = 4
                elif '2000' in customerLevel:
                    customerLevel = 5
                elif '5000' in customerLevel:
                    customerLevel = 6
                else:
                    customerLevel = ''
                # VIP flag: 1 when the vip icon is present, else 0.
                customerVIP = customer.xpath('div[1]//i[@class = "icon-vip"]').extract()
                if len(customerVIP) != 0:
                    customerVIP = 1
                else:
                    customerVIP =0
                # NOTE(review): the bare excepts below default missing
                # ratings/prices to '' — broad, but apparently intentional.
                try:
                    commRankTotal = customer.xpath('div[2]//div[@class = "user-info"]/span[1]/@class').extract()[0]
                    commRankTotal = int(commRankTotal[-2])
                except:
                    commRankTotal = ''
                rankList = customer.xpath('div[2]//div[@class = "comment-rst"]/span/text()').extract()
                try:
                    commRankTaste = int(rankList[0][-1])
                    commRankEnvironment = int(rankList[1][-1])
                    commRankService = int(rankList[2][-1])
                except:
                    commRankTaste = ''
                    commRankEnvironment = ''
                    commRankService = ''
                try:
                    commCostPer = customer.xpath('div[2]//div[@class = "user-info"]/span[2]/text()').extract()[0]
                    commCostPer = int(re.match(r'.*?(\d+)', commCostPer).group(1))
                except:
                    commCostPer = ''
                commentContent = ('\n'.join(customer.xpath('div[2]//div[@class = "J_brief-cont"]//text()').extract())).strip()
                commentDate = customer.xpath('div[2]//div[@class = "misc-info"]/span[@class = "time"]/text()').extract()[0]
                commentLiked = customer.xpath('div[2]//span[@class = "col-right"]/span[1]/a/span/text()').extract()
                try:
                    commentLiked = commentLiked[1].replace('(','').replace(')','')
                    commentLiked = int(commentLiked)
                except:
                    commentLiked = ''
                item['cityName'] = cityName
                item['distinctName'] = distinctName
                item['areaName'] = areaName
                item['categoryLevelA'] = categoryLevelA
                item['categoryLevelB'] = categoryLevelB
                item['restaurantName'] = restaurantName
                item['restaurantStar'] = restaurantStar
                item['scoreOfTaste'] = scoreOfTaste
                item['scoreOfEnvironment'] = scoreOfEnvironment
                item['scoreOfService'] = scoreOfService
                item['averageCost'] = averageCost
                item['restaurantAddress'] = restaurantAddress
                item['restaurantTel'] = restaurantTel
                item['commentCount'] = commentCount
                item['commentSum'] = commentSum
                item['rankTotal_5_Count'] = rankTotal_5_Count
                item['rankTotal_4_Count'] = rankTotal_4_Count
                item['rankTotal_3_Count'] = rankTotal_3_Count
                item['rankTotal_2_Count'] = rankTotal_2_Count
                item['rankTotal_1_Count'] = rankTotal_1_Count
                item['customerName'] = customerName
                item['customerLevel'] = customerLevel
                item['customerVIP'] = customerVIP
                item['commRankTotal'] = commRankTotal
                item['commRankTaste'] = commRankTaste
                item['commRankEnvironment'] = commRankEnvironment
                item['commRankService'] = commRankService
                item['commCostPer'] = commCostPer
                item['commentContent'] = commentContent
                item['commentDate'] = commentDate
                item['commentLiked']= commentLiked
                yield item
            try:
                # On re-entry response.url carries query params; strip them
                # with the regex to recover the original base url before
                # appending the next-page href.
                origin_url = re.match(r'(http:.*?more)', response.url).group(1)
                nextlink = origin_url + response.xpath('//div[@class = "Pages"]/a[last()]/@href').extract()[0]
                yield Request(nextlink, meta = {
                'distinctName': distinctName,
                'cityName': cityName,
                #'areaName': areaName,
                #'categoryLevelA': categoryLevelA,
                #'categoryLevelB': categoryLevelB,
                'restaurantName': restaurantName,
                'restaurantStar': restaurantStar,
                'scoreOfTaste': scoreOfTaste,
                'scoreOfEnvironment': scoreOfEnvironment,
                'scoreOfService': scoreOfService,
                'averageCost': averageCost,
                'restaurantAddress': restaurantAddress,
                'restaurantTel': restaurantTel,
                'commentCount': commentCount,
                'commentSum': commentSum},
                              callback = self.parse_comment)
            except:
                pass
Esempio n. 15
0
    def parseDetail(self, response):
        """Score an activity with predict(); if liked and already applied,
        emit a success item, otherwise issue the pre-apply request."""
        payload = json.loads(response.body.decode(response.encoding))
        detail = payload['data']['detail']
        shop = detail['activityShopInfoList'][0]

        # Resolve the tag once; the original computed it twice.
        tag_list = detail['offlineActivityTagDTOList']
        if len(tag_list) > 0:
            tagId = tag_list[0]['tagId']
            tagName = tag_list[0]['tagName']
        else:
            tagId = 0
            tagName = ''

        # Feature vector fed to the classifier.
        features = {
            'cost': detail['cost'],
            'distance': shop['distance'],
            'score': shop['shopPower'],
            'tagId': tagId,
        }

        activityInfo = {
            'id': detail['offlineActivityId'],
            'title': detail['title'],
            'cost': detail['cost'],
            'shopAddress': shop['shopAddress'],
            'distanceInfo': shop['distanceInfo'],
            'distance': shop['distance'],
            'score': shop['shopPower'],
            'shopId': shop['shopId'],
            'shopName': shop['shopName'],
            'shopType': shop['shopType'],
            'tagId': tagId,
            'tagName': tagName,
        }

        like = predict(features)
        applyed = response.meta['applyed']

        activityInfo['like'] = like

        if like == 1:
            if applyed:
                print('applyed ' + activityInfo['shopName'])
                item = DianpingItem()
                for key in ('id', 'title', 'cost', 'shopAddress',
                            'distanceInfo', 'distance', 'score', 'shopName',
                            'shopType', 'tagId', 'tagName', 'like'):
                    item[key] = activityInfo[key]
                item['apply_result'] = '成功'

                yield item
            else:
                yield self.requestGetPreApply(activityInfo)
Esempio n. 16
0
    def parse(self, response):
        item = DianpingItem()

        sel = Selector(response)
        sites = sel.xpath('//div[@id="shop-all-list"]/ul/li')
        for site in sites:
            title = site.xpath('div[2]/div[1]/a[1]/h4/text()').extract()
            item['shopname'] = title[0]
            print title[0]

            link = site.xpath('div[2]/div[1]/a[1]/@href').extract()
            item['shopurl'] = 'http://www.dianping.com' + str(link[0])
            print 'http://www.dianping.com' + str(link[0])

            shoplevels = site.xpath('div[2]/div[2]/span/@title').extract()
            item['shoplevel'] = shoplevels[0]

            reviewnums = site.xpath('div[2]/div[2]/a[1]/b/text()').extract()
            if len(reviewnums) > 0:
                item['reviewnum'] = reviewnums[0]
            else:
                item['reviewnum'] = '0'

            avgcost = site.xpath('div[2]/div[2]/a[2]/b/text()').extract()
            if len(avgcost) > 0:
                #print avgcost[0]
                #print avgcost[0].lstrip('¥')
                #print int(avgcost[0].lstrip('¥'))
                #item['avgcost'] = avgcost[0]
                item['avgcost'] = int(avgcost[0].lstrip('¥'))
            else:
                item['avgcost'] = '0'

            tastes = site.xpath('div[2]/span/span[1]/b/text()').extract()
            if len(tastes) > 0:
                item['taste'] = tastes[0]
            else:
                item['taste'] = '0'

            envs = site.xpath('div[2]/span/span[2]/b/text()').extract()
            if len(envs) > 0:
                item['env'] = envs[0]
            else:
                item['env'] = '0'

            services = site.xpath('div[2]/span/span[3]/b/text()').extract()
            if len(services) > 0:
                item['service'] = services[0]
            else:
                item['service'] = '0'

            foodtypes = site.xpath('div[2]/div[3]/a[1]/span/text()').extract()
            item['foodtype'] = foodtypes[0]

            location = site.xpath('div[2]/div[3]/a[2]/span/text()').extract()
            item['location'] = location[0]

            yield item

        nextLink = site.xpath(
            '//div[@class="page"]/a[last()]/@data-ga-page').extract()
        print '++++++++++++++++++++++++++++++++++++++++++++++'
        print nextLink

        if nextLink:
            print nextLink[0]
            nextLink = 'http://www.dianping.com/search/category/12/10/o3p' + nextLink[
                0]
            #reallink = str(response.url)
            print nextLink
            #reallink = nextLink
            yield Request(nextLink, headers=self.headers)