Beispiel #1
0
def parse():
    """Fetch the start page, collect every airport link and parse each one."""
    start_response = request.get(start_url)
    if not start_response:
        return
    page = etree.HTML(start_response.content)
    for link in page.xpath("//div[@class='link-n clearfix']/a/@href"):
        get_airport(request.get(link))
Beispiel #2
0
def parse():
    """Crawl the OpenRice district filter buttons, then page through each
    district's cuisine listings (pages 1-17) and hand every unseen shop to
    shop_detail().

    Relies on module-level globals: request, openrice_url,
    openrice_cuisine_url, item (template dict), and the helpers exist,
    get_node_list, get_book_status, handle_emjoy, shop_detail.
    """
    response = request.get(openrice_url)
    if response:
        html = etree.HTML(response.content)
        # District names come from the filter buttons' data-tag attributes.
        districts = html.xpath(
            "//div[@class='flex-wrap js-flex-wrap']/div[contains(@class, 'btn')]/@data-tag"
        )
        # NOTE(review): the [94:167] slice looks like a manual resume point
        # from an earlier run — confirm before re-running from scratch.
        for district in districts[94:167]:
            print('%s---开始' % district)
            for i in range(1, 18):
                # Skip the aggregate "所有..." ("all ...") pseudo-district.
                # (Checked inside the page loop, so it continues 17 times
                # per skipped district — harmless but wasteful.)
                if district.startswith('所有'):
                    continue
                # Normalise e.g. 'Foo (Bar)' to 'Foo-Bar' for the URL slug;
                # idempotent, so re-running it each page is safe.
                district = district.replace(' (', '-').replace(')', '')
                url = openrice_cuisine_url % (district, i)
                response = request.get(url)
                if response:
                    html = etree.HTML(response.content)
                    shops = html.xpath("//div[@class='content-cell-wrapper']")
                    for shop in shops:
                        # Fresh copy of the module-level template dict.
                        shop_item = item.copy()
                        name = shop.xpath(
                            ".//h2[@class='title-name']/a/text()")[0]
                        if exist(name):
                            print(name + '已经存在')
                            continue
                        recomm_dish = get_node_list(
                            shop.xpath(
                                ".//ul[@class='dish-list']/li[@class='dish']/text()"
                            ))
                        book_status = get_book_status(
                            shop.xpath(
                                ".//a[@class='info-button info-offer-button']/text()"
                            ))
                        preferential = get_node_list(
                            shop.xpath(
                                ".//span[@class='info-text info-offer-text']/text()"
                            ))
                        shop_url = shop.xpath(
                            ".//h2[@class='title-name']/a/@href")[0]
                        shop_item['vendor_name'] = name
                        shop_item['recomm_dish'] = recomm_dish
                        shop_item['book_status'] = book_status
                        shop_item['preferential'] = handle_emjoy(preferential)
                        shop_detail(shop_url, shop_item)
                    print('第%s页完成' % i)
                else:
                    print('请求失败%s' % url)

            print('%s完成' % district)
Beispiel #3
0
def get_business_hours(shop_id):
    """Fetch a shop's opening-hour info from the OpenRice status API.

    :param shop_id: OpenRice poiId
    :return: dict with 'business_hours' (raw payload) and, when any time
        slot was found, 'open_time' / 'close_time' ("HH:MM" strings);
        None when the request failed.
    """
    url = 'https://www.openrice.com/api/poi/status?uiCity=hongkong&uiLang=zh-cn&poiId=%s' % shop_id
    response = request.get(url)
    if response:
        opening_info = {}
        business_hours = json.loads(response.content)
        opening_info['business_hours'] = business_hours
        open_times = set()
        if business_hours:
            for slot in business_hours.get('openingHourInfo',
                                           {}).get("normalHours", []):
                for i in slot.get('times', {}):
                    time_list = i['timeDisplayString'].split('-')
                    if len(time_list) > 1:
                        start, end = time_list
                        if end == '00:00':
                            # Midnight close sorts as end-of-day, not start.
                            end = '24:00'
                        open_times.update([start, end])
        if open_times:
            # BUG FIX: the old code compared floats built via
            # '11:30' -> 11.3, then str(...).replace('.', ':') produced
            # mangled results like '11:3'. Compare as (hour, minute) ints
            # and return the original "HH:MM" strings unchanged.
            def as_hm(t):
                h, m = t.split(':')
                return int(h), int(m)

            opening_info['open_time'] = min(open_times, key=as_hm)
            opening_info['close_time'] = max(open_times, key=as_hm)
        return opening_info
Beispiel #4
0
def get_shop_url(url, name, headers):
    """Fetch a dianping search page and print the first hit's summary fields.

    :param url: search-result page URL
    :param name: expected shop name (captured but not compared — the
        comparison survives only as the commented-out condition below)
    :param headers: request headers (cookie / anti-bot fields)
    """
    response = request.get(url, headers=headers)
    if not response:
        return
    html = etree.HTML(response.content)
    shop_names = html.xpath("//div[@class='txt']/div[@class='tit']/a")
    # and shop_names[0].xpath('h4/text()')[0] == name
    if not shop_names:
        return
    title = shop_names[0].xpath('h4/text()')[0]
    shop_url = shop_names[0].xpath('@href')[0]
    # Each query below is evaluated once; the old code ran every xpath twice
    # (once in the condition, once in the expression).
    avg_price_nodes = html.xpath("//a[@class='mean-price']/b/text()")
    avg_price = avg_price_nodes[0] if avg_price_nodes else ''
    # ROBUSTNESS FIX: the old code tested the *element* list but indexed the
    # text() list, raising IndexError for tags that exist but are empty.
    cuisine_nodes = html.xpath("//span[@class='tag']/text()")
    cuisine = cuisine_nodes[0] if cuisine_nodes else ''
    tag_addr_nodes = html.xpath("//div[@class='tag-addr']/a")
    if len(tag_addr_nodes) > 2:
        tag_addr = tag_addr_nodes[1].xpath("span[@class='tag']/text()")[0]
    else:
        tag_addr = ''
    addr_nodes = html.xpath("//div[@class='tag-addr']/span[@class='addr']/text()")
    addr = addr_nodes[0] if addr_nodes else ''
    pref_nodes = html.xpath("//div[@class='svr-info']/a[contains(@class,'tuan')]/text()")
    preferences = pref_nodes[0] if pref_nodes else ''

    item = {
        'title': title,
        'shop_url': shop_url,
        'avg_price': avg_price,
        'cuisine': cuisine,
        'tag-addr': tag_addr,
        'addr': addr,
        'preferences': preferences
    }
    print(item)
Beispiel #5
0
def get_json(type,kw,url):
    """GET *url* and return the parsed JSON body, or None on request failure.

    NOTE: `type` and `kw` are accepted for call-site compatibility but are
    not used; `params` is always the empty string.
    """
    params = ''
    response = request.get(url, params=params)
    return response.json() if response else None
def parse(item):
    """Search ctrip for item['vendor_name']; when the first result's name
    contains it, crawl the shop detail page.

    The address comparison only decides where the description is stored
    (the `flag` argument of get_shop_detail); a name mismatch aborts.
    Both sides are lower-cased and converted Traditional->Simplified
    before comparison.
    """
    url = ctrip_search_shop_url % item['vendor_name']
    response = request.get(url)
    if response:
        html = etree.HTML(response.content)
        box = html.xpath("//div[@class='rdetailbox']")
        if box:
            # Only the first result box is considered.
            box = box[0]
            # .decode('utf-8') implies Python 2 byte strings from lxml —
            # TODO confirm before porting to Python 3.
            name = box.xpath("./dl/dt/a/text()")[0].decode('utf-8').lower()
            name = switch_lang.Traditional2Simplified(name)
            address = box.xpath("./dl/dd[@class='ellipsis']/text()")[0].decode('utf-8').lower()
            address = switch_lang.Traditional2Simplified(address)
            check_name = item['vendor_name'].lower()
            check_name = switch_lang.Traditional2Simplified(check_name)
            check_address = item['address'].lower()
            check_address = switch_lang.Traditional2Simplified(check_address)
            # Stored even when the name check below fails the match.
            item['ctrip_address'] = address
            if name.find(check_name) == -1:
                return
            # flag == True only when the stored address appears in ctrip's.
            if address.find(check_address) == -1:
                flag = False
            else:
                flag = True
            shop_url = box.xpath("./dl/dt/a/@href")[0]
            get_shop_detail(shop_url, item, flag)
    else:
        print('请求失败:%s'%url)
Beispiel #7
0
def get_positons_list(url, item, cookies):
    """Crawl one lagou position-listing URL and all its numbered pages.

    Bails out when the page title shows we were bounced back to the search
    home (anti-bot redirect). Page 1 is the URL itself; pages 2..N are
    '<url><n>/'.
    """
    response = request.get(url, cookies=cookies)
    if not response:
        return
    html = etree.HTML(response.content)
    # Redirect to the generic search page means the listing was not served.
    if html.xpath('//title/text()')[0] == '找工作-互联网招聘求职网-拉勾网':
        print(url + '  error ')
        return
    get_positions_urls(response, item, cookies)
    # Reuse the document parsed above — the old code parsed
    # response.content a second time for no reason.
    page_num = html.xpath("//span[@class='span totalNum']/text()")
    page_num = int(page_num[0]) if page_num else 1
    # range(2, 2) is empty, so no explicit page_num > 1 guard is needed.
    for num in range(2, page_num + 1):
        list_url = '%s%d/' % (url, num)
        get_positions_urls(request.get(list_url, cookies=cookies), item, cookies)
Beispiel #8
0
def get_positions_urls(response,item,cookies):
    """Extract every position on a lagou listing page, fill *item* with its
    fields and crawl the detail page for positions not yet stored.

    NOTE(review): *item* is mutated in place and reused across loop
    iterations — downstream get_position_detail presumably copies or saves
    it before the next overwrite; confirm.
    """
    if response:
        html = etree.HTML(response.content)
        print(html.xpath('//title/text()')[0] if html.xpath('//title/text()') else 'title error')
        item_list = html.xpath("//ul[@class='item_con_list']/li")
        for position in item_list:
            publish_date = position.xpath(".//span[@class='format-time']/text()")[0]
            publish_date = switch_publish_date(publish_date)
            url = position.xpath(".//a[@class='position_link']/@href")[0]
            # Skip positions whose url is already stored.
            if not db_operate.isexist_url(url):
                position_name = position.xpath("@data-positionname")[0]
                salary = position.xpath("@data-salary")[0]
                # Third text node of li_b_l holds "经验/学历"-style info.
                other = position.xpath(".//div[@class='li_b_l']/text()")[2].strip()
                add = position.xpath(".//span[@class='add']/em/text()")[0]
                # Location is rendered as "city·district".
                city = add.split('·')[0]
                company_name = position.xpath("@data-company")[0]
                item['position_name'] = position_name
                item['publish_date'] = publish_date
                item['salary'] = salary
                item['education'] = other.split('/')[1]
                # Strip the two-character label prefix before the years value.
                item['work_year'] = other.split('/')[0][2:]
                item['city'] = city
                item['company_name'] = company_name
                item['url'] = url
                # Rebinds the local `response` — safe, it is not used again.
                response = request.get(url,cookies=cookies)
                get_position_detail(response,item)
            else:
                print('此url%s已经存在!'%url)
def parse():
    """Page through qunar food listings (pages 1000-3299) and crawl each
    shop that is not already stored.

    Uses module globals: request, quna_url, item (template dict), exist,
    get_sub_info, shop_detail.
    """
    for page in range(1000, 3300):
        print('%s页---开始' % page)
        page_url = quna_url % page
        response = request.get(page_url)
        if response:
            html = etree.HTML(response.content)
            shop_list = html.xpath(
                "//ul[@class='list_item clrfix']/li[@class='item']")
            for shop in shop_list:
                shop_item = item.copy()
                name = shop.xpath(".//span[@class='cn_tit']/text()")[0]
                # BUG FIX: the score xpath was absolute ('//span...'), so it
                # matched the FIRST score on the whole page for every shop;
                # './/' scopes it to the current <li>.
                score_details = shop.xpath(
                    ".//span[@class='cur_score']/text()")[0]
                if exist(name):
                    print(name + '已经存在')
                    continue
                sub_info = shop.xpath(
                    ".//div[@class='sublistbox']/dl[@class='sublist_item clrfix']"
                )
                get_sub_info(sub_info, shop_item)
                shop_url = shop.xpath("./a[@data-beacon='poi']/@href")[0]
                shop_item['vendor_name'] = name
                shop_item['score_details'] = score_details
                shop_item['vendor_url'] = shop_url
                shop_detail(shop_url, shop_item)
            print('第%s页完成' % page)
        else:
            # BUG FIX: previously printed shop_url here, which is undefined
            # when the very first request fails (NameError) and stale
            # otherwise; report the URL that actually failed.
            print('请求失败%s' % page_url)
Beispiel #10
0
def start_spider():
    """Warm up the session on start_url, then crawl one search page per name.

    NOTE(review): `names.next()` (Python 2 iterator protocol) consumes and
    discards the FIRST element of `names` before the for-loop walks the
    rest — presumably a deliberate offset/skip; confirm. The parsed `html`
    is never used.
    """
    response = request.get(start_url)
    if response:
        html = etree.HTML(response.content)
        # First name is pulled and immediately overwritten by the loop below.
        name = names.next()
        # Appends a per-run start number to the session cookie string.
        headers['Cookie'] = headers['Cookie'] + str(start_num.next())
        for name in names:
            get_shop_url(second_url % name, name, headers)
    else:
        print(response)
Beispiel #11
0
def get_dish(city, shop_id):
    """Fetch and print the ';'-joined dish tag list for one shop.

    :param city: dict with 'code' (cityId) and 'name' (cityEnName) keys
    :param shop_id: dianping shop id
    """
    # NOTE: the backslash continuation bakes the next line's leading spaces
    # into the URL before '&shopId' — kept byte-identical on purpose.
    dish_url = 'http://www.dianping.com/overseas/shop/ajax/allReview?categoryURLName=food&power=5&shopType=10\
    &shopId=%s&cityId=%s&cityEnName=%s' % (shop_id, city.get('code'),
                                           city.get('name'))
    response = request.get(dish_url)
    # ROBUSTNESS FIX: the request helper returns a falsy value on failure;
    # the old code dereferenced response.text unconditionally (AttributeError).
    if not response:
        return
    body = json.loads(response.text)
    dish = body.get('dishTagStrList', [])
    # The key can be present but explicitly null in the payload;
    # `is None` replaces the old `== None` comparison.
    if dish is None:
        dish = []
    dish = ';'.join(dish)
    print(dish)
def get_dianping_city():
    """Download the dianping city map, cache it to disk and return a
    {cityName: cityId} dict (None when the request failed)."""
    response = request.get(city_list_url)
    if not response:
        return
    citys = json.loads(response.content)
    # Keep a pretty-printed local snapshot for inspection / offline reuse.
    with open('data/dianping_city.json','w') as f:
        f.write(json.dumps(citys,ensure_ascii=False,indent=2))
    return {
        city.get('cityName'): city.get('cityId')
        for group in citys.get('cityMap').values()
        for city in group
    }
Beispiel #13
0
def tripadvisor(item):
    """Enrich *item* with tripadvisor data (cuisine / character / ambience)
    when both the name and the address match, then persist it.

    save_date(item) runs unconditionally at the end — the item is saved
    even when the tripadvisor lookup failed or did not match.
    """
    url = tripadvisor_url % item['vendor_name']
    response = request.get(url)
    if response:
        html = etree.HTML(response.content)
        title = html.xpath("//div[@class='title']/span/text()")
        address = html.xpath("//div[@class='address']/text()")
        if title and address:
            name = item['vendor_name'].lower()
            # '楼' (floor marker) is dropped to loosen the address match.
            address2 = item['address'].lower().replace('楼', '')
            if title[0].lower().find(name) == -1 or address[0].lower().find(
                    address2) == -1:
                # Name or address mismatch: keep the item untouched.
                pass
            else:
                # The detail URL is the 4th quoted argument of the
                # element's onclick handler.
                url = html.xpath("//div[@class='title']/@onclick")
                url = re.findall(r'\'(.*?)\'', url[0])[3]
                url = tripadvisor_host + url
                response = request.get(url)
                if response:
                    html = etree.HTML(response.content)
                    rows = html.xpath("//div[@class='row']")
                    for row in rows:
                        row_title = row.xpath("./div[@class='title']/text()")
                        if row_title:
                            row_title = row_title[0].strip()
                        row_content = row.xpath(
                            "./div[@class='content']//text()")
                        row_content = handle_node(row_content)
                        # Row labels: 菜系=cuisine, 餐厅特点=characteristics,
                        # 就餐氛围=dining atmosphere.
                        if row_title == '菜系':
                            item['tripadvisor_cuisine'] = row_content
                        elif row_title == '餐厅特点':
                            item['tripadvisor_character'] = row_content
                        elif row_title == '就餐氛围':
                            item['environment'] = row_content

                    item['tripadvisor_url'] = url
                else:
                    print('请求失败' + url)
    else:
        print('请求失败' + url)
    save_date(item)
def update_price():
    """Refresh prices for every vendor_miqilin_price row from dianping's
    review/star API.

    For each vendor: derive the dianping shop id from its URL, look up the
    city code, call the avg-price endpoint, then either store the price
    (flag = 0) or mark the row as failed (flag = 1).
    """
    # (Several superseded one-off reconciliation queries that lived here as
    # commented-out code have been removed.)
    sql1 = "select vendor_id,name,vendor_url,vendor_city from vendor_miqilin_price"
    results = dbmysql.fetchall(sql1)
    print(len(results))
    city_list = get_dianping_city()
    # Overseas cities missing from dianping's city map — hard-coded ids.
    city_list[u'纽约'] = 2395
    city_list[u'伦敦'] = 2464
    for item in results:
        shop_url = item[2]
        # Shop id is the last path segment of the vendor URL.
        shop_id = shop_url.split('/')[-1]
        city_code = city_list.get(item[3])
        avg_price_url = 'http://www.dianping.com/overseas/shop/ajax/reviewAndStar?shopId=%s&cityId=%s\
&mainCategoryId=102' % (shop_id,city_code)
        response = request.get(avg_price_url)
        price = get_avg_price(response)
        if price:
            print(price)
            sql3 = "update vendor_miqilin_price set price = :price,flag = 0 where vendor_id=:vendor_id"
            dbmysql.edit(sql3, params={'price': price, 'vendor_id': item[0]})
        else:
            # flag = 1 marks the row so failures can be retried later.
            sql3 = "update vendor_miqilin_price set flag = 1 where vendor_id=:vendor_id"
            dbmysql.edit(sql3, params={'vendor_id': item[0]})
            print(str(item[0]) + ' ' + item[1])
Beispiel #15
0
def get_dish(url, item, headers, cookies):
    """Fetch the dish-tag list from *url* and store it on *item*.

    :param url: allReview ajax endpoint for one shop
    :param item: mutable dict; item['dish'] is set to a ';'-joined string
    :param headers: request headers
    :param cookies: session cookies
    :return: the same *item* (mutated in place)
    """
    response = request.get(url, headers=headers, cookies=cookies)
    body = json.loads(response.text)
    dish = body.get('dishTagStrList', [])
    # The key can be present but explicitly null in the payload;
    # `is None` replaces the old `== None` comparison.
    if dish is None:
        dish = []
    dish = ';'.join(dish)
    item['dish'] = dish
    return item
def shop_detail(url, item):
    """Scrape description / business hours / phone from the shop page into
    *item*, then persist it (the item is saved even on request failure)."""
    response = request.get(url)
    if response:
        doc = etree.HTML(response.content)
        # (field name, xpath) pairs extracted the same way.
        extractions = (
            ('description', "//div[@class='e_db_content_box']/p/text()"),
            ('business_hours', "//dl[@class='m_desc_right_col']/dd/span/p/text()"),
            ('phone', "//td[@class='td_l']/dl[2]/dd/span/text()"),
        )
        for field, query in extractions:
            item[field] = handle_node(doc.xpath(query))
    else:
        print('请求失败%s' % url)
    save_date(item)
Beispiel #17
0
def get_shop(url, item, cookies, city):
    """Open a dianping search page; when the first hit's title equals
    item['name'], descend into the shop page, otherwise save the bare item."""
    response = request.get(url, headers=headers, cookies=cookies)
    if response:
        doc = etree.HTML(response.text)
        titles = doc.xpath("//div[@class='tit']/a/@title")
        first_title = titles[0] if titles else ''
        if first_title == item['name']:
            links = doc.xpath("//div[@class='tit']/a/@href")
            if links:
                shop_url = links[0]
                item['url'] = shop_url
                # Shop id is the last path segment of the shop URL.
                get_shopinfo(shop_url, item, response.cookies, city,
                             shop_url.split('/')[-1])
                return
    # No match / no link / failed request: persist the item as-is.
    save(item)
def get_shop_detail(url, item, flag):
    """Scrape a ctrip restaurant page into *item* and persist it.

    flag=True means the address matched, so the description is stored under
    'description'; otherwise it goes to 'ctrip_description'.
    """
    url = ctrip_host + url
    response = request.get(url)
    if not response:
        print('shop 请求失败%s'%url)
        return
    doc = etree.HTML(response.content)
    description = get_one(doc.xpath("//div[@itemprop='description']/text()"))
    dish = get_shop_dish(doc.xpath("//div[@class='text_style']/p/text()"))
    price = get_one(doc.xpath("//em[@class='price']/text()")).replace('¥','')
    description_key = 'description' if flag else 'ctrip_description'
    item[description_key] = description
    item['ctrip_dish'] = switch_lang.Traditional2Simplified(dish)
    item['ctrip_url'] = url
    item['ctrip_price'] = price
    save_data(item)
Beispiel #19
0
def update_latlng():
    """Geocode every vendor's address through the Baidu API and store the
    resulting lat/lng; mark failures with lat_flag = 1.

    NOTE(review): rows are selected from vendor_miqilin_hongkong but the
    updates target vendor_miqilin — confirm both tables share vendor_id.
    """
    # 'where vendor_id' is a truthiness filter: rows with 0/NULL ids drop out.
    sql = 'select * from vendor_miqilin_hongkong where vendor_id'
    data = dbmysql.fetchall(sql)
    for item in data:
        address = item['address']
        # City is hard-coded to 香港 (Hong Kong) for this pass.
        result = request.get(baidu_address%(address,'香港'))
        result = json.loads(result.content)
        # Baidu geocoder returns status == 0 on success.
        if result.get('status') == 0:
            update_sql = 'update vendor_miqilin set lat=:lat,lng=:lng where vendor_id=:vendor_id'
            params = {
                'lat':result['result']['location'].get('lat'),
                'lng':result['result']['location'].get('lng'),
                'vendor_id': item['vendor_id']
            }
            dbmysql.edit(update_sql, params)
        else:
            # lat_flag = 1 marks the row so failed geocodes can be retried.
            update_sql = 'update vendor_miqilin set lat_flag = 1 where vendor_id=:vendor_id'
            dbmysql.edit(update_sql,{'vendor_id': item['vendor_id']})
            print(str(item['vendor_id']))
Beispiel #20
0
def start_spider():
    """Walk the lagou category menu and kick off a listing crawl for every
    position link, carrying the warm-up session cookies along."""
    response = request.get(start_url)
    if not response:
        return
    cookies = response.cookies
    doc = etree.HTML(response.content)
    menu = doc.xpath("//div[@class='menu_sub dn']")[0]
    positions_dict = {}
    for category in menu.xpath("dl"):
        category_name = category.xpath("dt/span/text()")[0]
        positions_dict[category_name] = {}
        for anchor in category.xpath("dd/a"):
            href = anchor.xpath('@href')[0]
            positions_dict[category_name][anchor.text] = href
            # Crawl each position's listing as soon as it is discovered.
            get_positons_list(
                href,
                {'first_type': anchor.text, 'second_type': category_name},
                cookies)
def parse(last_page=23, usd_to_cny=6.3279):
    """Scrape restaurant list pages 1..last_page, convert the displayed USD
    price range to an average CNY price and insert one row per restaurant.

    :param last_page: highest page number to fetch, inclusive
        (default preserves the original hard-coded range(1, 24))
    :param usd_to_cny: exchange rate applied to the average of the shown
        low/high prices (was a magic constant inline)
    """
    for i in range(1, last_page + 1):
        url = start_url + '?page=%d' % i
        response = request.get(url)
        if not response:
            continue
        html = etree.HTML(response.content)
        for shop in html.xpath("//li[@class='poi-item poi-item-restaurant']"):
            name = shop.xpath("div[@class='poi-item-name truncate']/a/text()")[0]
            row_price = shop.xpath(".//div[@class='poi-item-price']//text()")
            row_price = [part for part in row_price if part.strip()]
            # row_price[1] / row_price[-1] look like '$12': strip the currency
            # sign, average the bounds (Python 2 floor division, as before)
            # and convert to CNY.
            price = int(row_price[1][1:]) + int(row_price[-1][1:])
            price = int(round(price / 2 * usd_to_cny))
            item = dict(
                name=name,
                price=price,
                row_price=' '.join(row_price),
            )
            sql = 'insert into price(name,price,row_price) values(:name,:price,:row_price)'
            dbmysql.edit(sql, item)
Beispiel #22
0
def shop_detail(url, item):
    """Scrape one mafengwo poi page into *item* and persist it (the item is
    saved even when the request fails)."""
    url = mafengwo_host + url
    item['vendor_url'] = url
    response = request.get(url, headers={'user-agent': user_agent})
    if not response:
        print('请求失败%s' % url)
        save_date(item)
        return
    doc = etree.HTML(response.content)
    raw_description = doc.xpath(
        "//div[@class='tips']/p[@style='max-height: 3.9em;overflow: hidden;']/text()"
    )
    # The 2.5em tip paragraphs and the map context rows both feed the same
    # key/value extractor.
    get_sub_info(
        doc.xpath(
            "//div[@class='tips']/p[@style='max-height: 2.5em;overflow: hidden;']"
        ), item)
    get_sub_info(doc.xpath("//div[@class='maps']/ul[@class='context']/li"),
                 item)
    item['description'] = handle_desctrip(raw_description)
    save_date(item)
Beispiel #23
0
def start_parse(city):
    """For every restaurant name known for *city*, build a blank item dict
    and crawl its dianping search page.

    :param city: dict with 'name' and 'code' keys
    """
    response = request.get(start_url, headers=headers)
    if not response:
        return
    names = get_names(city.get('name'))
    cookies = response.cookies
    # Every field downstream consumers expect, initialised to ''.
    # NOTE(review): 'pyment' is a typo for 'payment', but consumers read the
    # key under this spelling, so it is kept.
    empty_fields = (
        'book_status', 'facilities', 'description', 'pyment', 'low_price',
        'detail_price', 'price', 'michelin_star', 'cuisine', 'service_time',
        'address', 'phone', 'district', 'landmark', 'quality', 'brand',
        'recomm_dishes', 'taste', 'characters', 'category', 'people_group',
        'lat', 'lng', 'country', 'other_name', 'dish',
    )
    for name in names:
        url = dianping_shop_url % (city.get('code'), name)
        item = dict.fromkeys(empty_fields, '')
        item['name'] = name
        item['links'] = url
        item['city'] = city.get('name')
        get_shop(url, item, cookies, city)
Beispiel #24
0
def parse():
    """Page through the mafengwo poi listing ajax API (pages 22-3849) and
    crawl each shop that is not already stored.

    The endpoint returns JSON whose 'html' field holds the rendered list
    fragment; raw_item is the module-level template for each shop's item.
    """
    for page in range(22, 3850):
        print('%s页---开始' % page)
        headers = {
            'user-agent': user_agent,
            'x-requested-with': 'XMLHttpRequest',
        }
        page_url = mafengwo_url % page
        response = request.get(page_url, headers=headers)
        if response:
            try:
                payload = json.loads(response.content)
            except Exception as e:
                print(e)
                payload = {}
            html = etree.HTML(payload.get('html', ''))
            shop_list = html.xpath("//section[@class='poi-list']/div")
            for shop in shop_list:
                shop_item = raw_item.copy()
                name = shop.xpath(
                    "./a[@class='poi-li']/div[@class='hd']/text()")[0]
                score_details = shop.xpath(
                    ".//div[@class='star']/span/@style")[0]
                shop_url = shop.xpath("./a[@class='poi-li']/@href")[0]
                cuisine = shop.xpath(".//p[@class='m-t']/strong/text()")
                comment = shop.xpath(".//div[@class='comment']/text()")
                if exist(name):
                    print(name + '已经存在')
                    continue
                shop_item['vendor_name'] = name
                # @style looks like 'width:85%;' — keep only the number.
                shop_item['score_details'] = score_details.replace(
                    'width:', '').replace('%;', '')
                shop_item['cuisine'] = cuisine[0].replace(
                    ' ', '') if cuisine else ''
                shop_item['comment'] = comment[0] if comment else ''
                shop_detail(shop_url, shop_item)
            print('第%s页完成' % page)
        else:
            # BUG FIX: the old code printed shop_url here, which is undefined
            # when the very first request fails (NameError) and stale
            # otherwise; report the URL that actually failed.
            print('请求失败%s' % page_url)
Beispiel #25
0
def get_shopinfo(url, item, cookies, city, shop_id):
    """Scrape one dianping shop page into *item* (address, phone, labelled
    info rows, district/cuisine breadcrumb, average price) and persist it.

    The dish list is fetched first via the allReview ajax endpoint; the
    item is saved whether or not the shop page itself loaded.
    """
    response = request.get(url, headers=headers, cookies=cookies)
    # Dish-list URL. shopId: cityId=342 (Macau) / 341 (Hong Kong);
    # cityEnName: macau / hongkong; fixed query:
    # categoryURLName=food&power=5&shopType=10
    dish_url = 'http://www.dianping.com/overseas/shop/ajax/allReview?categoryURLName=food&power=5&shopType=10\
&shopId=%s&cityId=%s&cityEnName=%s' % (shop_id, city.get('code'),
                                       city.get('name'))
    item = get_dish(dish_url, item, headers, cookies)
    if response and response.text:
        html = etree.HTML(response.text)
        address = html.xpath(
            "//div[@class='expand-info address']/span[@class='item']/text()")
        item['address'] = address[0].strip() if address else ''
        phone = html.xpath(
            "//p[@class='expand-info tel']/span[@class='item']/text()")
        item['phone'] = phone[0] if phone else ''
        intents = html.xpath("//p[@class='info info-indent']")
        for intent in intents:
            info_name = intent.xpath("span[@class='info-name']/text()")
            info = intent.xpath(".//span[@class='item']/text()")
            info_name = info_name[0] if info_name else ''
            info = info[0] if info else ''
            # Label strings must match the page's exact rendering, including
            # the embedded padding spaces (别名 = alias, 营业时间 = opening
            # hours, 餐厅简介 = restaurant intro).
            if info_name == '别       名:':
                item['other_name'] = info.strip()
            elif info_name == '营业时间:':
                item['service_time'] = info.strip()
            elif info_name == '餐厅简介:':
                item['description'] = info.strip()

        # Breadcrumb: [city, district, cuisine, ...].
        banner = html.xpath("//div[@class='breadcrumb']/a/text()")
        item['district'] = banner[1].strip() if len(banner) > 1 else ''
        item['cuisine'] = banner[2].strip() if len(banner) > 2 else ''
        price = html.xpath("//span[@id='avgPriceTitle']/text()")
        # Rendered like '人均:123元' — keep the number between ':' and '元'.
        item['price'] = price[0].split(':')[1][:-1] if price else ''
        # Carry the shop URL as Referer for subsequent requests.
        headers['Referer'] = url
        save(item)
    else:
        save(item)
Beispiel #26
0
def parse():
    """Walk listing pages 1-284 with a short random delay and parse each."""
    for page_no in range(1, 285):
        # Light throttling to avoid hammering the server.
        time.sleep(random.randint(1, 3))
        get_airport(request.get(start_url % page_no))
Beispiel #27
0
def shop_detail(url, item):
    """Scrape one OpenRice shop page into *item*: dishes (regex), the
    embedded JSON-LD metadata (cuisine/price/phone/geo/address), several
    xpath sections, business hours from the status API, and booking flags.
    Hands the enriched item on to tripadvisor() for further enrichment.

    NOTE(review): .decode('utf-8') on response text and on the regex
    pattern implies Python 2 byte strings — confirm before porting.
    """
    url = openrice_host + url
    # Shop id is the trailing '-r<ID>' segment of the URL.
    shop_id = url.rsplit('-r')[-1]
    response = request.get(url)
    if response:
        text = response.text
        html = etree.HTML(response.content)
        # Flatten the page so the dish regex can match across line breaks.
        text = text.replace('\n', '').replace('\r', '')
        text = text.decode('utf-8')
        # "主要菜式包括 ..." = "main dishes include ...".
        dish = re.search(r'主要菜式包括 (.*?), 。'.decode('utf-8'), text)
        if dish:
            dish = dish.group(1).replace(', ', ';')
            item['dish'] = dish
        # Structured metadata embedded as a JSON-LD <script> block.
        application_json = re.search(r'"application/ld\+json">(.*?)</script>',
                                     text)
        if application_json:
            try:
                application_json = application_json.group(1)
                application_json = json.loads(application_json)
                cuisine = application_json.get('servesCuisine', '')
                price_range = application_json.get('priceRange', '')
                phone = application_json.get('telephone', '')
                url = application_json.get('url', '')
                # geo/address lookups raise KeyError when absent — caught by
                # the blanket except below.
                lat = application_json['geo']['latitude']
                lng = application_json['geo']['longitude']
                district = application_json['address']['addressLocality']
                address = district + application_json['address'][
                    'streetAddress']
                item['cuisine'] = cuisine
                item['price_range'] = price_range
                item['price'] = handle_price(price_range)
                item['price_class'] = get_price_class(item['price'])
                item['phone'] = phone
                item['lat'] = lat
                item['lng'] = lng
                item['district'] = district
                item['address'] = address
                item['openrich_url'] = url
            except Exception as e:
                # Best-effort: malformed JSON-LD only loses these fields.
                print(e)
        description = handle_node(
            html.xpath(
                "//section[@class='introduction-section']/div[@class='content js-text-wrapper']/text()"
            ))
        characters = handle_node(
            html.xpath(
                "//section[@class='good-for-section']/div[@class='content']/text()"
            ))
        payment = handle_node(
            html.xpath(
                "//div[@id='pois-filter-expandable-features']//div[@class='comma-tags']//text()"
            ))
        # Tick icons mark available facilities, cross icons unavailable ones.
        facilities = handle_node(
            html.xpath(
                "//span[@class='or-sprite-inline-block d_sr2_lhs_tick_desktop']/following-sibling::span/text()"
            ))
        none_facilities = html.xpath(
            "//span[@class='or-sprite-inline-block d_sr2_lhs_cross_desktop']/following-sibling::span/text()"
        )
        category = handle_node(
            html.xpath(
                "//div[@class='header-poi-categories dot-separator']/div/a[contains(@href,'/type/')]/text()"
            ))
        score_details = html.xpath(
            "//div[@class='header-score-details-right-item']")
        score_details = get_score_details(score_details)
        # Opening hours come from a separate status API call.
        business_hours = get_business_hours(shop_id)
        if business_hours:
            item['business_hours'] = json.dumps(
                business_hours['business_hours'], ensure_ascii=False, indent=2)
            item['open_time'] = business_hours.get('open_time')
            item['close_time'] = business_hours.get('close_time')

        # book_status: 0 = bookable facility listed, 2 = explicitly not.
        if handle_book(facilities.split(';')):
            item['book_status'] = 0

        if handle_no_book(none_facilities):
            item['book_status'] = 2

        item['description'] = description
        item['characters'] = characters
        item['payment'] = payment
        item['facilities'] = facilities
        item['category'] = category
        item['score_details'] = json.dumps(score_details,
                                           ensure_ascii=False,
                                           indent=2)
        # print(item)
        tripadvisor(item)
    else:
        print('请求失败%s' % url)