Code example #1
def get_all_province():
    """ 获取区域
    从北京站获取的省和直辖市是全面的,获取每个省和直辖市的名字,每个省和直辖市的品牌是不一样的,
    本来是从每个品牌商家列表上边的区域位置按钮弹层中获得该品牌的覆盖的地区,但是有的品牌不支持这样,
    所以对这些品牌应先获得所有的地区,然后与各品牌拼接url。顺便写入数据库

    以上两种方法结合互补执行

    :return:
    """
    bhtml = general_helper.get_response(
        'http://dealer.bitauto.com/beijing/audi/')
    bsoup = BeautifulSoup(bhtml, 'lxml')
    provincelist = bsoup.find('ul', 'layer-txt-list').find_all('li')
    plist = []
    for province in provincelist:
        p = {}
        p['url'] = re.findall(r'(?<=href=\").*?(?=\">)',
                              str(province))[0].decode('utf-8')
        p['name'] = re.findall(r'(?<=0\">).*?(?=<)',
                               str(province))[0].decode('utf-8')
        p['show'] = p['url'].split('/')[1].decode('utf-8')
        p['num'] = re.findall(r'(?<=\().*?(?=\))', str(province))[0]
        now_time = general_helper.get_now()
        sql = u"insert into province (`name`,`show`,`url`,`create_time`)\
           values ( %s,%s,%s,%s)"

        params = (p['name'], p['show'], p['url'], now_time)
        mysql.insert(sql, params)
        plist.append(p)
    return plist
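
The dicts returned by get_all_province() carry url, name, show and num keys; as examples #2 and #3 show, the show slug is what gets combined with a brand slug to build a dealer-list URL. A minimal usage sketch under that assumption, reusing the module-level main_url and general_helper seen throughout these examples ('bmw' is a hypothetical brand slug, not taken from the crawled data):

# Usage sketch: combine each province slug with a hypothetical brand slug
# to form the same dealer-list path pattern that get_province() builds.
provinces = get_all_province()
for p in provinces:
    # p looks like {'url': ..., 'name': ..., 'show': ..., 'num': ...}
    para = '/' + p['show'] + '/bmw/?BizModes=0'
    print general_helper.build_url(main_url, para)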
Code example #2
def get_province(bshow):
    """ 从数据库中读取省直辖市名称与品牌名称构建url,若有经销商则返回该省

    :return:
    """
    sql = u"select distinct `name`, `show` from province"
    relist = mysql.select(sql)
    # closelink(cur,conn)
    plist = []
    for i in range(len(relist)):
        # print relist[i][0]
        para = '/' + relist[i][1] + '/' + bshow + '/?BizModes=0'
        lurl = general_helper.build_url(main_url, para)
        # print lurl
        html = general_helper.get_response(lurl)
        soup = BeautifulSoup(html, 'lxml')
        dealerbox = soup.find('div', 'main-inner-section sm dealer-box')
        dealerlist = dealerbox.find_all('div', 'row dealer-list')
        if len(dealerlist) == 0:
            # print relist[i][0]+'has not dealer'
            continue
        else:
            p = {'name': relist[i][0], 'show': relist[i][1]}
            plist.append(p)
            # print relist[i][0],relist[i][1]+' has dealer'
    return plist
Code example #3
def get_all_dealer(brand_list):
    """ 根据品牌获取品牌覆盖地区再获取商家信息
    :param brand_list
    :return:空。数据存入
    """
    reload(sys)
    sys.setdefaultencoding("utf-8")
    # conn,cur=Linksql('192.168.10.71','datacrowler','1qazXSW@','PriceCrawlerDB')
    conn = None
    cur = None
    for brand in brand_list:  # for each brand
        logger.debug('crawling: %s' % brand['name'])
        mbrandname = brand['mainbrand']
        mbrandid = brand['mainid']
        mbrandshow = brand['mainshow']
        bname = brand['name']
        bshow = brand['show']
        if brand['num'] == 0:  # a count of 0 next to the brand means it has no dealers
            continue
        else:
            logger.info("get dealers of %s, %s, %s" %
                        (mbrandname, bname, brand['num'] or 0))
            burl = general_helper.build_url(main_url, brand['url'])
            bhtml = general_helper.get_response(burl)
            bsoup = BeautifulSoup(bhtml, 'lxml')
            # Some brands do not expose the regions they cover in the location
            # pop-up above the dealer list; get_location() then raises and we
            # fall back to the second approach (provinces read from the database).
            try:
                plist = get_location(bsoup, 'ul', 'layer-txt-list')
            except Exception:
                plist = get_province(bshow)
                logger.critical(
                    "%s: region pop-up unavailable, fell back to %s provinces from the DB"
                    % (bname, len(plist)))
            # brand['location']=[]
            # Even after the fallback some brands cover no province or municipality
            # at all; plist is then empty, so skip the brand.
            if len(plist) == 0:
                continue
            else:
                for p in plist:
                    pname = p['name']
                    purl = p['url']
                    pshow = p['show']

                    # dealer count for this province; fall back to 0 if it cannot be parsed
                    pnum = 0
                    try:
                        pnum = int(p['num'])
                    except Exception, e:
                        logger.critical(e.message)
Code example #4
def get_dealer_telephone(dealer_id):
    """ 由商家ID获取商家电话,输入商家id为字符串

    :param dealer_id:
    :return:
    """
    url = 'http://autocall.bitauto.com/eil/das2.ashx?userid=' + str(
        dealer_id) + '&mediaid=10&source=bitauto'
    response = general_helper.get_response(url, False)
    response = response.text
    telstr = re.findall(r'(?<=tel\"\:").*?(?=\")', str(response))
    if not telstr:
        tel = None
    else:
        tel = telstr[0].decode('utf-8')
    return tel
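
The lookbehind pattern above suggests the endpoint returns a JSON-style body containing a "tel" field. Assuming that is the case (an assumption, not something the original code confirms), the same lookup could parse the body with the json module instead of a regex; the function name below is illustrative, not part of the project:

import json

def get_dealer_telephone_json(dealer_id):
    """ Alternative sketch: parse the response as JSON instead of regex-scraping.
    Assumes the endpoint returns a JSON object with a 'tel' key; returns None
    when the body is not valid JSON or the key is absent.
    """
    url = 'http://autocall.bitauto.com/eil/das2.ashx?userid=' + str(
        dealer_id) + '&mediaid=10&source=bitauto'
    response = general_helper.get_response(url, False)
    try:
        data = json.loads(response.text)
    except ValueError:
        return None
    return data.get('tel') if isinstance(data, dict) else None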
Code example #5
File: yiche_car.py  Project: cauwt/CarDataCrawler
def get_and_insert_car(brand_serial_list):
    """根据车型ID获取车款信息,以及补充车型信息
    可以得到以下信息
    serial_spell,
    serial_show_name,

    car_id,
    car_name,
    car_gear,
    car_engine,
    car_msrp,
    car_sale_year

    :param brand_serial_list:
    :return:主品牌品牌车型车款列表
    """
    serial_url_base = 'http://car.bitauto.com/tree_chexing/sb_@serial_id'
    for brand in brand_serial_list:
        logger.debug('brand: %s' % (brand['brand_name']))
        brand_serial_car_list = []
        for serial in brand['serial']:
            logger.debug('serial: %s' % (serial['serial_name']))
            serial_id = serial['serial_id']
            serial_url = serial_url_base.replace('@serial_id', str(serial_id))
            logger.debug('url: %s' % serial_url)
            content = general_helper.get_response(serial_url)
            html = etree.HTML(content)
            serial_spell = str(
                html.xpath(
                    '//div[@class="section-header header1"]/div/h2/a/@href')
                [0])[1:-1]
            serial_show_name = str(
                html.xpath(
                    '//div[@class="section-header header1"]/div/h2/a/text()')
                [0]).decode('utf-8')
            car_row_list = html.xpath('//table[@id="compare_sale"]/tbody/tr')
            for car_row in car_row_list:
                if car_row.get('class') == 'table-tit':  # group header row (engine label)
                    car_engine = str(
                        car_row.xpath(
                            'normalize-space(th[@class="first-item"])')
                    ).decode('utf-8')
                else:  # a trim (car) row
                    car_id = int(
                        re.search(r'\d+',
                                  car_row.attrib['id']).group().strip())
                    car_name = str(car_row.xpath('td/a/text()')
                                   [0]).strip().decode('utf-8')
                    car_gear = str(
                        car_row.xpath('string(td[3])')).strip().decode('utf-8')
                    car_msrp_match = re.search(
                        r'(\d+(\.\d+)?)',
                        str(
                            car_row.xpath('string(td[@class="txt-right"]/span)'
                                          )).strip())
                    car_msrp = car_msrp_match.group(
                    ) if car_msrp_match else 0.0

                    sale_year_match = re.search(r'^\d+', car_name)
                    car_sale_year = sale_year_match.group() if sale_year_match else ''
                    brand_serial_car = {
                        'main_brand_id': brand['main_brand_id'],
                        'main_brand_name': brand['main_brand_name'],
                        'brand_id': brand['brand_id'],
                        'brand_name': brand['brand_name'],
                        'serial_id': serial['serial_id'],
                        'serial_name': serial['serial_name'],
                        'serial_spell': serial_spell,
                        'serial_show_name': serial_show_name,
                        'car_id': car_id,
                        'car_name': car_name,
                        'car_gear': car_gear,
                        'car_engine': car_engine,
                        'car_msrp': car_msrp,
                        'car_sale_year': car_sale_year
                    }
                    brand_serial_car_list.append(brand_serial_car)
        insert_car_to_db(brand_serial_car_list)
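
The structure get_and_insert_car() expects for brand_serial_list can be read off the keys it accesses. A minimal, hypothetical input sketch (all IDs and names below are placeholders, except serial_id 1608, which is the serial queried in example #8):

# Hypothetical input shape for get_and_insert_car(): each brand dict carries
# main_brand_id/main_brand_name, brand_id/brand_name and a 'serial' list whose
# entries hold serial_id and serial_name. Values here are placeholders.
sample_brand_serial_list = [{
    'main_brand_id': 1,
    'main_brand_name': u'MainBrandPlaceholder',
    'brand_id': 10,
    'brand_name': u'BrandPlaceholder',
    'serial': [{
        'serial_id': 1608,  # the serial id used in example #8
        'serial_name': u'SerialPlaceholder',
    }],
}]
get_and_insert_car(sample_brand_serial_list)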
Code example #6
def get_dealer(lurl, location):
    """从一个品牌区域的url获取商家信息

    :param lurl:
    :param location:
    :return:
    """
    html = general_helper.get_response(lurl)
    # print html.encode('gbk','ignore')
    soup = BeautifulSoup(html, 'lxml')
    # print 'begin get dealer'
    dealerbox = soup.find('div', 'main-inner-section sm dealer-box')
    dealerlist = dealerbox.find_all('div', 'row dealer-list')
    # print dealerlist
    for dealer in dealerlist:
        inf = dealer.find('div', 'col-xs-6 left')
        name = inf.find('h6', 'title-4s').find('a')  # title link
        dtype = name.find('em').string  # dealer type
        # print dtype
        durl = re.findall(r'(?<=href=\").*?(?=\")', str(name))[0]  # dealer url
        # print durl
        dname = re.findall(r'(?<=span>).*?(?=<)',
                           str(name))[0].decode('utf-8')  # dealer name
        # print dname
        dealer_id = int(re.findall(r'(?<=com/)\d+(?=/)', str(durl))[0])  # dealer ID
        # print dealer_id

        dpinpai = re.findall(r'(?<=span\>).*?(?=\<)',
                             str(inf.find('p', 'brand')))[0].decode(
                                 'utf-8')  # dealer's main brands
        # print dpinpai
        if inf.find('p', 'promote') is not None:  # does the dealer have an ongoing price cut?
            # collapse runs of whitespace in the title (str.replace only matches the
            # literal pattern, so use re.sub here)
            dpromotetitle = re.sub(
                r'\s+', u' ',
                inf.find('p', 'promote').find('a').string.decode('utf-8'))
            dpromoteurl = re.findall(
                r'(?<=href=\").*?(?=\")',
                str(inf.find('p', 'promote').find('a')))[0].decode('utf-8')
            dpromoteday = inf.find('p', 'promote').find(
                'span', 'time').string.decode('utf-8')
        else:
            dpromotetitle = None  # title of the ongoing price cut
            dpromoteurl = None  # url of the price-cut news item
            dpromoteday = None  # days remaining
        # print dpromotetitle
        # print dpromoteurl
        # print dpromoteday
        add = inf.find('p', 'add').find_all(
            'span', attrs={'title':
                           True})[0].attrs['title'].replace(u'\xa0',
                                                            u'')  # dealer address
        # print add.encode('gbk','ignore')
        tel = get_dealer_telephone(dealer_id)
        dtel = tel  # dealer telephone
        # print dtel.encode('gbk','ignore')
        try:
            dsalearea = inf.find('p', 'tel').find('span',
                                                  'sales-area').string  # sales area
        except Exception, e:
            print lurl, dname, location['pname'], location[
                'mainbrand'], location['bname'], inf.find('p', 'tel')
            raise
        dcity = dealer.find('div',
                            'col-xs-7 middle').p.string.split(' ')[0]  # city
        dlocation = dealer.find(
            'div',
            'col-xs-7 middle').p.string.split(' ')[1].replace('&nbsp;',
                                                              '')  # district
        now_time = general_helper.get_now()
        logger.debug("%s,%s,%s,%s,%s" %
                     (location['pname'], dcity, dlocation, dealer_id, dname))
        sql = u"insert into dealer_raw(" \
              u"`main_brand_id`,`main_brand_name`,`main_brand_show`,`brand_name`,`brand_show`,`province_name`,`province_show`," \
              u"`city_name`,`location_name`,`dealer_type`,`dealer_url`,`dealer_name`,`dealer_id`,`dealer_brand`,`dealer_pro_title`," \
              u"`dealer_pro_url`,`dealer_pro_day`,`dealer_add`,`dealer_tel`,`sale_area`,`url`,`create_time`" \
              u") values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        params = (location['mainid'], location['mainbrand'],
                  location['mainshow'], location['bname'], location['bshow'],
                  location['pname'], location['pshow'], dcity, dlocation,
                  dtype, durl.decode('utf-8'), dname, dealer_id, dpinpai,
                  dpromotetitle or '', dpromoteurl
                  or '', dpromoteday or '', add, dtel or '', dsalearea,
                  lurl.decode('utf-8'), now_time)
        # print sql
        try:
            mysql.insert(sql, params)
        except Exception, e:
            print 'this is an except:', str(e)
            print sql
            print location['mainid'], location['mainbrand'], location[
                'mainshow'], location['bname'], location['bshow'], location[
                    'pname'], location['pshow']
            print dcity, dlocation
            print dtype, durl, dname, dealer_id
            print dpinpai
            print dpromotetitle, dpromoteurl, dpromoteday
            print add.encode('gbk', 'ignore'), dtel, dsalearea
            print lurl
            raise
Code example #7
                        'mainshow': mbrandshow,
                        'bname': bname,
                        'bshow': bshow,
                        'pname': pname,
                        'pshow': pshow,
                        'pnum': pnum
                    }
                    # location['purl']=purl
                    # If the whole province has no more than 10 dealers there is no need
                    # to drill down to city/district level. The dealer list shows 10
                    # dealers per page, and with more than 10 the later pages repeat
                    # dealers, so paging through them ends up missing some (an
                    # alternative dedupe sketch follows this example).
                    if int(p['num']) <= 10:
                        # print p['name'],' has <10 dealer'
                        get_dealer(purl, location)
                        continue
                    else:
                        phtml = general_helper.get_response(purl)
                        psoup = BeautifulSoup(phtml, 'lxml')
                        # clist=[]

                        clist = get_city(p, psoup)  # cities under this province
                        for c in clist:
                            if c['name'] == u'不限':  # skip the "no limit" (all cities) entry
                                continue
                            else:
                                # print c['name']
                                cname = c['name']
                                curl = c['url']
                                cnum = c['num']
                                cshow = c['show']
                                c_url = general_helper.build_url(
                                    main_url, c['url'])
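
The comment in this fragment explains why the crawler narrows the query from province to city: the dealer list is paginated 10 per page, and later pages can repeat dealers, which in turn causes some dealers to be missed. Deduplicating on dealer_id is a complementary guard against the duplicate half of that problem only (it cannot recover dealers the listing drops, which is why the code above still drills down). A minimal sketch, not part of the original code; seen_dealer_ids, insert_dealer_once and insert_fn are hypothetical names:

# Sketch: remember dealer_ids already stored so repeated entries on later
# pages are skipped. insert_fn is a hypothetical callable that performs the
# actual insert for one dealer.
seen_dealer_ids = set()

def insert_dealer_once(dealer_id, insert_fn):
    if dealer_id in seen_dealer_ids:
        return False  # already stored, skip the duplicate
    seen_dealer_ids.add(dealer_id)
    insert_fn()
    return True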
Code example #8
# description:
# ----------------------------------------------------------------------------------------------------------------------

import sys
from bs4 import BeautifulSoup
import re
from lxml import etree

from spiders.yiche_car import insert_car_to_db
from utils import general_helper

if __name__ == '__main__':
    reload(sys)
    sys.setdefaultencoding('utf8')
    serial_url = 'http://car.bitauto.com/tree_chexing/sb_1608'
    content = general_helper.get_response(serial_url)
    html = etree.HTML(content)
    serial_spell = str(
        html.xpath('//div[@class="section-header header1"]/div/h2/a/@href')
        [0])[1:-1]
    serial_show_name = str(
        html.xpath('//div[@class="section-header header1"]/div/h2/a/text()')
        [0]).decode('utf-8')
    car_row_list = html.xpath('//table[@id="compare_sale"]/tbody/tr')
    brand_serial_car_list = []
    for car_row in car_row_list:
        if car_row.get('class') == 'table-tit':  # group header row (engine label)
            car_engine = str(
                car_row.xpath('normalize-space(th[@class="first-item"])')
            ).decode('utf-8')