def get_all_province():
    """Collect the full province/municipality list from the Beijing Audi page.

    The Beijing dealer page lists every province and municipality. Brand
    coverage differs per brand; normally covered regions come from the
    region pop-up on a brand's dealer-list page, but some brands lack that
    pop-up, so this complete region list is fetched up front and later
    combined with each brand to build URLs. Each region is also written to
    the `province` table. The two approaches complement each other.

    :return: list of dicts with keys 'url', 'name', 'show', 'num'
    """
    page = general_helper.get_response(
        'http://dealer.bitauto.com/beijing/audi/')
    soup = BeautifulSoup(page, 'lxml')
    items = soup.find('ul', 'layer-txt-list').find_all('li')
    provinces = []
    for item in items:
        raw = str(item)
        # Pull url / display name / dealer count out of the <li> markup.
        record = {
            'url': re.findall(r'(?<=href=\").*?(?=\">)',
                              raw)[0].decode('utf-8'),
            'name': re.findall(r'(?<=0\">).*?(?=<)', raw)[0].decode('utf-8'),
            'num': re.findall(r'(?<=\().*?(?=\))', raw)[0],
        }
        # The URL looks like "/<show>/..." — the slug is segment 1.
        record['show'] = record['url'].split('/')[1].decode('utf-8')
        stamp = general_helper.get_now()
        sql = u"insert into province (`name`,`show`,`url`,`create_time`)\
 values ( %s,%s,%s,%s)"
        mysql.insert(sql,
                     (record['name'], record['show'], record['url'], stamp))
        provinces.append(record)
    return provinces
def get_province(bshow, legacy_bshow=None):
    """Return the provinces (from the DB) that have dealers for a brand.

    For each distinct province stored in the `province` table, builds
    ``/<province-show>/<brand-show>/?BizModes=0`` against the module-level
    ``main_url`` and keeps the province when its dealer list is non-empty.

    :param bshow: brand "show" slug used in the listing URL.
    :param legacy_bshow: backward-compat shim — elsewhere in this file the
        function is invoked as ``get_province(main_url, bshow)`` (two
        positional args) although it was declared with one, which raised a
        TypeError. When a second argument is given it is taken as the real
        brand slug and the first argument is ignored (``main_url`` is read
        from module scope in both cases).
    :return: list of ``{'name': ..., 'show': ...}`` dicts.
    """
    if legacy_bshow is not None:
        bshow = legacy_bshow
    relist = mysql.select(u"select distinct `name`, `show` from province")
    plist = []
    for pname, pshow in relist:
        para = '/' + pshow + '/' + bshow + '/?BizModes=0'
        lurl = general_helper.build_url(main_url, para)
        soup = BeautifulSoup(general_helper.get_response(lurl), 'lxml')
        dealerbox = soup.find('div', 'main-inner-section sm dealer-box')
        # Keep the province only if at least one dealer row is present.
        if dealerbox.find_all('div', 'row dealer-list'):
            plist.append({'name': pname, 'show': pshow})
    return plist
def get_all_dealer(brand_list):
    """Fetch dealers for each brand: resolve the regions a brand covers,
    then walk those regions for dealer details (stored by callees).

    :param brand_list: brand dicts with 'name', 'show', 'url', 'num',
        'mainbrand', 'mainid', 'mainshow'.
    :return: None; data is persisted downstream.
    """
    reload(sys)
    sys.setdefaultencoding("utf-8")
    conn = None
    cur = None
    for brand in brand_list:  # one brand at a time
        logger.debug('crawling: %s' % brand['name'])
        mbrandname = brand['mainbrand']
        mbrandid = brand['mainid']
        mbrandshow = brand['mainshow']
        bname = brand['name']
        bshow = brand['show']
        # A count of 0 next to the brand means it has no dealers at all.
        if brand['num'] == 0:
            continue
        else:
            logger.info("get dealers of %s, %s, %s" %
                        (mbrandname, bname, brand['num'] or 0))
            burl = general_helper.build_url(main_url, brand['url'])
            bhtml = general_helper.get_response(burl)
            bsoup = BeautifulSoup(bhtml, 'lxml')
            try:
                # Some brands lack the region pop-up layer on the
                # dealer-list page; get_location raises for those.
                plist = get_location(bsoup, 'ul', 'layer-txt-list')
            except Exception, e:
                # Plan B: probe every province from the DB instead.
                plist = get_province(main_url, bshow)
                # NOTE(review): get_province is declared with a single
                # parameter elsewhere in this file — this 2-arg call looks
                # wrong; confirm the intended signature.
                logger.critical(
                    "%s, %s, %s" %
                    (bname, len(plist), ' this brand don\'t have dealer'))
                raise
                # NOTE(review): the unconditional raise above discards
                # plan B's result — probably unintended.
            # brand['location']=[]
            if len(
                    plist
            ) == 0:
                # Even after plan B some brands cover no province at all,
                # in which case plist == [] and the brand is skipped.
                continue
            else:
                for p in plist:
                    pname = p['name']
                    purl = p['url']
                    pshow = p['show']
                    pnum = 0
                    try:
                        pnum = int(p['num'])
                    except Exception, e:
                        logger.critical(e.message)
                        raise
                    finally:
                        # NOTE(review): this finally ALWAYS resets pnum to
                        # 0, clobbering the parsed value above — likely a
                        # bug; confirm before relying on pnum.
                        pnum = 0
def get_dealer_telephone(dealer_id):
    """Fetch a dealer's phone number from the autocall endpoint.

    :param dealer_id: dealer ID (anything ``str()``-able).
    :return: unicode phone number, or None when the response has none.
    """
    url = ('http://autocall.bitauto.com/eil/das2.ashx?userid='
           + str(dealer_id) + '&mediaid=10&source=bitauto')
    response = general_helper.get_response(url, False)
    response = response.text
    # The payload embeds the number as ...tel":"<number>"...
    matches = re.findall(r'(?<=tel\"\:").*?(?=\")', str(response))
    if matches:
        return matches[0].decode('utf-8')
    return None
def get_and_insert_car(brand_serial_list):
    """Fetch per-serial car (trim) data and store it via insert_car_to_db.

    For each serial of each brand, scrapes the comparison table at
    ``http://car.bitauto.com/tree_chexing/sb_<serial_id>`` and collects
    serial_spell, serial_show_name, car_id, car_name, car_gear,
    car_engine, car_msrp and car_sale_year, then inserts one batch per
    brand into the database.

    :param brand_serial_list: list of brand dicts, each carrying
        'main_brand_id', 'main_brand_name', 'brand_id', 'brand_name'
        and a 'serial' list of {'serial_id', 'serial_name'} dicts.
    :return: None; rows are written to the DB.
    """
    serial_url_base = 'http://car.bitauto.com/tree_chexing/sb_@serial_id'
    for brand in brand_serial_list:
        logger.debug('brand: %s' % (brand['brand_name']))
        brand_serial_car_list = []
        for serial in brand['serial']:
            logger.debug('serial: %s' % (serial['serial_name']))
            serial_id = serial['serial_id']
            serial_url = serial_url_base.replace('@serial_id',
                                                 str(serial_id))
            logger.debug('url: %s' % serial_url)
            content = general_helper.get_response(serial_url)
            html = etree.HTML(content)
            # href has the form "/<spell>/" — strip the surrounding slashes.
            serial_spell = str(
                html.xpath(
                    '//div[@class="section-header header1"]/div/h2/a/@href')
                [0])[1:-1]
            serial_show_name = str(
                html.xpath(
                    '//div[@class="section-header header1"]/div/h2/a/text()')
                [0]).decode('utf-8')
            # Fallback in case a car row precedes any group-header row;
            # previously car_engine could be referenced before assignment.
            car_engine = u''
            car_row_list = html.xpath('//table[@id="compare_sale"]/tbody/tr')
            for car_row in car_row_list:
                if 'class' in car_row.attrib and car_row.attrib[
                        'class'] == 'table-tit':
                    # Group-header row: remember the engine label that
                    # applies to the following car rows.
                    car_engine = str(
                        car_row.xpath(
                            'normalize-space(th[@class="first-item"])')
                    ).decode('utf-8')
                else:
                    # Car (trim) row.
                    car_id = int(
                        re.search(r'\d+',
                                  car_row.attrib['id']).group().strip())
                    car_name = str(car_row.xpath('td/a/text()')
                                   [0]).strip().decode('utf-8')
                    car_gear = str(
                        car_row.xpath('string(td[3])')).strip().decode(
                            'utf-8')
                    car_msrp_match = re.search(
                        r'(\d+(\.\d+)?)',
                        str(
                            car_row.xpath(
                                'string(td[@class="txt-right"]/span)')
                        ).strip())
                    car_msrp = car_msrp_match.group(
                    ) if car_msrp_match else 0.0
                    # BUGFIX: the old `re.search(...).group() or ''` raised
                    # AttributeError whenever the name had no leading year;
                    # the `or ''` shows '' was the intended default.
                    year_match = re.search(r'^\d+', car_name)
                    car_sale_year = year_match.group() if year_match else ''
                    brand_serial_car = {
                        'main_brand_id': brand['main_brand_id'],
                        'main_brand_name': brand['main_brand_name'],
                        'brand_id': brand['brand_id'],
                        'brand_name': brand['brand_name'],
                        'serial_id': serial['serial_id'],
                        'serial_name': serial['serial_name'],
                        'serial_spell': serial_spell,
                        'serial_show_name': serial_show_name,
                        'car_id': car_id,
                        'car_name': car_name,
                        'car_gear': car_gear,
                        'car_engine': car_engine,
                        'car_msrp': car_msrp,
                        'car_sale_year': car_sale_year
                    }
                    brand_serial_car_list.append(brand_serial_car)
        # One batch insert per brand.
        insert_car_to_db(brand_serial_car_list)
def get_dealer(lurl, location):
    """Scrape dealer records from one brand+region listing page and insert
    each into the dealer_raw table.

    :param lurl: listing-page URL for a brand within one region.
    :param location: dict carrying brand/province context ('mainid',
        'mainbrand', 'mainshow', 'bname', 'bshow', 'pname', 'pshow').
    :return: None; rows are written to the database.
    """
    html = general_helper.get_response(lurl)
    # print html.encode('gbk','ignore')
    soup = BeautifulSoup(html, 'lxml')
    # print 'begin get dealer'
    dealerbox = soup.find('div', 'main-inner-section sm dealer-box')
    dealerlist = dealerbox.find_all('div', 'row dealer-list')
    # print dealerlist
    for dealer in dealerlist:
        inf = dealer.find('div', 'col-xs-6 left')
        name = inf.find('h6', 'title-4s').find('a')  # title anchor
        dtype = name.find('em').string  # dealer type
        # print dtype
        durl = re.findall(r'(?<=href=\").*?(?=\")',
                          str(name))[0]  # dealer url
        # print durl
        dname = re.findall(r'(?<=span>).*?(?=<)',
                           str(name))[0].decode('utf-8')  # dealer name
        # print dname
        dealer_id = int(re.findall(r'(?<=com/)\d+(?=/)',
                                   str(durl))[0])  # dealer ID
        # print dealer_id
        dpinpai = re.findall(r'(?<=span\>).*?(?=\<)',
                             str(inf.find('p', 'brand')))[0].decode(
                                 'utf-8')  # dealer's main brands
        # print dpinpai
        # Does the dealer have an active price-cut promotion?
        if inf.find('p', 'promote') != None:
            dpromotetitle = inf.find(
                'p', 'promote').find('a').string.decode('utf-8').replace(
                    r'\s+', u' ')
            # NOTE(review): str.replace takes a literal, not a regex — the
            # r'\s+' pattern above never matches anything; confirm intent.
            dpromoteurl = re.findall(
                r'(?<=href=\").*?(?=\")',
                str(inf.find('p', 'promote').find('a')))[0].decode('utf-8')
            dpromoteday = inf.find('p', 'promote').find(
                'span', 'time').string.decode('utf-8')
        else:
            dpromotetitle = None  # active price-cut headline
            dpromoteurl = None  # price-cut news url
            dpromoteday = None  # days remaining
        # print dpromotetitle
        # print dpromoteurl
        # print dpromoteday
        add = inf.find('p', 'add').find_all(
            'span',
            attrs={'title': True})[0].attrs['title'].replace(u'\xa0',
                                                             u'')  # address
        # print add.encode('gbk','ignore')
        tel = get_dealer_telephone(dealer_id)
        dtel = tel  # dealer phone (may be None)
        # print dtel.encode('gbk','ignore')
        try:
            dsalearea = inf.find('p', 'tel').find('span',
                                                  'sales-area').string  # sales area
        except Exception, e:
            # Dump context before re-raising so the failing page/row can
            # be reproduced.
            print lurl, dname, location['pname'], location[
                'mainbrand'], location['bname'], inf.find('p', 'tel')
            raise
        dcity = dealer.find('div',
                            'col-xs-7 middle').p.string.split(' ')[0]  # city
        dlocation = dealer.find(
            'div',
            'col-xs-7 middle').p.string.split(' ')[1].replace(' ', '')  # district
        now_time = general_helper.get_now()
        logger.debug("%s,%s,%s,%s,%s" %
                     (location['pname'], dcity, dlocation, dealer_id, dname))
        sql = u"insert into dealer_raw(" \
              u"`main_brand_id`,`main_brand_name`,`main_brand_show`,`brand_name`,`brand_show`,`province_name`,`province_show`," \
              u"`city_name`,`location_name`,`dealer_type`,`dealer_url`,`dealer_name`,`dealer_id`,`dealer_brand`,`dealer_pro_title`," \
              u"`dealer_pro_url`,`dealer_pro_day`,`dealer_add`,`dealer_tel`,`sale_area`,`url`,`create_time`" \
              u") values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        params = (location['mainid'], location['mainbrand'],
                  location['mainshow'], location['bname'], location['bshow'],
                  location['pname'], location['pshow'], dcity, dlocation,
                  dtype, durl.decode('utf-8'), dname, dealer_id, dpinpai,
                  dpromotetitle or '', dpromoteurl or '', dpromoteday or '',
                  add, dtel or '', dsalearea, lurl.decode('utf-8'), now_time)
        # print sql
        try:
            mysql.insert(sql, params)
        except Exception, e:
            # Print the full row context before re-raising so the bad
            # insert can be diagnosed.
            print 'this is an except:', str(e)
            print sql
            print location['mainid'], location['mainbrand'], location[
                'mainshow'], location['bname'], location['bshow'], location[
                    'pname'], location['pshow']
            print dcity, dlocation
            print dtype, durl, dname, dealer_id
            print dpinpai
            print dpromotetitle, dpromoteurl, dpromoteday
            print add.encode('gbk', 'ignore'), dtel, dsalearea
            print lurl
            raise
# NOTE(review): fragment — continuation of get_all_dealer's per-province
# loop; it closes the `location` dict literal opened on an earlier line.
                    'mainshow': mbrandshow,
                    'bname': bname,
                    'bshow': bshow,
                    'pname': pname,
                    'pshow': pshow,
                    'pnum': pnum
                }
                # location['purl']=purl
                if int(
                        p['num']
                ) <= 10:
                    # With <=10 dealers the whole province fits on one
                    # listing page (10 per page), so there is no need to
                    # descend into cities/districts; multi-page listings
                    # repeat dealers and cause some to be missed.
                    get_dealer(purl, location)
                    continue
                else:
                    phtml = general_helper.get_response(purl)
                    psoup = BeautifulSoup(phtml, 'lxml')
                    # clist=[]
                    clist = get_city(p, psoup)  # cities under this province
                    for c in clist:
                        # Skip the "no limit" pseudo-city entry.
                        if c['name'] == u'不限':
                            continue
                        else:
                            cname = c['name']
                            curl = c['url']
                            cnum = c['num']
                            cshow = c['show']
                            c_url = general_helper.build_url(
                                main_url, c['url'])
# description: # ---------------------------------------------------------------------------------------------------------------------- import sys from bs4 import BeautifulSoup import re from lxml import etree from spiders.yiche_car import insert_car_to_db from utils import general_helper if __name__ == '__main__': reload(sys) sys.setdefaultencoding('utf8') serial_url = 'http://car.bitauto.com/tree_chexing/sb_1608' content = general_helper.get_response(serial_url) html = etree.HTML(content) serial_spell = str( html.xpath('//div[@class="section-header header1"]/div/h2/a/@href') [0])[1:-1] serial_show_name = str( html.xpath('//div[@class="section-header header1"]/div/h2/a/text()') [0]).decode('utf-8') car_row_list = html.xpath('//table[@id="compare_sale"]/tbody/tr') brand_serial_car_list = [] for car_row in car_row_list: if 'class' in car_row.attrib and car_row.attrib[ 'class'] == 'table-tit': # 分组表头 car_engine = str( car_row.xpath('normalize-space(th[@class="first-item"])') ).decode('utf-8')