def get_brand(main_brand_list):
    """Collect the sub-brands (and their models) of every main brand.

    For each main brand the "jingxiaoshang" left-tree JSON endpoint is
    queried and every sub-brand found under a ``child`` entry is flattened
    into one dict. The model list is kept for completeness only.

    :param main_brand_list: list of main-brand dicts; the keys read here are
        ``id``, ``name``, ``show``, ``num`` and ``url``.
    :return: list of sub-brand dicts with the keys ``mainbrand``, ``mainid``,
        ``mainshow``, ``main_url0``, ``mainnum``, ``name``, ``url``, ``num``,
        ``show`` and ``model`` (a list of dicts with ``name``, ``url``,
        ``show`` and ``num``).
    """
    brand_list = []
    for main_brand in main_brand_list:
        main_brand_id = main_brand['id']
        main_brand_name = main_brand['name']
        main_brand_show = main_brand['show']
        main_brand_num = main_brand['num']
        main_brand_url = main_brand['url']

        # The URL that is actually requested to obtain the sub-brands.
        url = ('http://api.car.bitauto.com/CarInfo/getlefttreejson.ashx?'
               'tagtype=jingxiaoshang&pagetype=masterbrand&objid=' +
               str(main_brand_id) + '&citycode=beijing%2F&cityid=201')
        data = general_helper.get_json_response(url)

        for mbox in data['brand'].values():
            for mb in mbox:
                if 'child' not in mb:
                    continue
                # A null child list is treated as empty, matching
                # get_brand_serial.
                child = mb['child'] or []
                logger.debug("%s,%s,%s,%s" % (main_brand_name, main_brand_id,
                                              main_brand_show, len(child)))
                for b in child:
                    brand = {
                        'mainbrand': main_brand_name,
                        'mainid': main_brand_id,
                        'mainshow': main_brand_show,
                        'main_url0': main_brand_url,
                        'mainnum': main_brand_num,
                        'name': b['name'].decode('utf-8'),
                        'url': b['url'],
                        'num': b['num'],
                        # Third path segment of the url is the short name.
                        'show': b['url'].split('/')[2],
                    }
                    # Models under the sub-brand; a sub-brand without any
                    # model simply gets an empty list instead of raising.
                    brand['model'] = [
                        {
                            'name': m['name'].decode('utf-8'),  # model name
                            'url': m['url'],  # model url
                            'show': m['url'].split('/')[2],  # short name
                            'num': m['num'],  # dealer count of the model
                        }
                        for m in b.get('child') or []
                    ]
                    brand_list.append(brand)
    return brand_list
def get_brand_serial(main_brand_list):
    """Fetch the brands and serials that belong to each main brand.

    Every returned brand dict carries ``main_brand_id``, ``main_brand_name``,
    ``brand_id``, ``brand_name`` and ``serial`` (a list of dicts with
    ``serial_id`` and ``serial_name``). The numeric ids are the first run of
    digits found in the corresponding ``url`` field.

    :param main_brand_list: list of dicts; the keys read here are
        ``main_brand_id`` and ``main_brand_name``.
    :return: list of brand dicts; brands without a ``child`` key are omitted.
    """
    url_template = 'http://api.car.bitauto.com/CarInfo/getlefttreejson.ashx?tagtype=chexing&pagetype=masterbrand&objid=@main_brand_id'
    collected = []
    for main_brand in main_brand_list:
        tree_url = url_template.replace('@main_brand_id',
                                        str(main_brand['main_brand_id']))
        data = general_helper.get_json_response(tree_url)
        for group in data['brand'].values():
            for entry in group:
                if 'child' not in entry:
                    continue
                logger.debug(main_brand['main_brand_name'])
                for brand_node in entry['child'] or []:
                    # The brand dict is built before the child check, so a
                    # malformed url or name still raises for childless brands.
                    brand_id = int(
                        re.search(r'\d+', str(brand_node['url'])).group())
                    brand = {
                        'main_brand_id': main_brand['main_brand_id'],
                        'main_brand_name': main_brand['main_brand_name'],
                        'brand_id': brand_id,
                        'brand_name': brand_node['name'].decode('utf-8'),
                        'serial': [],
                    }
                    if 'child' not in brand_node:
                        continue
                    for serial_node in brand_node['child'] or []:
                        serial_id = int(
                            re.search(r'\d+', str(serial_node['url'])).group())
                        brand['serial'].append({
                            'serial_id': serial_id,
                            'serial_name': serial_node['name'].decode('utf-8'),
                        })
                    collected.append(brand)
    return collected
def get_all_dealer(brand_list):
    """Walk every brand, resolve the regions it covers and parse each region entry.

    :param brand_list: list of brand dicts; the keys read here are ``name``,
        ``mainbrand``, ``mainid``, ``mainshow``, ``show``, ``num`` and ``url``.
    :return: nothing. The original docstring says the data is stored, but the
        code visible here performs no store.

    NOTE(review): the visible body ends right after parsing ``p['num']``; no
    dealer page is fetched from this function as shown. Confirm whether the
    rest of the loop body was lost.
    """
    # Python 2 only: re-expose setdefaultencoding and switch the process-wide
    # default encoding to UTF-8.
    reload(sys)
    sys.setdefaultencoding("utf-8")
    # NOTE(review): a commented-out Linksql(...) connection call with a
    # hard-coded host, user and password used to sit here. It has been
    # redacted from this comment; the credentials should be rotated.
    conn = None
    cur = None
    for brand in brand_list:  # one iteration per brand
        logger.debug('crawling: %s' % brand['name'])
        mbrandname = brand['mainbrand']
        mbrandid = brand['mainid']
        mbrandshow = brand['mainshow']
        bname = brand['name']
        bshow = brand['show']
        if brand['num'] == 0:  # a count of 0 means the brand has no dealers
            continue
        else:
            logger.info("get dealers of %s, %s, %s" % (mbrandname, bname, brand['num'] or 0))
            burl = general_helper.build_url(main_url, brand['url'])
            bhtml = general_helper.get_response(burl)
            bsoup = BeautifulSoup(bhtml, 'lxml')
            try:
                # For some brands the covered regions cannot be read from the
                # region pop-up layer above the dealer list; that raises.
                plist = get_location(bsoup, 'ul', 'layer-txt-list')
            except Exception, e:
                plist = get_province(main_url, bshow)  # fall back to the second approach
                logger.critical(
                    "%s, %s, %s" % (bname, len(plist), ' this brand don\'t have dealer'))
                # NOTE(review): this re-raises the original exception, so the
                # fallback plist computed above is never used. Confirm whether
                # the raise is intentional.
                raise
            if len(plist) == 0:
                # Even after the fallback, some brands cover no province or
                # municipality, in which case plist is empty.
                continue
            else:
                for p in plist:
                    pname = p['name']
                    purl = p['url']
                    pshow = p['show']
                    pnum = 0
                    try:
                        pnum = int(p['num'])
                    except Exception, e:
                        logger.critical(e.message)
                        raise
                    finally:
                        # NOTE(review): this runs on every path and resets
                        # pnum to 0, discarding the value parsed above.
                        # Confirm intent.
                        pnum = 0
def crawl():
    """Run the car crawl end to end.

    Fetches the main brand list, expands it into brands with their serials,
    then hands that list to ``get_and_insert_car``. Returns nothing.
    """
    logger.debug("getting main brand list")
    main_brands = get_main_brand()

    logger.debug('getting brand serial list')
    brands_with_serials = get_brand_serial(main_brands)

    logger.debug('getting and inserting car list')
    get_and_insert_car(brands_with_serials)
def get_and_insert_car(brand_serial_list):
    """Fetch the cars (trims) of every serial and store them brand by brand.

    For each serial the page
    ``http://car.bitauto.com/tree_chexing/sb_<serial_id>`` is downloaded. The
    serial's spelling and display name are read from the section header, and
    the ``compare_sale`` table is walked row by row. Rows with class
    ``table-tit`` are group headers that carry the engine description for the
    car rows that follow them.

    Each stored record has the keys ``main_brand_id``, ``main_brand_name``,
    ``brand_id``, ``brand_name``, ``serial_id``, ``serial_name``,
    ``serial_spell``, ``serial_show_name``, ``car_id``, ``car_name``,
    ``car_gear``, ``car_engine``, ``car_msrp`` and ``car_sale_year``.

    :param brand_serial_list: list of brand dicts; the keys read here are
        ``main_brand_id``, ``main_brand_name``, ``brand_id``, ``brand_name``
        and ``serial`` (a list of dicts with ``serial_id`` and
        ``serial_name``).
    :return: None. The records of each brand are passed to
        ``insert_car_to_db``.
    """
    serial_url_base = 'http://car.bitauto.com/tree_chexing/sb_@serial_id'
    for brand in brand_serial_list:
        logger.debug('brand: %s' % (brand['brand_name']))
        brand_serial_car_list = []
        for serial in brand['serial']:
            logger.debug('serial: %s' % (serial['serial_name']))
            serial_id = serial['serial_id']
            serial_url = serial_url_base.replace('@serial_id', str(serial_id))
            logger.debug('url: %s' % serial_url)
            content = general_helper.get_response(serial_url)
            html = etree.HTML(content)
            # The href looks like "/<spell>/"; strip the surrounding slashes.
            serial_spell = str(
                html.xpath(
                    '//div[@class="section-header header1"]/div/h2/a/@href')
                [0])[1:-1]
            serial_show_name = str(
                html.xpath(
                    '//div[@class="section-header header1"]/div/h2/a/text()')
                [0]).decode('utf-8')

            # Reset per serial so a table without a leading group header
            # neither raises nor inherits the previous serial's engine.
            car_engine = u''
            for car_row in html.xpath('//table[@id="compare_sale"]/tbody/tr'):
                if car_row.attrib.get('class') == 'table-tit':
                    # Group header row: remember the engine for later rows.
                    car_engine = str(
                        car_row.xpath(
                            'normalize-space(th[@class="first-item"])')
                    ).decode('utf-8')
                    continue

                # Car (trim) row.
                car_id = int(
                    re.search(r'\d+', car_row.attrib['id']).group().strip())
                car_name = str(car_row.xpath('td/a/text()')
                               [0]).strip().decode('utf-8')
                car_gear = str(
                    car_row.xpath('string(td[3])')).strip().decode('utf-8')
                car_msrp_match = re.search(
                    r'(\d+(\.\d+)?)',
                    str(car_row.xpath('string(td[@class="txt-right"]/span)')
                        ).strip())
                # NOTE(review): this is a string when a price is found and
                # the float 0.0 otherwise. Kept as-is because the database
                # column type is not visible here.
                car_msrp = car_msrp_match.group() if car_msrp_match else 0.0
                # The sale year is the leading digits of the car name; names
                # without a leading year yield '' instead of raising.
                year_match = re.search(r'^\d+', car_name)
                car_sale_year = year_match.group() if year_match else ''

                brand_serial_car_list.append({
                    'main_brand_id': brand['main_brand_id'],
                    'main_brand_name': brand['main_brand_name'],
                    'brand_id': brand['brand_id'],
                    'brand_name': brand['brand_name'],
                    'serial_id': serial['serial_id'],
                    'serial_name': serial['serial_name'],
                    'serial_spell': serial_spell,
                    'serial_show_name': serial_show_name,
                    'car_id': car_id,
                    'car_name': car_name,
                    'car_gear': car_gear,
                    'car_engine': car_engine,
                    'car_msrp': car_msrp,
                    'car_sale_year': car_sale_year,
                })
        # Flush once per brand, after all of its serials have been parsed.
        insert_car_to_db(brand_serial_car_list)
def get_dealer(lurl, location):
    """Scrape every dealer listed on one brand/region page into ``dealer_raw``.

    :param lurl: URL of the dealer list page for one brand in one region.
    :param location: dict describing that brand/region; the keys read here
        are ``mainid``, ``mainbrand``, ``mainshow``, ``bname``, ``bshow``,
        ``pname`` and ``pshow``.
    :return: None. One row per dealer is written through ``mysql.insert``.
    :raises Exception: a failure while reading the sales area or inserting
        the row is logged with its context and re-raised unchanged. Other
        parsing errors propagate as raised.
    """
    html = general_helper.get_response(lurl)
    soup = BeautifulSoup(html, 'lxml')
    dealerbox = soup.find('div', 'main-inner-section sm dealer-box')
    dealerlist = dealerbox.find_all('div', 'row dealer-list')
    for dealer in dealerlist:
        inf = dealer.find('div', 'col-xs-6 left')
        name = inf.find('h6', 'title-4s').find('a')  # title link
        dtype = name.find('em').string  # dealer type
        durl = re.findall(r'(?<=href=\").*?(?=\")', str(name))[0]  # dealer url
        dname = re.findall(r'(?<=span>).*?(?=<)',
                           str(name))[0].decode('utf-8')  # dealer name
        dealer_id = int(re.findall(r'(?<=com/)\d+(?=/)', str(durl))[0])
        dpinpai = re.findall(r'(?<=span\>).*?(?=\<)',
                             str(inf.find('p', 'brand')))[0].decode(
                                 'utf-8')  # main brands sold by the dealer

        # Optional block that is present only while a price cut is running.
        promote = inf.find('p', 'promote')
        if promote is not None:
            promote_link = promote.find('a')
            # Collapse whitespace runs in the title. str.replace treats its
            # argument literally, so a regex substitution is required.
            dpromotetitle = re.sub(r'(?u)\s+', u' ',
                                   promote_link.string.decode('utf-8'))
            dpromoteurl = re.findall(r'(?<=href=\").*?(?=\")',
                                     str(promote_link))[0].decode('utf-8')
            dpromoteday = promote.find('span',
                                       'time').string.decode('utf-8')
        else:
            dpromotetitle = None  # title of the running price cut
            dpromoteurl = None  # url of the price-cut news
            dpromoteday = None  # remaining days

        add = inf.find('p', 'add').find_all(
            'span', attrs={'title': True})[0].attrs['title'].replace(
                u'\xa0', u'')  # dealer address
        dtel = get_dealer_telephone(dealer_id)  # dealer phone

        try:
            dsalearea = inf.find('p', 'tel').find('span',
                                                  'sales-area').string
        except Exception:
            logger.critical(
                "sales area not found: url=%r, dealer=%s, province=%s, "
                "main brand=%s, brand=%s, tel block=%r" %
                (lurl, dname, location['pname'], location['mainbrand'],
                 location['bname'], inf.find('p', 'tel')))
            raise

        # "<city> <district>" text; the lookup is identical for both fields,
        # so it is done once.
        region_parts = dealer.find('div',
                                   'col-xs-7 middle').p.string.split(' ')
        dcity = region_parts[0]  # city
        dlocation = region_parts[1].replace(' ', '')  # district

        now_time = general_helper.get_now()
        logger.debug("%s,%s,%s,%s,%s" % (location['pname'], dcity, dlocation,
                                         dealer_id, dname))
        sql = u"insert into dealer_raw(" \
              u"`main_brand_id`,`main_brand_name`,`main_brand_show`,`brand_name`,`brand_show`,`province_name`,`province_show`," \
              u"`city_name`,`location_name`,`dealer_type`,`dealer_url`,`dealer_name`,`dealer_id`,`dealer_brand`,`dealer_pro_title`," \
              u"`dealer_pro_url`,`dealer_pro_day`,`dealer_add`,`dealer_tel`,`sale_area`,`url`,`create_time`" \
              u") values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        params = (location['mainid'], location['mainbrand'],
                  location['mainshow'], location['bname'], location['bshow'],
                  location['pname'], location['pshow'], dcity, dlocation,
                  dtype, durl.decode('utf-8'), dname, dealer_id, dpinpai,
                  dpromotetitle or '', dpromoteurl or '', dpromoteday or '',
                  add, dtel or '', dsalearea, lurl.decode('utf-8'), now_time)
        try:
            mysql.insert(sql, params)
        except Exception as e:
            # %r keeps the diagnostic safe for any mix of byte and unicode
            # strings in the row.
            logger.critical(
                "insert into dealer_raw failed: %r; url=%r; sql=%s; "
                "params=%r" % (e, lurl, sql, params))
            raise