def crawl():
    """Run one complete dealer crawl and record it in ``crawl_log``.

    Steps, in order:

    1. Fetch all provinces (``get_all_province``).
    2. Fetch main brands and brands (``get_main_brand`` / ``get_brand``).
    3. Insert a ``crawl_log`` row with ``complete_success = 0``.
    4. Split the brand list into four slices and crawl each slice in its own
       process via ``get_all_dealer``.
    5. Update the newest ``crawl_log`` row for this project with the end time
       and the final success flag.

    :return: None
    """
    # One name for both the insert and the update so they always target the
    # same crawl_log rows.
    crawl_project = u'易车商家抓取'
    worker_count = 4

    # Fetch all provinces.
    logger.info("start: get all provinces.")
    get_all_province()
    logger.info("finish: get all provinces.")

    # Fetch main brands and brands.
    logger.info("start: get main brands.")
    main_brand_list = get_main_brand()
    logger.info("finish: get main brands.")
    logger.info("start: get brands.")
    brand_list = get_brand(main_brand_list)
    logger.info("finish: get brands.")

    start_time = general_helper.get_now()
    sql = u"insert into crawl_log (project_name,complete_success,start_time) values (%s, %s, %s)"
    mysql.insert(sql, (crawl_project, 0, start_time))

    # Slice boundaries: 0, q, 2q, 3q, len -- the last slice absorbs the
    # remainder of the integer division.
    total = len(brand_list)
    quarter = total // worker_count
    bounds = [quarter * i for i in range(worker_count)] + [total]
    slices = [brand_list[bounds[i]:bounds[i + 1]] for i in range(worker_count)]
    logger.debug("brand slice offsets: %s", bounds[1:worker_count])
    logger.debug("brand slice sizes: %s", [len(part) for part in slices])

    workers = [
        multiprocessing.Process(target=get_all_dealer, args=(part, ))
        for part in slices
    ]
    for worker in workers:
        worker.start()
    # Wait for every child process before finishing the log row.
    for worker in workers:
        worker.join()

    # A child that died from an uncaught exception or a signal has a
    # non-zero exit code; do not report such a run as successful.
    success = 1 if all(worker.exitcode == 0 for worker in workers) else 0
    end_time = general_helper.get_now()
    sql = u"update crawl_log set complete_success = %s, end_time = %s where id = (" \
          u"select id from ( " \
          u"select max(id) as id from crawl_log as a where project_name= %s ) as s)"
    mysql.update(sql, (success, end_time, crawl_project))
def insert_car_to_db(brand_serial_car_list):
    """Bulk-insert car-model records into ``car_data.car``.

    :param brand_serial_car_list: sequence of dict-like car records. Each
        record is read by the keys ``main_brand_id``, ``main_brand_name``,
        ``brand_id``, ``brand_name``, ``serial_id``, ``serial_name``,
        ``serial_spell``, ``serial_show_name``, ``car_id``, ``car_name``,
        ``car_gear``, ``car_engine``, ``car_msrp`` and ``car_sale_year``.
    :return: None
    """
    # Fields copied verbatim from each record, in INSERT column order.
    copied_fields = (
        'main_brand_id', 'main_brand_name', 'brand_id', 'brand_name',
        'serial_id', 'serial_name', 'serial_spell', 'serial_show_name',
        'car_id', 'car_name', 'car_gear', 'car_engine',
    )
    statement = (
        'INSERT INTO car_data.car ( main_brand_id, main_brand_name, brand_id, brand_name, '
        'serial_id, serial_name, serial_spell, serial_show_name, car_id, car_name, car_gear, '
        'car_engine, car_msrp, car_sale_year, create_time) '
        'VALUES ( %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);'
    )
    # One timestamp shared by every row of this batch.
    created_at = general_helper.get_now()
    rows = [
        tuple(record[field] for field in copied_fields) + (
            record['car_msrp'] or 0.0,  # a falsy price is stored as 0.0
            record['car_sale_year'],
            created_at,
        )
        for record in brand_serial_car_list
    ]
    mysql.insert_batch(statement, rows)
def get_all_province():
    """Fetch the region list from the Beijing Audi dealer page and store it.

    The province/municipality list shown on the Beijing site is complete, so
    every region's name is read from there. Each region carries a different
    set of brands. Originally the regions covered by a brand were read from
    the region pop-up above that brand's dealer list, but some brands do not
    offer it; for those brands all regions are fetched first and then joined
    with the brand to build URLs. The two approaches complement each other.
    Every region found is also inserted into the ``province`` table.

    :return: list of dicts, one per ``<li>`` entry, with the keys ``url``,
        ``name``, ``show`` (second ``/``-separated segment of ``url``) and
        ``num`` (the text between parentheses in the entry).
    """
    bhtml = general_helper.get_response(
        'http://dealer.bitauto.com/beijing/audi/')
    bsoup = BeautifulSoup(bhtml, 'lxml')
    provincelist = bsoup.find('ul', 'layer-txt-list').find_all('li')

    # Adjacent literals instead of a backslash inside the string, so no
    # source indentation leaks into the SQL text.
    insert_sql = (u"insert into province (`name`,`show`,`url`,`create_time`) "
                  u"values ( %s,%s,%s,%s)")

    plist = []
    for province in provincelist:
        # Serialise the tag once; all three regexes scan the same markup.
        item_html = str(province)
        url = re.findall(r'(?<=href=\").*?(?=\">)', item_html)[0].decode('utf-8')
        name = re.findall(r'(?<=0\">).*?(?=<)', item_html)[0].decode('utf-8')
        p = {
            'url': url,
            'name': name,
            # ``url`` is already unicode, so its path segment needs no decode.
            'show': url.split('/')[1],
            'num': re.findall(r'(?<=\().*?(?=\))', item_html)[0],
        }
        now_time = general_helper.get_now()
        mysql.insert(insert_sql, (p['name'], p['show'], p['url'], now_time))
        plist.append(p)
    return plist
# -*- coding: utf-8 -*-
# ----------------------------------------------------------------------------------------------------------------------
# file: province_test
# author: eva
# date: 2018/1/12
# version:
# description: manual check that inserts one row into `province` and reads the table back
# ----------------------------------------------------------------------------------------------------------------------
from utils import general_helper
from utils.commons import mysql

if __name__ == '__main__':
    # 1. insert
    now_time = general_helper.get_now()
    # Adjacent literals instead of a backslash inside the string, so no
    # source indentation leaks into the SQL text.
    sql = (u"insert into province (`name`,`show`,`url`,`create_time`) "
           u"values ( %s,%s,%s, %s)")
    params = ('北京', 'beijing', 'http://beijing.bitauto.com', now_time)
    mysql.insert(sql, params)

    # 2. select (takes no parameters)
    sql = u"select distinct `name`,`show` from province"
    records = mysql.select(sql)
    print(len(records))
def get_dealer(lurl, location):
    """Scrape the dealers listed on one brand/region page and store them.

    Downloads ``lurl``, walks every ``row dealer-list`` entry inside the
    dealer box and inserts one row per dealer into ``dealer_raw``.

    :param lurl: URL of the brand + region dealer listing. It is stored via
        ``lurl.decode('utf-8')``, so a byte string is expected.
    :param location: dict describing the brand/region being crawled; the
        keys read here are ``mainid``, ``mainbrand``, ``mainshow``,
        ``bname``, ``bshow``, ``pname`` and ``pshow``.
    :return: None
    :raises Exception: a failure while reading the sales area or while
        inserting the row is logged with its context and re-raised; other
        parsing errors propagate as they occur.
    """
    html = general_helper.get_response(lurl)
    soup = BeautifulSoup(html, 'lxml')
    dealerbox = soup.find('div', 'main-inner-section sm dealer-box')
    dealerlist = dealerbox.find_all('div', 'row dealer-list')

    insert_sql = (
        u"insert into dealer_raw("
        u"`main_brand_id`,`main_brand_name`,`main_brand_show`,`brand_name`,`brand_show`,`province_name`,`province_show`,"
        u"`city_name`,`location_name`,`dealer_type`,`dealer_url`,`dealer_name`,`dealer_id`,`dealer_brand`,`dealer_pro_title`,"
        u"`dealer_pro_url`,`dealer_pro_day`,`dealer_add`,`dealer_tel`,`sale_area`,`url`,`create_time`"
        u") values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
    )

    for dealer in dealerlist:
        inf = dealer.find('div', 'col-xs-6 left')
        name = inf.find('h6', 'title-4s').find('a')  # title link
        dtype = name.find('em').string  # dealer type
        name_html = str(name)
        durl = re.findall(r'(?<=href=\").*?(?=\")', name_html)[0]  # dealer url
        dname = re.findall(r'(?<=span>).*?(?=<)',
                           name_html)[0].decode('utf-8')  # dealer name
        dealer_id = int(re.findall(r'(?<=com/)\d+(?=/)', str(durl))[0])  # dealer id
        dpinpai = re.findall(
            r'(?<=span\>).*?(?=\<)',
            str(inf.find('p', 'brand')))[0].decode('utf-8')  # main brand sold

        # Promotion block is optional: title, news url, remaining days.
        promote = inf.find('p', 'promote')
        if promote is not None:
            promote_link = promote.find('a')
            # ``.string`` is already unicode, so it must not be decoded again;
            # whitespace runs are collapsed with a real regex substitution.
            title_text = promote_link.string
            if title_text is not None:
                dpromotetitle = re.sub(r'\s+', u' ', title_text)
            else:
                dpromotetitle = None
            dpromoteurl = re.findall(
                r'(?<=href=\").*?(?=\")',
                str(promote_link))[0].decode('utf-8')
            dpromoteday = promote.find('span', 'time').string
        else:
            dpromotetitle = None
            dpromoteurl = None
            dpromoteday = None

        add = inf.find('p', 'add').find_all(
            'span', attrs={'title': True})[0].attrs['title'].replace(
                u'\xa0', u'')  # dealer address
        dtel = get_dealer_telephone(dealer_id)  # dealer phone

        tel_node = inf.find('p', 'tel')
        try:
            dsalearea = tel_node.find('span', 'sales-area').string  # sales area
        except Exception:
            logger.error(
                "failed to read sales area: url=%s dealer=%s province=%s "
                "main_brand=%s brand=%s tel_node=%s",
                lurl, dname, location['pname'], location['mainbrand'],
                location['bname'], tel_node)
            raise

        # Text looks like "<city> <district>"; parse the node once.
        middle_parts = dealer.find('div', 'col-xs-7 middle').p.string.split(' ')
        dcity = middle_parts[0]  # city
        dlocation = middle_parts[1].replace(' ', '')  # district

        now_time = general_helper.get_now()
        logger.debug("%s,%s,%s,%s,%s", location['pname'], dcity, dlocation,
                     dealer_id, dname)

        params = (location['mainid'], location['mainbrand'],
                  location['mainshow'], location['bname'], location['bshow'],
                  location['pname'], location['pshow'], dcity, dlocation,
                  dtype, durl.decode('utf-8'), dname, dealer_id, dpinpai,
                  dpromotetitle or '', dpromoteurl or '', dpromoteday or '',
                  add, dtel or '', dsalearea, lurl.decode('utf-8'), now_time)
        try:
            mysql.insert(insert_sql, params)
        except Exception as e:
            # Log through the logger rather than print, so byte strings and
            # unicode values are not mixed on stdout.
            logger.error("dealer insert failed: %s", e)
            logger.error("sql: %s", insert_sql)
            logger.error(
                "location: %s %s %s %s %s %s %s",
                location['mainid'], location['mainbrand'],
                location['mainshow'], location['bname'], location['bshow'],
                location['pname'], location['pshow'])
            logger.error(
                "dealer: city=%s district=%s type=%s url=%s name=%s id=%s "
                "brand=%s", dcity, dlocation, dtype, durl, dname, dealer_id,
                dpinpai)
            logger.error(
                "promotion: %s %s %s | address=%s tel=%s sale_area=%s | "
                "page=%s", dpromotetitle, dpromoteurl, dpromoteday, add, dtel,
                dsalearea, lurl)
            raise
# -*- coding: utf-8 -*-
# ----------------------------------------------------------------------------------------------------------------------
# file: mysqldb_helper_test
# author: eva
# date: 2018/1/12
# version:
# description: manual check that inserts a `crawl_log` row and then updates the newest one
# ----------------------------------------------------------------------------------------------------------------------
from utils import general_helper
from utils.commons import mysql

if __name__ == '__main__':
    # Single definition so the insert and the update cannot drift apart.
    project_name = u'易车商家抓取'

    # 1. insert a log row marked as not yet successful
    start_time = general_helper.get_now()
    sql = u"insert into crawl_log (project_name,complete_success,start_time) VALUES (%s, %s, %s)"
    mysql.insert(sql, (project_name, 0, start_time))

    # 2. update the newest log row of this project; the project name is
    #    passed as a parameter instead of being embedded in the SQL text
    end_time = general_helper.get_now()
    sql = (u"update crawl_log set complete_success = %s, end_time = %s where id = ("
           u"select id from ( "
           u"select max(id) as id from crawl_log as a where project_name = %s) as s)")
    mysql.update(sql, (1, end_time, project_name))