def get_rent_perregion(district):
    # Scrape every rental-listing page of *district* (this variant uses the
    # Shanghai-style "/d<N>/" pagination and an older list markup) and
    # bulk-upsert the parsed rows into model.Rentinfo, one page at a time.
    url = BASE_URL + u"zufang/%s/" % district
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')
    if check_block(soup):
        # The crawler has been blocked by the site; give up silently.
        return
    total_pages = misc.get_sh_total_pages(url)
    if total_pages == None:
        row = model.Rentinfo.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)
    for page in range(total_pages):
        if page > 0:
            # Page 0 reuses the soup fetched above; later pages hit /d<N>/.
            url_page = BASE_URL + u"zufang/%s/d%d/" % (district, page)
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')
        i = 0
        log_progress("GetRentByRegionlist", district, page + 1, total_pages)
        data_source = []
        nameList = soup.findAll("div", {"class": "info-panel"})
        for name in nameList:
            i = i + 1
            info_dict = {}
            try:
                info = name.find("a", {"name": "selectDetail"})
                info_dict.update({u'title': info.get('title')})
                info_dict.update({u'link': info.get('href')})
                info_dict.update({u'houseID': info.get('key')})
                # The "where" block carries three spans: region, zone, meters.
                where = name.find("div", {"class": "where"})
                wheres = where.find_all("span")
                info_dict.update({u'region': wheres[0].get_text().strip()})
                info_dict.update({u'zone': wheres[1].get_text().strip()})
                info_dict.update({u'meters': wheres[2].get_text().strip()})
                other = name.find("div", {"class": "con"})
                # Collapse all internal whitespace in the free-text summary.
                info_dict.update({u'other': "".join(other.get_text().split())})
                # Not available in this page layout; stored as empty strings.
                info_dict.update({u'subway': ""})
                info_dict.update({u'decoration': ""})
                info_dict.update({u'heating': ""})
                price = name.find("div", {"class": "price"})
                info_dict.update(
                    {u'price': int(price.span.get_text().strip())})
                pricepre = name.find("div", {"class": "price-pre"})
                info_dict.update(
                    {u'pricepre': "".join(pricepre.get_text().split())})
            except:
                # Any listing whose markup does not match is skipped wholesale.
                continue
            # Rentinfo insert into mysql
            data_source.append(info_dict)
            # model.Rentinfo.insert(**info_dict).upsert().execute()
        with model.database.atomic():
            model.Rentinfo.insert_many(data_source).upsert().execute()
        time.sleep(1)
def get_rent_perregion(city, district):
    """Scrape all rental listings for one *district* of *city* (new
    content__list markup) and bulk-upsert them into model.Rentinfo.

    Raises RuntimeError when the page count cannot be determined; returns
    None early when the crawler is blocked.
    """
    baseUrl = u"http://%s.lianjia.com/" % (city)
    url = baseUrl + u"zufang/%s/" % district
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')
    if check_block(soup):
        return
    total_pages = misc.get_total_pages(url)
    print(total_pages)
    if total_pages is None:
        row = model.Rentinfo.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)
    #todo
    for page in range(total_pages):
        if page > 0:
            url_page = baseUrl + u"zufang/%s/pg%d/" % (district, page)
            print(url_page)
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')
        i = 0
        log_progress("GetRentByRegionlist", district, page + 1, total_pages)
        data_source = []
        for ultag in soup.findAll("div", {"class": "content__list"}):
            for name in ultag.find_all('div'):
                i = i + 1
                info_dict = {}
                try:
                    housetitle = name.find(
                        "p", {"class": "content__list--item--title"})
                    regionZone = name.find(
                        "p", {"class": "content__list--item--des"})
                    region = regionZone.a.get_text().strip()
                    zone = regionZone.a.next_sibling.next_sibling.get_text(
                    ).strip()
                    # Most fields come from the detail page itself.
                    info_dict = get_detail_info(city, housetitle.a.get("href"))
                    info_dict.update({u'region': region})
                    info_dict.update({u'zone': zone})
                except:
                    # BUGFIX: this was a Python-2 `print` statement mixed into
                    # a function that otherwise uses the function form; use
                    # print() so the file also parses under Python 3.
                    print('traceback.format_exc():\n%s' %
                          traceback.format_exc())
                    continue
                # Rentinfo insert into mysql
                data_source.append(info_dict)
                # model.Rentinfo.insert(**info_dict).upsert().execute()
        with model.database.atomic():
            if data_source:
                model.Rentinfo.insert_many(data_source).upsert().execute()
        time.sleep(1)
def get_community_perregion(regionname=u'xicheng'):
    # Scrape every community (xiaoqu) listing page of *regionname* and
    # upsert each parsed community into model.Community one row at a time.
    url = BASE_URL + u"xiaoqu/" + regionname + "/"
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')
    if check_block(soup):
        return
    total_pages = misc.get_total_pages(url)
    if total_pages == None:
        row = model.Community.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)
    for page in range(total_pages):
        if page > 0:
            # NOTE(review): paginates with pg<page+1> while sibling scrapers
            # use pg<page> — confirm which indexing the site expects.
            url_page = BASE_URL + u"xiaoqu/" + regionname + "/pg%d/" % (
                page + 1, )
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')
        nameList = soup.findAll("li", {"class": "clear"})
        i = 0
        for name in nameList:  # Per house loop
            i = i + 1
            info_dict = {}
            try:
                communitytitle = name.find("div", {"class": "title"})
                info_dict.update(
                    {u'title': communitytitle.get_text().strip('\n')})
                info_dict.update({u'link': communitytitle.a.get('href')})
                district = name.find("a", {"class": "district"})
                info_dict.update({u'district': district.get_text()})
                bizcircle = name.find("a", {"class": "bizcircle"})
                info_dict.update({u'bizcircle': bizcircle.get_text()})
                tagList = name.find("div", {"class": "tagList"})
                info_dict.update({u'tagList': tagList.get_text().strip('\n')})
                onsale = name.find("a", {"class": "totalSellCount"})
                info_dict.update(
                    {u'onsale': onsale.span.get_text().strip('\n')})
                info_dict.update({u'id': name.get('data-housecode')})
            except:
                # Skip entries whose markup does not match the expected layout.
                continue
            # communityinfo insert into mysql
            model.Community.insert(**info_dict).upsert().execute()
        time.sleep(1)
def get_house_by_url(url):
    """Fetch a house-detail page and extract selected transaction fields.

    Returns a dict mapping english column names to the scraped text, or
    None when the crawler has been blocked.
    """
    page_html = misc.get_source_code(url)
    page = BeautifulSoup(page_html, 'lxml')
    if check_block(page):
        return
    intro = page.find(id="introduction")
    labels = intro.find_all(has_span_with_class_label)
    # Chinese page label -> english column name for the fields we keep.
    field_names = {
        u'配备电梯': 'elevator',
        u'产权所属': 'propertytype',
        u'建筑结构': 'buildingstructure',
        u'建筑类型': 'buildingtype',
        u'梯户比例': 'elevatorratio',
        u'交易权属': 'transactionownership'
    }
    result = {}
    for label in labels:
        spans = label.find_all("span")
        if not spans:
            continue
        field = spans[0].string.strip()
        # Two spans: the value is the second span's text; a single span:
        # the value is the whole label's text.
        value = spans[1].string.strip() if len(spans) > 1 else label.get_text()
        if field in field_names:
            result[field_names[field]] = value
    return result
def getSeltInfoLastMonth(url):
    # Scrape one chengjiao (deal) detail page: price, deal date, listing
    # statistics and selected building attributes, returned as a flat dict.
    data_source = []  # NOTE(review): unused — the dict is returned directly
    info_dict = {}
    html = misc.get_source_code(url)
    soup = BeautifulSoup(html, 'lxml')
    # The trailing unit character of the total price is sliced off.
    totalPrice = soup.find('span', class_='record_price').get_text()[:-1]
    unitPrice = soup.find(
        'p', class_='record_detail').get_text().split(',')[0][2:-3]
    dealdate = soup.find('p', class_='record_detail').get_text().split(',')[2]
    info_dict.update({'totalPrice': totalPrice})
    info_dict.update({'unitPrice': unitPrice})
    info_dict.update({'dealdate': dealdate})
    '''
    msg = soup.find('div',class_='msg').get_text()
    msgSplit = re.split(r'(\W+)',msg)
    guapaiPrince = msgSplit[0][:-4]
    chengjiaoZhouqi = msgSplit[4][:-4]
    adjustPriceCount = msgSplit[8][:-2]
    daikanCount = msgSplit[12][:-2]
    guanzhuCount = msgSplit[16][:-2]
    lookCount = msgSplit[20][:-2]
    '''
    # Positional <label> nodes inside the .msg box; indexes are fixed by the
    # page layout (1=cycle, 3=visits, 4=followers, 5=page views).
    msgSplit = soup.select(
        'body > section.wrapper > div.overview > div.info.fr > div.msg > span > label'
    )
    #guapaiPrince = msgSplit[0].get_text()
    chengjiaoZhouqi = msgSplit[1].get_text()
    #adjustPriceCount = msgSplit[2].get_text()
    daikanCount = msgSplit[3].get_text()
    guanzhuCount = msgSplit[4].get_text()
    lookCount = msgSplit[5].get_text()
    #info_dict.update({'guapaiPrice':guapaiPrince})
    info_dict.update({'transactionCycle': chengjiaoZhouqi})
    #info_dict.update({'adjustPriceCount':adjustPriceCount})
    info_dict.update({'numberOfVisits': daikanCount})
    info_dict.update({'followers': guanzhuCount})
    info_dict.update({'pageView': lookCount})
    '''
    baseInfo = soup.find('div', class_='content').get_text().split()
    buildYears = baseInfo[8][4:]
    warmStyle = baseInfo[11][4:]
    propertyRight = baseInfo[13][4:]
    '''
    # Fixed <li> positions in the base-info list; the first four characters
    # of each entry (the Chinese label) are sliced off.
    baseInfo = soup.find('div', class_='content').findAll('li')
    buildYears = baseInfo[7].get_text()[4:].strip()
    warmStyle = baseInfo[10].get_text()[4:].strip()
    propertyRight = baseInfo[12].get_text()[4:].strip()
    info_dict.update({'buildyears': buildYears})
    info_dict.update({'warmStyle': warmStyle})
    info_dict.update({'propertyRight': propertyRight})
    #data_source.append(info_dict)
    return info_dict
def get_communityinfo_by_url(url):
    """Scrape the xiaoquInfoItem key/value rows of a community page.

    Returns a dict of recognized fields (possibly empty), an empty dict
    when the crawler is blocked, or None if fetching/parsing raises.
    """
    try:
        source_code = misc.get_source_code(url)
        soup = BeautifulSoup(source_code, 'lxml')
        res = {}
        if check_block(soup):
            return res
        communityinfos = soup.findAll("div", {"class": "xiaoquInfoItem"})
        # FIX: hoisted out of the loop — the label mapping is invariant,
        # there is no reason to rebuild the dict on every row.
        key_type = {
            u"建筑年代": u'year',
            u"建筑类型": u'housetype',
            u"物业费用": u'cost',
            u"物业公司": u'service',
            u"开发商": u'company',
            u"楼栋总数": u'building_num',
            u"房屋总数": u'house_num',
        }
        for info in communityinfos:
            try:
                key = info.find("span", {"xiaoquInfoLabel"})
                value = info.find("span", {"xiaoquInfoContent"})
                key_info = key_type[key.get_text().strip()]
                value_info = value.get_text().strip()
                res.update({key_info: value_info})
            except Exception as e:
                # Unrecognized labels (e.g. nearby stores) are intentionally
                # skipped; no logging needed.
                continue
        return res
    except Exception as e:
        print(e, traceback.print_exc())
def get_communityinfo_by_url(url):
    """Scrape the xiaoquInfoItem key/value rows of a community page into a
    dict of recognized fields; returns None when the crawler is blocked."""
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')
    if check_block(soup):
        return
    communityinfos = soup.findAll("div", {"class": "xiaoquInfoItem"})
    res = {}
    # FIX: hoisted out of the loop — the label mapping is invariant, there
    # is no reason to rebuild the dict on every row.
    key_type = {
        u"建筑年代": u'year',
        u"建筑类型": u'housetype',
        u"物业费用": u'cost',
        u"物业公司": u'service',
        u"开发商": u'company',
        u"楼栋总数": u'building_num',
        u"房屋总数": u'house_num',
    }
    for info in communityinfos:
        try:
            key = info.find("span", {"xiaoquInfoLabel"})
            value = info.find("span", {"xiaoquInfoContent"})
            key_info = key_type[key.get_text().strip()]
            value_info = value.get_text().strip()
            res.update({key_info: value_info})
        except:
            # Unrecognized labels are skipped on purpose.
            continue
    return res
def get_communityinfo_by_url(url):
    """Scrape a community's info panel (col-2 layout) into a dict with the
    keys housetype/year/cost/service/company; None when blocked."""
    page_html = misc.get_source_code(BASE_URL + url)
    soup = BeautifulSoup(page_html, 'lxml')
    if check_block(soup):
        return
    details = {}
    for panel in soup.findAll("div", {"class": "col-2 clearfix"}):
        try:
            rows = panel.findAll("li")
            # The first three rows carry their value in a span.other element;
            # the last two keep it as the span's direct text node.
            housetype_el = rows[0].find("span", {"class": "other"})
            year_el = rows[1].find("span", {"class": "other"})
            cost_el = rows[2].find("span", {"class": "other"})
            service_txt = rows[3].span.find(text=True, recursive=False)
            company_txt = rows[4].span.find(text=True, recursive=False)
            details['housetype'] = housetype_el.get_text().strip()
            details['year'] = year_el.get_text().strip()
            details['cost'] = cost_el.get_text().strip()
            details['service'] = service_txt.strip()
            details['company'] = company_txt.strip()
        except:
            # A panel with unexpected markup is skipped.
            continue
    return details
def GetSellByCommunitylist():
    # For every community listed in community_id.txt ("<code> <name>" per
    # line) scrape all sold-listing pages on bj.5i5j.com and bulk-insert
    # the parsed rows into model.Sellinfo.
    with open('community_id.txt') as f:
        for line in f.readlines():
            data_source = []
            code = line.split(' ')[0]
            communityinfo = line.split(' ')[1]
            pages = get_totalpage("https://bj.5i5j.com/sold/%s" % code)
            for page in range(1, pages + 1):
                source_code = misc.get_source_code(
                    "https://bj.5i5j.com/sold/%s/n%d/" % (code, page))
                soup = BeautifulSoup(source_code, 'lxml')
                content = soup.find('ul', class_="pList zu")
                try:
                    lists = content.find_all('li')
                except:
                    # content is None when the page carries no listing block.
                    continue
                for each in lists:
                    info_dict = {}
                    sTit = each.find("p", {"class": "sTit"})
                    title = sTit.strong.get_text().strip()
                    # The community name is the first token of the title.
                    community = title.split(' ')[0]
                    listCon = each.find("div", {"class": "listCon"})
                    plist = listCon.find_all("p")
                    # Second <p>: "housetype·square·direction" joined by '·'.
                    housetype = plist[1].get_text().strip().split(u'·')[0]
                    square = plist[1].get_text().strip().split(u'·')[1]
                    direction = plist[1].get_text().strip().split(u'·')[2]
                    dealdate = plist[2].get_text().strip().split(u':')[1]
                    jiage = each.find("div", {"class": "jiage"})
                    totalPrice = jiage.strong.get_text().strip()
                    unitPrice = find_between_r(jiage.p.get_text().strip(),
                                               u'价', u'元')
                    # Fields this source does not provide are filled with
                    # fixed placeholder strings.
                    source = u"我爱我家"
                    status = u"暂无信息"
                    floor = u"暂无信息"
                    years = u"暂无信息"
                    link = "https://bj.5i5j.com%s" % each.a.get("href")
                    houseID = "5i5j%s" % find_between_r(
                        each.a.get("href"), '/', '.')
                    info_dict.update({u'title': title})
                    info_dict.update({u'houseID': houseID})
                    info_dict.update({u'link': link})
                    info_dict.update({u'community': community})
                    info_dict.update({u'years': years})
                    info_dict.update({u'housetype': housetype})
                    info_dict.update({u'square': square})
                    info_dict.update({u'direction': direction})
                    info_dict.update({u'floor': floor})
                    info_dict.update({u'status': status})
                    info_dict.update({u'source': source})
                    info_dict.update({u'totalPrice': totalPrice})
                    info_dict.update({u'unitPrice': unitPrice})
                    info_dict.update({u'dealdate': dealdate})
                    data_source.append(info_dict)
            with model.database.atomic():
                try:
                    model.Sellinfo.insert_many(data_source).upsert().execute()
                except:
                    # Best-effort insert; DB errors for this community are
                    # swallowed and the next community is processed.
                    pass
            logging.info("%s finish" % communityinfo)
            time.sleep(1)
def get_totalpage(url):
    """Return the number of result pages for a 5i5j listing URL (>= 1).

    Falls back to 1 when no pager widget is present (single-page results).
    """
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')
    pager = soup.find("div", {"class": "pageSty rf"})
    # FIX: compare against None with `is`, not `==` (identity, not equality).
    if pager is None:
        return 1
    links = pager.find_all("a")
    # The second anchor in the pager holds the last page number.
    return int(links[1].get_text().strip())
def get_district_of_city(city):
    """Return the district slugs found on a city's chengjiao page, or None
    when the crawler has been blocked."""
    logging.info("Get District Infomation")
    listing_url = "https://%s.lianjia.com/chengjiao/" % (city)
    page_html = misc.get_source_code(listing_url)
    soup = BeautifulSoup(page_html, 'lxml')
    if check_block(soup):
        return
    nav = soup.find_all("div", {"data-role": "ershoufang"})[0]
    # Each anchor's href looks like /<section>/<district>/ — keep the slug.
    districts = [
        anchor.get("href").split('/')[2] for anchor in nav.find_all("a")
    ]
    print(districts)
    return districts
def getRentInfoFromCommunity(search_community, rid, hid):
    # Pull the "rented in the same community" records exposed by the
    # housestat JSON endpoint and bulk-upsert them into model.Rentinfo.
    # *search_community* is the list of community ids (rid) already visited;
    # it is updated and returned so callers can thread it between calls.
    if rid in search_community:
        return search_community
    url = 'https://bj.lianjia.com/zufang/housestat?hid=%s&rid=%s' % (hid, rid)
    try:
        source_code = misc.get_source_code(url)
        json_obj = json.loads(source_code)
        rentData = json_obj['data']['resblockSold']
        data_source = []
        logging.info("Progress: %s: %s" % ('getRentInfoFromCommunity', url))
        for rentInfo in rentData:
            info_dict = {}
            info_dict.update({
                u'title': rentInfo['resblockName'] + u' ' + rentInfo['title']
            })
            info_dict.update({u'link': rentInfo['house_url']})
            info_dict.update({u'houseID': rentInfo['houseId']})
            info_dict.update({u'regionid': rid})
            info_dict.update({u'region': rentInfo['resblockName']})
            info_dict.update({u'zone': rentInfo['title']})
            info_dict.update({u'meters': rentInfo['area']})
            # floor/totalFloor, orientation and decoration joined into one
            # display string.
            info_dict.update({
                u'other':
                rentInfo['floor'] + '/' + rentInfo['totalFloor'] + u'层 ' +
                rentInfo['orientation'] + ' ' + rentInfo['decoration']
            })
            info_dict.update({u'subway': ''})
            info_dict.update({u'decoration': u'同小区成交记录'})
            info_dict.update({u'heating': ''})
            info_dict.update({u'price': rentInfo['price']})
            info_dict.update({u'pricepre': rentInfo['transDate'] + u' 成交'})
            data_source.append(info_dict)
        with model.database.atomic():
            model.Rentinfo.insert_many(data_source).upsert().execute()
        time.sleep(1)
    except Exception as e:
        # A failed fetch/parse/insert is logged; the rid is still marked
        # visited below so it is not retried.
        logging.error(e)
    search_community.append(rid)
    return search_community
def get_house_detail(url_page):
    """Parse the #introduction block of a house page into a dict keyed by
    the translated (english) label names; None when blocked."""
    details = {}
    page_html = misc.get_source_code(url_page)
    soup = BeautifulSoup(page_html, 'lxml')
    if check_block(soup):
        return
    intro = soup.find(id='introduction')
    for section in intro.findAll('ul'):
        for row in section.find_all('li'):
            label = row.find_all('span')[0]
            # translate_1 maps the Chinese label to its english column name.
            column = translate_1[label.text]
            # print(column, column in translate_1)
            # Everything after the label text is the value.
            details[column] = row.text[len(label.text):].strip().encode("utf-8")
    # print(details)
    return details
def get_subregion_of_city(city):
    """Return the sorted sub-district slugs of every district of *city*,
    or None when the crawler is blocked mid-run."""
    districts = get_district_of_city(city)
    subregions = []
    for district in districts:
        logging.info("Get Sub-District Infomation %s" % (district))
        page_url = "https://%s.lianjia.com/xiaoqu/%s/" % (city, district)
        page_html = misc.get_source_code(page_url)
        soup = BeautifulSoup(page_html, 'lxml')
        if check_block(soup):
            return
        nav = soup.find_all("div", {"data-role": "ershoufang"})[0]
        for anchor in nav.find_all("a"):
            slug = anchor.get("href").split('/')[2]
            # District-level links appear in the same nav; keep only the
            # slugs that are not districts themselves.
            if slug not in districts:
                subregions.append(slug)
    print(subregions)
    return sorted(subregions)
def get_img():
    """Download the first thumbnail URL for every stored house and insert
    it into model.House_img (duplicates/DB errors are printed and skipped)."""
    for data in model.Houseinfo.select(model.Houseinfo.houseID,
                                       model.Houseinfo.link):
        # BUGFIX: `print data.link` / `print img` were Python-2-only print
        # statements in a file that elsewhere uses print(); the function
        # form works on both interpreters.
        print(data.link)
        source_code = misc.get_source_code(data.link)
        soup = BeautifulSoup(source_code, 'html.parser')
        li = soup.select("#thumbnail2 li")
        if len(li) > 0:
            img = li[0].get('data-src')
            print(img)
            try:
                model.House_img.insert({
                    model.House_img.house_id: data.houseID,
                    model.House_img.img: img
                }).execute()
            except Exception as e:
                print(e)
                continue
def getRegionByArea(areaList):
    """Collect the region slugs under each area's ershoufang sub-navigation.

    Returns the accumulated list; returns None immediately if any page
    reports the crawler as blocked.
    """
    regions = []
    for area in areaList:
        try:
            area_url = BASE_URL + u"ershoufang/%s/" % area
            page = BeautifulSoup(misc.get_source_code(area_url), 'lxml')
            if check_block(page):
                return
            subnav = page.find('div', {"class": "sub_sub_nav"})
            for link in subnav.find_all('a'):
                # href looks like .../ershoufang/<region>/ — keep the slug.
                slug = link.get('href').strip().split(
                    '/ershoufang/')[1].rstrip('/')
                regions.append(slug)
        except Exception as e:
            # A failed area is logged and skipped; the rest still get scraped.
            logging.error(e)
    return regions
def get_sell_perhouseID(houseID):
    """Scrape the deal record of one house (by houseID) and update its
    totalPrice/unitPrice/dealdate columns in model.Monthsellinfo."""
    url_page = BASE_URL + u"chengjiao/" + houseID + ".html"
    source_code = misc.get_source_code(url_page)
    soup = BeautifulSoup(source_code, 'lxml')
    log_progress("GetSellByHouseID", houseID, 1, 1)
    info_dict = {}
    for name in soup.findAll("ul", {"class": "record_list"}):
        try:
            totalPrice = name.find("span", {"class": "record_price"})
            # The price may or may not be wrapped in an inner <span>.
            if totalPrice.span is None:
                totalPrice = totalPrice.get_text().strip().split(u'万')
            else:
                totalPrice = totalPrice.span.get_text().strip().split(u'万')
            info_dict.update({u'totalPrice': totalPrice[0]})
            detail = name.find("p", {
                "class": "record_detail"
            }).get_text().split(',')
            info_dict.update({
                u'unitPrice':
                detail[0].replace(u'单价', '').replace(u'元/平', '')
            })
            info_dict.update({u'dealdate': detail[1].replace('.', '-')})
        except Exception as e:
            logging.error(e)
            # BUGFIX: `"name:" + name` raised TypeError because `name` is a
            # bs4 Tag, masking the original error; format through %s instead.
            logging.info("name:%sFail" % name)
            continue
    try:
        with model.database.atomic():
            model.Monthsellinfo.update(
                totalPrice=info_dict[u'totalPrice'],
                unitPrice=info_dict[u'unitPrice'],
                dealdate=info_dict[u'dealdate']).where(
                    model.Monthsellinfo.houseID == houseID).execute()
        time.sleep(1)
    except Exception as e:
        logging.error(e)
        logging.info(houseID + "house info Fail")
def get_communityinfo_by_url(url):
    """Scrape the xiaoquInfoItem rows of a community page into a dict.

    Every known column is guaranteed to be present in the result; fields
    missing from the page are set to None.  Returns None when blocked.
    """
    soup = BeautifulSoup(misc.get_source_code(url), 'lxml')
    if check_block(soup):
        return
    # Chinese page label -> english column name.
    label_map = {
        u"建筑年代": u'year',
        u"建筑类型": u'housetype',
        u"物业费用": u'cost',
        u"物业公司": u'service',
        u"开发商": u'company',
        u"楼栋总数": u'building_num',
        u"房屋总数": u'house_num',
    }
    res = {}
    for item in soup.findAll("div", {"class": "xiaoquInfoItem"}):
        try:
            label = item.find("span", {"xiaoquInfoLabel"})
            content = item.find("span", {"xiaoquInfoContent"})
            label_text = label.get_text().strip()
            if label_text not in label_map:
                continue
            res[label_map[label_text]] = content.get_text().strip()
        except:
            logging.exception(item)
            continue
    # Backfill columns that were absent from the page.
    for column in label_map.values():
        if column not in res:
            res[column] = None
    return res
def get_sellInfo_by_url(url):
    """Scrape the listing-statistics box (div.msg) of a chengjiao page.

    Returns a dict with the keys listing_price/cycle/adjust_num/view_num/
    attention_num/browse_num — empty strings when the box is absent — or
    None when the crawler is blocked.
    """
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')
    if check_block(soup):
        return
    sellInfoMsg = soup.find("div", {"class": "msg"})
    res = {}
    # Positional <label> index -> column name (the box layout is fixed).
    key_type = {
        1: u'listing_price',
        2: u'cycle',
        3: u'adjust_num',
        4: u'view_num',
        5: u'attention_num',
        6: u'browse_num',
    }
    # BUGFIX: find() returns None (not []) when the box is missing, so the
    # old `== []` branch never ran; and its fallback keys were bare names
    # (cycle, listing_price, ...) which raised NameError.  Fill every
    # column with an empty string instead.
    if sellInfoMsg is None:
        for column in key_type.values():
            res[column] = ''
    else:
        num = 0
        for info in sellInfoMsg.findAll("label"):
            try:
                num += 1
                res[key_type[num]] = info.get_text().strip()
            except KeyError:
                # More labels than expected — ignore the extras.
                continue
    return res
def get_house_perregion(district):
    """Scrape every ershoufang listing page of *district* and bulk-upsert
    the rows into model.Houseinfo plus a price-history row per listing
    into model.Hisprice."""
    url = BASE_URL + u"ershoufang/%s/" % district
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')
    if check_block(soup):
        return
    total_pages = misc.get_total_pages(url)
    if total_pages is None:
        row = model.Houseinfo.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)
    for page in range(total_pages):
        if page > 0:
            url_page = BASE_URL + u"ershoufang/%s/pg%d/" % (district, page)
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')
        i = 0
        log_progress("GetHouseByRegionlist", district, page + 1, total_pages)
        data_source = []
        hisprice_data_source = []
        for ultag in soup.findAll("ul", {"class": "sellListContent"}):
            for name in ultag.find_all('li'):
                i = i + 1
                info_dict = {}
                try:
                    housetitle = name.find("div", {"class": "title"})
                    info_dict.update(
                        {u'title': housetitle.a.get_text().strip()})
                    info_dict.update({u'link': housetitle.a.get('href')})
                    # BUGFIX: the list-item attribute is `data-housecode`
                    # (it was mangled to `data_analysis-housecode`, which
                    # always yields None).
                    houseID = housetitle.a.get('data-housecode')
                    info_dict.update({u'houseID': houseID})
                    houseinfo = name.find("div", {"class": "houseInfo"})
                    # Beijing pages separate the fields with '/'; other
                    # cities use '|'.
                    if CITY == 'bj':
                        info = houseinfo.get_text().split('/')
                    else:
                        info = houseinfo.get_text().split('|')
                    info_dict.update({u'community': info[0]})
                    info_dict.update({u'housetype': info[1]})
                    info_dict.update({u'square': info[2]})
                    info_dict.update({u'direction': info[3]})
                    info_dict.update({u'decoration': info[4]})
                    housefloor = name.find("div", {"class": "positionInfo"})
                    info_dict.update({u'years': housefloor.get_text().strip()})
                    info_dict.update({u'floor': housefloor.get_text().strip()})
                    followInfo = name.find("div", {"class": "followInfo"})
                    info_dict.update(
                        {u'followInfo': followInfo.get_text().strip()})
                    taxfree = name.find("span", {"class": "taxfree"})
                    if taxfree is None:
                        info_dict.update({u"taxtype": ""})
                    else:
                        info_dict.update(
                            {u"taxtype": taxfree.get_text().strip()})
                    totalPrice = name.find("div", {"class": "totalPrice"})
                    info_dict.update(
                        {u'totalPrice': totalPrice.span.get_text()})
                    unitPrice = name.find("div", {"class": "unitPrice"})
                    # BUGFIX: same mangling — the attribute is `data-price`.
                    info_dict.update(
                        {u'unitPrice': unitPrice.get("data-price")})
                except:
                    continue
                # Houseinfo insert into mysql
                data_source.append(info_dict)
                hisprice_data_source.append({
                    "houseID": info_dict["houseID"],
                    "totalPrice": info_dict["totalPrice"]
                })
                # model.Houseinfo.insert(**info_dict).upsert().execute()
                # model.Hisprice.insert(houseID=info_dict['houseID'], totalPrice=info_dict['totalPrice']).upsert().execute()
        with model.database.atomic():
            model.Houseinfo.insert_many(data_source).upsert().execute()
            model.Hisprice.insert_many(hisprice_data_source).upsert().execute()
        time.sleep(1)
def get_rent_perregion(district):
    """Scrape every rental-listing page of *district* (house-lst markup)
    and bulk-upsert the parsed rows into model.Rentinfo."""
    url = BASE_URL + u"zufang/%s/" % district
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')
    if check_block(soup):
        return
    total_pages = misc.get_total_pages(url)
    if total_pages is None:
        row = model.Rentinfo.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)
    for page in range(total_pages):
        if page > 0:
            url_page = BASE_URL + u"zufang/%s/pg%d/" % (district, page)
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')
        i = 0
        log_progress("GetRentByRegionlist", district, page + 1, total_pages)
        data_source = []
        for ultag in soup.findAll("ul", {"class": "house-lst"}):
            for name in ultag.find_all('li'):
                i = i + 1
                info_dict = {}
                try:
                    housetitle = name.find("div", {"class": "info-panel"})
                    info_dict.update(
                        {u'title': housetitle.h2.a.get_text().strip()})
                    info_dict.update({u'link': housetitle.a.get("href")})
                    # BUGFIX: the attribute is `data-housecode` (it was
                    # mangled to `data_analysis-housecode`, always None).
                    houseID = name.get("data-housecode")
                    info_dict.update({u'houseID': houseID})
                    region = name.find("span", {"class": "region"})
                    info_dict.update({u'region': region.get_text().strip()})
                    zone = name.find("span", {"class": "zone"})
                    info_dict.update({u'zone': zone.get_text().strip()})
                    meters = name.find("span", {"class": "meters"})
                    info_dict.update({u'meters': meters.get_text().strip()})
                    other = name.find("div", {"class": "con"})
                    info_dict.update({u'other': other.get_text().strip()})
                    subway = name.find("span", {"class": "fang-subway-ex"})
                    if subway is None:
                        info_dict.update({u'subway': ""})
                    else:
                        info_dict.update(
                            {u'subway': subway.span.get_text().strip()})
                    decoration = name.find("span", {"class": "decoration-ex"})
                    if decoration is None:
                        info_dict.update({u'decoration': ""})
                    else:
                        info_dict.update({
                            u'decoration':
                            decoration.span.get_text().strip()
                        })
                    heating = name.find("span", {"class": "heating-ex"})
                    # BUGFIX: this branch re-tested `decoration` (copy-paste),
                    # so a listing with decoration but without heating crashed
                    # on heating.span and was silently dropped.
                    if heating is None:
                        info_dict.update({u'heating': ""})
                    else:
                        info_dict.update(
                            {u'heating': heating.span.get_text().strip()})
                    price = name.find("div", {"class": "price"})
                    info_dict.update(
                        {u'price': int(price.span.get_text().strip())})
                    pricepre = name.find("div", {"class": "price-pre"})
                    info_dict.update(
                        {u'pricepre': pricepre.get_text().strip()})
                except:
                    continue
                # Rentinfo insert into mysql
                data_source.append(info_dict)
                # model.Rentinfo.insert(**info_dict).upsert().execute()
        with model.database.atomic():
            model.Rentinfo.insert_many(data_source).upsert().execute()
        time.sleep(1)
def get_sell_percommunity(communityname):
    # Scrape every sold-listing (chengjiao) page for one community name and
    # bulk-upsert the parsed deals into model.Sellinfo, one page at a time.
    url = BASE_URL + u"chengjiao/rs" + urllib2.quote(
        communityname.encode('utf8')) + "/"
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')
    if check_block(soup):
        return
    total_pages = misc.get_total_pages(url)
    if total_pages == None:
        row = model.Sellinfo.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)
    for page in range(total_pages):
        if page > 0:
            url_page = BASE_URL + u"chengjiao/pg%drs%s/" % (
                page, urllib2.quote(communityname.encode('utf8')))
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')
        i = 0
        log_progress("GetSellByCommunitylist", communityname, page + 1,
                     total_pages)
        data_source = []
        for ultag in soup.findAll("ul", {"class": "listContent"}):
            for name in ultag.find_all('li'):
                i = i + 1
                info_dict = {}
                try:
                    housetitle = name.find("div", {"class": "title"})
                    info_dict.update(
                        {u'title': housetitle.get_text().strip()})
                    info_dict.update({u'link': housetitle.a.get('href')})
                    # houseID is the final path component of the link,
                    # minus its file extension.
                    houseID = housetitle.a.get('href').split("/")[-1].split(
                        ".")[0]
                    info_dict.update({u'houseID': houseID.strip()})
                    # The title text carries community, housetype and square
                    # separated by spaces.
                    house = housetitle.get_text().strip().split(' ')
                    info_dict.update({u'community': house[0].strip()})
                    info_dict.update({u'housetype': house[1].strip()})
                    info_dict.update({u'square': house[2].strip()})
                    houseinfo = name.find("div", {"class": "houseInfo"})
                    info = houseinfo.get_text().split('|')
                    info_dict.update({u'direction': info[0].strip()})
                    info_dict.update({u'status': info[1].strip()})
                    housefloor = name.find("div", {"class": "positionInfo"})
                    floor_all = housefloor.get_text().strip().split(' ')
                    info_dict.update({u'floor': floor_all[0].strip()})
                    info_dict.update({u'years': floor_all[-1].strip()})
                    followInfo = name.find("div", {"class": "source"})
                    info_dict.update(
                        {u'source': followInfo.get_text().strip()})
                    # The price may or may not be wrapped in an inner <span>.
                    totalPrice = name.find("div", {"class": "totalPrice"})
                    if totalPrice.span is None:
                        info_dict.update(
                            {u'totalPrice': totalPrice.get_text().strip()})
                    else:
                        info_dict.update({
                            u'totalPrice':
                            totalPrice.span.get_text().strip()
                        })
                    unitPrice = name.find("div", {"class": "unitPrice"})
                    if unitPrice.span is None:
                        info_dict.update(
                            {u'unitPrice': unitPrice.get_text().strip()})
                    else:
                        info_dict.update(
                            {u'unitPrice': unitPrice.span.get_text().strip()})
                    dealDate = name.find("div", {"class": "dealDate"})
                    # Normalize the '.'-separated date to '-' for the DB.
                    info_dict.update({
                        u'dealdate':
                        dealDate.get_text().strip().replace('.', '-')
                    })
                except:
                    # Skip listings whose markup does not match.
                    continue
                # Sellinfo insert into mysql
                data_source.append(info_dict)
                # model.Sellinfo.insert(**info_dict).upsert().execute()
        with model.database.atomic():
            model.Sellinfo.insert_many(data_source).upsert().execute()
        time.sleep(1)
def get_community_perregion(regionname):
    """Scrape every community (xiaoqu) page of *regionname*, enrich each row
    with the community detail page, and bulk-upsert into model.Community."""
    url = BASE_URL + u"xiaoqu/" + regionname + "/"
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')
    if check_block(soup):
        return
    total_pages = misc.get_total_pages(url)
    if total_pages is None:
        row = model.Community.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)
    for page in range(total_pages):
        if page > 0:
            url_page = BASE_URL + u"xiaoqu/" + regionname + "/pg%d/" % page
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')
        nameList = soup.findAll("li", {"class": "clear"})
        i = 0
        log_progress("GetCommunityByRegionlist", regionname, page + 1,
                     total_pages)
        data_source = []
        for name in nameList:  # Per community loop
            i = i + 1
            info_dict = {}
            try:
                communitytitle = name.find("div", {"class": "title"})
                title = communitytitle.get_text().strip('\n')
                link = communitytitle.a.get('href')
                info_dict.update({u'title': title})
                info_dict.update({u'link': link})
                district = name.find("a", {"class": "district"})
                info_dict.update({u'district': district.get_text()})
                bizcircle = name.find("a", {"class": "bizcircle"})
                info_dict.update({u'bizcircle': bizcircle.get_text()})
                tagList = name.find("div", {"class": "tagList"})
                info_dict.update({u'tagList': tagList.get_text().strip('\n')})
                onsale = name.find("a", {"class": "totalSellCount"})
                info_dict.update(
                    {u'onsale': onsale.span.get_text().strip('\n')})
                onrent = name.find("a", {"title": title + u"租房"})
                info_dict.update(
                    {u'onrent': onrent.get_text().strip('\n').split(u'套')[0]})
                # BUGFIX: the attribute is `data-housecode` (it was mangled
                # to `data_analysis-housecode`, which always yields None).
                info_dict.update({u'id': name.get('data-housecode')})
                price = name.find("div", {"class": "totalPrice"})
                info_dict.update(
                    {u'price': price.span.get_text().strip('\n')})
                communityinfo = get_communityinfo_by_url(link)
                # BUGFIX: dict.iteritems() is Python-2-only; items() works
                # on both interpreters.
                for key, value in communityinfo.items():
                    info_dict.update({key: value})
            except:
                continue
            # communityinfo insert into mysql
            data_source.append(info_dict)
            # model.Community.insert(**info_dict).upsert().execute()
        with model.database.atomic():
            model.Community.insert_many(data_source).upsert().execute()
        time.sleep(1)
def get_rent_percommunity(city, communityname):
    """Scrape all rental listings for one community of *city* and bulk-upsert
    them into model.Rentinfo."""
    baseUrl = u"http://%s.lianjia.com/" % (city)
    url = baseUrl + u"zufang/rs" + \
        urllib.parse.quote(communityname.encode('utf8')) + "/"
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')
    if check_block(soup):
        return
    total_pages = misc.get_total_pages(url)
    if total_pages is None:
        row = model.Rentinfo.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)
    for page in range(total_pages):
        if page > 0:
            # BUGFIX: follow-up pages were requested under the non-existent
            # "rent/" path; rental listings live under "zufang/" exactly
            # like the first page fetched above.
            url_page = baseUrl + \
                u"zufang/pg%drs%s/" % (
                    page, urllib.parse.quote(communityname.encode('utf8')))
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')
        i = 0
        log_progress("GetRentByCommunitylist", communityname, page + 1,
                     total_pages)
        data_source = []
        for ultag in soup.findAll("ul", {"class": "house-lst"}):
            for name in ultag.find_all('li'):
                i = i + 1
                info_dict = {}
                try:
                    housetitle = name.find("div", {"class": "info-panel"})
                    info_dict.update(
                        {u'title': housetitle.get_text().strip()})
                    info_dict.update({u'link': housetitle.a.get('href')})
                    houseID = housetitle.a.get('href').split("/")[-1].split(
                        ".")[0]
                    info_dict.update({u'houseID': houseID})
                    region = name.find("span", {"class": "region"})
                    info_dict.update({u'region': region.get_text().strip()})
                    zone = name.find("span", {"class": "zone"})
                    info_dict.update({u'zone': zone.get_text().strip()})
                    meters = name.find("span", {"class": "meters"})
                    info_dict.update({u'meters': meters.get_text().strip()})
                    other = name.find("div", {"class": "con"})
                    info_dict.update({u'other': other.get_text().strip()})
                    subway = name.find("span", {"class": "fang-subway-ex"})
                    if subway is None:
                        info_dict.update({u'subway': ""})
                    else:
                        info_dict.update(
                            {u'subway': subway.span.get_text().strip()})
                    decoration = name.find("span", {"class": "decoration-ex"})
                    if decoration is None:
                        info_dict.update({u'decoration': ""})
                    else:
                        info_dict.update({
                            u'decoration':
                            decoration.span.get_text().strip()
                        })
                    heating = name.find("span", {"class": "heating-ex"})
                    # ROBUSTNESS: guard heating like the other optional tags;
                    # previously a missing heating span raised AttributeError
                    # and silently dropped the whole listing.
                    if heating is None:
                        info_dict.update({u'heating': ""})
                    else:
                        info_dict.update(
                            {u'heating': heating.span.get_text().strip()})
                    price = name.find("div", {"class": "price"})
                    info_dict.update(
                        {u'price': int(price.span.get_text().strip())})
                    pricepre = name.find("div", {"class": "price-pre"})
                    info_dict.update(
                        {u'pricepre': pricepre.get_text().strip()})
                except:
                    continue
                # Rentinfo insert into mysql
                data_source.append(info_dict)
                # model.Rentinfo.insert(**info_dict).upsert().execute()
        with model.database.atomic():
            if data_source:
                model.Rentinfo.insert_many(data_source).upsert().execute()
        time.sleep(1)
def get_house_perregion(city, district):
    """Crawl second-hand ("ershoufang") listings for one district of *city*,
    upserting each house into Houseinfo and appending a Hisprice history row.

    :param city: lianjia city abbreviation used to build the host name.
    :param district: district slug appended to the ershoufang URL.
    :raises RuntimeError: when the total page count cannot be determined.
    """
    baseUrl = u"http://%s.lianjia.com/" % (city)
    url = baseUrl + u"ershoufang/%s/" % district
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')
    if check_block(soup):
        # The site served its anti-crawler block page; bail out quietly.
        return
    total_pages = misc.get_total_pages(url)
    if total_pages is None:
        row = model.Houseinfo.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)
    # Listings repeat across pages; remember the IDs already collected.
    house_ids = set()
    for page in range(total_pages):
        if page > 0:
            url_page = baseUrl + u"ershoufang/%s/pg%d/" % (district, page)
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')
        i = 0
        log_progress("GetHouseByRegionlist", district, page + 1, total_pages)
        data_source = []
        hisprice_data_source = []
        ultags = soup.findAll("ul", {"class": "sellListContent"})
        for ultag in ultags:
            for name in ultag.find_all('li'):
                i = i + 1
                info_dict = {}
                try:
                    housetitle = name.find("div", {"class": "title"})
                    info_dict.update(
                        {u'title': housetitle.a.get_text().strip()})
                    info_dict.update({u'link': housetitle.a.get('href')})
                    houseID = housetitle.a.get('data-housecode')
                    # Dedupe by houseID; also skip cards with no ID at all so
                    # None never enters the dedup set or the database.
                    if houseID is None or houseID in house_ids:
                        continue
                    info_dict.update({u'houseID': houseID})
                    houseinfo = name.find("div", {"class": "houseInfo"})
                    info = houseinfo.get_text().split('|')
                    info_dict.update({u'housetype': info[0]})
                    info_dict.update({u'square': info[1]})
                    info_dict.update({u'direction': info[2]})
                    info_dict.update({u'decoration': info[3]})
                    info_dict.update({u'floor': info[4]})
                    info_dict.update({u'years': info[5]})
                    housefloor = name.find("div", {"class": "positionInfo"})
                    communityInfo = housefloor.get_text().split('-')
                    info_dict.update({u'community': communityInfo[0]})
                    followInfo = name.find("div", {"class": "followInfo"})
                    info_dict.update(
                        {u'followInfo': followInfo.get_text().strip()})
                    # Tax-free badge is optional on a card.
                    taxfree = name.find("span", {"class": "taxfree"})
                    if taxfree is None:
                        info_dict.update({u"taxtype": ""})
                    else:
                        info_dict.update(
                            {u"taxtype": taxfree.get_text().strip()})
                    totalPrice = name.find("div", {"class": "totalPrice"})
                    info_dict.update(
                        {u'totalPrice': totalPrice.span.get_text()})
                    unitPrice = name.find("div", {"class": "unitPrice"})
                    info_dict.update(
                        {u'unitPrice': unitPrice.get("data-price")})
                    info_dict.update({"validdate": datetime.datetime.now()})
                except Exception:
                    # Malformed card: skip it rather than abort the page.
                    continue
                # Houseinfo insert into mysql
                data_source.append(info_dict)
                hisprice_data_source.append({
                    "houseID": info_dict["houseID"],
                    "totalPrice": info_dict["totalPrice"]
                })
                house_ids.add(info_dict["houseID"])
        with model.database.atomic():
            try:
                for data in data_source:
                    # Upsert keyed on houseID so re-runs refresh records.
                    model.Houseinfo.insert(data).on_conflict(
                        conflict_target=(model.Houseinfo.houseID, ),
                        update=data,
                    ).execute()
                if hisprice_data_source:
                    model.Hisprice.insert_many(hisprice_data_source).execute()
            except Exception as e:
                print("error: %s" % e)
        log_progress("GetHouseByRegionlist inserted", district, page + 1,
                     total_pages)
        time.sleep(0.5)
def get_house_perregion(district):
    """Crawl second-hand listings for one district (sh.lianjia "d%d" paging
    layout), bulk-upserting Houseinfo plus a Hisprice row per listing.

    :param district: district slug appended to the ershoufang URL.
    :raises RuntimeError: when the total page count cannot be determined.
    """
    url = BASE_URL + u"ershoufang/%s/" % district
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')
    if check_block(soup):
        # The site served its anti-crawler block page; bail out quietly.
        return
    total_pages = misc.get_sh_total_pages(url)
    if total_pages is None:
        row = model.Houseinfo.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)
    for page in range(total_pages):
        if page > 0:
            url_page = BASE_URL + u"ershoufang/%s/d%d/" % (district, page)
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')
        i = 0
        log_progress("GetHouseByRegionlist", district, page + 1, total_pages)
        data_source = []
        hisprice_data_source = []
        nameList = soup.findAll("div", {"class": "info"})
        for name in nameList:  # per house loop
            i = i + 1
            info_dict = {}
            try:
                housetitle = name.find("div", {"class": "prop-title"})
                info_dict.update({u'title': housetitle.a.get('title')})
                info_dict.update({u'link': housetitle.a.get('href')})
                info_dict.update({u'houseID': housetitle.a.get('key')})
                houseaddr = name.find("span", {"class": "info-col row1-text"})
                info = houseaddr.get_text().split('|')
                info_dict.update({u'housetype': info[0].strip()})
                info_dict.update({u'square': info[1].strip()})
                info_dict.update({u'floor': info[2].strip()})
                # The direction field is sometimes missing from the row.
                try:
                    info_dict.update({u'direction': info[3].strip()})
                except IndexError:
                    info_dict.update({u'direction': ''})
                info_dict.update({u'decoration': ''})
                housefloor = name.find("span", {"class": "info-col row2-text"})
                detail = housefloor.get_text().split('|')
                info_dict.update({u'years': detail[-1].strip()})
                community = name.find("a", {"class": "laisuzhou"})
                info_dict.update(
                    {u'community': community.span.get_text().strip()})
                info_dict.update({u'followInfo': ''})
                tax = name.find("div", {"class": "property-tag-container"})
                info_dict.update({u'taxtype': "".join(tax.get_text().split())})
                totalPrice = name.find("span",
                                       {"class": "total-price strong-num"})
                info_dict.update(
                    {u'totalPrice': totalPrice.get_text().strip()})
                unitPrice = name.find("span",
                                      {"class": "info-col price-item minor"})
                info_dict.update({u'unitPrice': unitPrice.get_text().strip()})
            except Exception:
                # Malformed card: skip the record, keep crawling the page.
                continue
            # houseinfo insert into mysql
            data_source.append(info_dict)
            hisprice_data_source.append({
                "houseID": info_dict["houseID"],
                "totalPrice": info_dict["totalPrice"]
            })
        with model.database.atomic():
            # Guard against empty pages: insert_many([]) errors in peewee.
            if data_source:
                model.Houseinfo.insert_many(data_source).upsert().execute()
            if hisprice_data_source:
                model.Hisprice.insert_many(
                    hisprice_data_source).upsert().execute()
        time.sleep(1)
def get_community_perregion(regionname=u'pudong'):
    """Crawl community ("xiaoqu") listings for one region and bulk-upsert
    them; each community's detail page is also fetched and merged in.

    :param regionname: region slug, defaults to ``pudong``.
    :raises RuntimeError: when the total page count cannot be determined.
    """
    url = BASE_URL + u"xiaoqu/" + regionname + "/"
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')
    if check_block(soup):
        # The site served its anti-crawler block page; bail out quietly.
        return
    total_pages = misc.get_sh_total_pages(url)
    if total_pages is None:
        row = model.Community.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)
    for page in range(total_pages):
        if page > 0:
            url_page = BASE_URL + u"xiaoqu/" + regionname + "/d%d/" % page
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')
        nameList = soup.findAll("div", {"class": "info-panel"})
        i = 0
        log_progress("GetCommunityByRegionlist", regionname, page + 1,
                     total_pages)
        data_source = []
        for name in nameList:  # per community loop
            i = i + 1
            info_dict = {}
            try:
                communitytitle = name.find("a", {"name": "selectDetail"})
                title = communitytitle.get_text().strip('\n')
                link = communitytitle.get('href')
                community_id = communitytitle.get('key')
                info_dict.update({u'title': title})
                info_dict.update({u'link': link})
                info_dict.update({u'id': community_id})
                district = name.find("a", {"class": "ad"})
                info_dict.update({u'district': district.get_text()})
                cons = name.find("div", {"class": "con"})
                bizcircle = cons.findAll("a")
                info_dict.update(
                    {u'bizcircle': bizcircle[1].get_text().strip()})
                # The subway tag is optional on a community card.
                try:
                    tagList = name.find("span", {"class": "fang-subway-ex"})
                    info_dict.update({u'tagList': tagList.get_text().strip()})
                except Exception:
                    info_dict.update({u'tagList': ''})
                onsale = name.find("span", {"class": "num"})
                info_dict.update({u'onsale': onsale.get_text().strip()})
                price = name.find("div", {"class": "price"})
                info_dict.update({u'price': price.span.get_text().strip()})
                # Merge the per-community detail page into the record.
                communityinfo = get_communityinfo_by_url(link)
                for key, value in communityinfo.iteritems():
                    info_dict.update({key: value})
            except Exception:
                # Malformed card: skip the record, keep crawling the page.
                continue
            # communityinfo insert into mysql
            data_source.append(info_dict)
        with model.database.atomic():
            # Guard against empty pages: insert_many([]) errors in peewee.
            if data_source:
                model.Community.insert_many(data_source).upsert().execute()
        time.sleep(1)
def get_sell_percommunity(communityname):
    """Crawl closed-deal ("chengjiao") records for one community and
    bulk-upsert them into MySQL via the Sellinfo model.

    :param communityname: community name (unicode); URL-quoted into the query.
    :raises RuntimeError: when the total page count cannot be determined.
    """
    url = BASE_URL + u"chengjiao/rs" + \
        urllib2.quote(communityname.encode('utf8')) + "/"
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')
    if check_block(soup):
        # The site served its anti-crawler block page; bail out quietly.
        return
    total_pages = misc.get_sh_total_pages(url)
    if total_pages is None:
        row = model.Sellinfo.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)
    for page in range(total_pages):
        if page > 0:
            url_page = BASE_URL + \
                u"chengjiao/d%drs%s/" % (page,
                                         urllib2.quote(
                                             communityname.encode('utf8')))
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')
        i = 0
        log_progress("GetSellByCommunitylist", communityname, page + 1,
                     total_pages)
        data_source = []
        for name in soup.findAll("div", {"class": "info"}):
            i = i + 1
            info_dict = {}
            try:
                housetitle = name.findAll("div", {"class": "info-row"})[0]
                info_dict.update({u'title': housetitle.a.get('title')})
                info_dict.update({u'link': housetitle.a.get('href')})
                info_dict.update({u'houseID': housetitle.a.get('key')})
                houseinfo = housetitle.get_text().strip().split(' ')
                info_dict.update({u'housetype': houseinfo[1].strip()})
                info_dict.update(
                    {u'square': houseinfo[2].strip('').split('\n')[0]})
                houseaddr = name.find("div", {"class": "row1-text"})
                info = houseaddr.get_text().split('|')
                info_dict.update({u'floor': info[0].strip()})
                # The direction field is sometimes missing from the row.
                try:
                    info_dict.update({u'direction': info[1].strip()})
                except IndexError:
                    info_dict.update({u'direction': ''})
                info_dict.update({u'status': info[2].strip()})
                years = name.find("span", {"class": "c-prop-tag2"})
                info_dict.update({u'years': years.get_text().strip()})
                community = name.find("span", {"class": "cj-text"})
                info_dict.update({u'community': community.get_text().strip()})
                totalPrice = name.find("span", {"class": "strong-num"})
                info_dict.update(
                    {u'totalPrice': totalPrice.get_text().strip()})
                unitPrice = name.find("div",
                                      {"class": "info-col price-item minor"})
                info_dict.update({u'unitPrice': unitPrice.get_text().strip()})
                source = name.find("div",
                                   {"class": "info-col deal-item minor"})
                info_dict.update({u'source': source.get_text().strip()})
                dealdate = name.find(
                    "div", {"class": "info-col deal-item main strong-num"})
                # Normalise "2016.09.01"-style dates to "2016-09-01".
                info_dict.update({
                    u'dealdate':
                    dealdate.get_text().strip().replace('.', '-')
                })
            except Exception:
                # Malformed card: skip the record, keep crawling the page.
                continue
            # Sellinfo insert into mysql
            data_source.append(info_dict)
        with model.database.atomic():
            # Guard against empty pages: insert_many([]) errors in peewee.
            if data_source:
                model.Sellinfo.insert_many(data_source).upsert().execute()
        time.sleep(1)
def get_house_percommunity(communityname):
    """Crawl second-hand listings for one community, bulk-upserting Houseinfo
    plus a Hisprice history row per listing.

    :param communityname: community name (unicode); URL-quoted into the query.
    :raises RuntimeError: when the total page count cannot be determined.
    """
    url = BASE_URL + u"ershoufang/rs" + urllib2.quote(
        communityname.encode('utf8')) + "/"
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')
    if check_block(soup):
        # The site served its anti-crawler block page; bail out quietly.
        return
    total_pages = misc.get_total_pages(url)
    if total_pages is None:
        row = model.Houseinfo.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)
    for page in range(total_pages):
        if page > 0:
            url_page = BASE_URL + u"ershoufang/pg%drs%s/" % (
                page, urllib2.quote(communityname.encode('utf8')))
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')
        nameList = soup.findAll("li", {"class": "clear"})
        i = 0
        log_progress("GetHouseByCommunitylist", communityname, page + 1,
                     total_pages)
        data_source = []
        hisprice_data_source = []
        for name in nameList:  # per house loop
            i = i + 1
            info_dict = {}
            try:
                housetitle = name.find("div", {"class": "title"})
                info_dict.update({u'title': housetitle.a.get_text().strip()})
                info_dict.update({u'link': housetitle.a.get('href')})
                houseaddr = name.find("div", {"class": "address"})
                # Beijing pages delimit the address row with '/' not '|'.
                if CITY == 'bj':
                    info = houseaddr.div.get_text().split('/')
                else:
                    info = houseaddr.div.get_text().split('|')
                info_dict.update({u'community': info[0].strip()})
                info_dict.update({u'housetype': info[1].strip()})
                info_dict.update({u'square': info[2].strip()})
                info_dict.update({u'direction': info[3].strip()})
                info_dict.update({u'decoration': info[4].strip()})
                housefloor = name.find("div", {"class": "flood"})
                floor_all = housefloor.div.get_text().split(
                    '-')[0].strip().split(' ')
                info_dict.update({u'floor': floor_all[0].strip()})
                info_dict.update({u'years': floor_all[-1].strip()})
                followInfo = name.find("div", {"class": "followInfo"})
                info_dict.update({u'followInfo': followInfo.get_text()})
                tax = name.find("div", {"class": "tag"})
                info_dict.update({u'taxtype': tax.get_text().strip()})
                totalPrice = name.find("div", {"class": "totalPrice"})
                info_dict.update({u'totalPrice': totalPrice.span.get_text()})
                unitPrice = name.find("div", {"class": "unitPrice"})
                # NOTE(review): sibling crawlers read "data-price"/"data-hid";
                # confirm "data_analysis-*" attribute names are right here,
                # otherwise unitPrice/houseID come back as None.
                info_dict.update(
                    {u'unitPrice': unitPrice.get('data_analysis-price')})
                info_dict.update(
                    {u'houseID': unitPrice.get('data_analysis-hid')})
            except Exception:
                # Malformed card: skip the record, keep crawling the page.
                continue
            # houseinfo insert into mysql
            data_source.append(info_dict)
            hisprice_data_source.append({
                "houseID": info_dict["houseID"],
                "totalPrice": info_dict["totalPrice"]
            })
        with model.database.atomic():
            # Guard against empty pages: insert_many([]) errors in peewee.
            if data_source:
                model.Houseinfo.insert_many(data_source).upsert().execute()
            if hisprice_data_source:
                model.Hisprice.insert_many(
                    hisprice_data_source).upsert().execute()
        time.sleep(1)
def get_house_perregion(district):
    """Crawl second-hand listings for one district (html5lib parser variant
    with community-id extraction), bulk-upserting Houseinfo plus a Hisprice
    history row per listing.

    :param district: district slug appended to the ershoufang URL.
    :raises RuntimeError: when the total page count cannot be determined.
    """
    url = BASE_URL + u"ershoufang/%s/" % district
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'html5lib')
    if check_block(soup):
        # The site served its anti-crawler block page; bail out quietly.
        return
    total_pages = misc.get_total_pages(url)
    if total_pages is None:
        row = model.Houseinfo.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)
    for page in range(total_pages):
        url_page = BASE_URL + u"ershoufang/%s/pg%d/" % (district, page + 1)
        source_code = misc.get_source_code(url_page)
        soup = BeautifulSoup(source_code, 'html5lib')
        i = 0
        log_progress("GetHouseByRegionlist", district, page + 1, total_pages)
        data_source = []
        hisprice_data_source = []
        for ultag in soup.findAll("ul", {"class": "sellListContent"}):
            namearr = ultag.find_all('li', {"class": "clear"})
            for name in namearr:
                i = i + 1
                info_dict = {}
                try:
                    housetitle = name.find("div", {"class": "title"})
                    info_dict.update(
                        {u'title': housetitle.get_text().strip()})
                    info_dict.update({u'link': housetitle.a.get('href')})
                    houseID = housetitle.a.get('data-housecode')
                    info_dict.update({u'houseID': houseID})
                    houseinfo = name.find("div", {"class": "houseInfo"})
                    info = houseinfo.get_text().split('/')
                    # Community id is the path segment after "xiaoqu/".
                    info_communityid = houseinfo.a.get('href').split('xiaoqu/')
                    communityid = info_communityid[1].strip().rstrip('/')
                    # Python 2 str/unicode mix: encode the unicode text to
                    # utf-8 before splitting on the byte-string marker.
                    square_info = info[2].encode("utf-8").split('平米')
                    info_dict.update({u'community': info[0]})
                    info_dict.update({u'communityid': communityid})
                    info_dict.update({u'housetype': info[1]})
                    info_dict.update({u'square': square_info[0]})
                    info_dict.update({u'direction': info[3]})
                    info_dict.update({u'decoration': info[4]})
                    housefloor = name.find("div", {"class": "positionInfo"})
                    info_housefloor = housefloor.get_text().split('/')
                    # e.g. "2008年建 板楼" -> [build year, building type]
                    info_years = info_housefloor[1].strip().encode(
                        "utf-8").split('年建')
                    info_floor = info_housefloor[0].split('(')
                    # e.g. "中层(共18层)" -> storey count between 共 and 层)
                    info_buildheight = info_floor[1].encode("utf-8").rstrip(
                        '层)').lstrip('共')
                    info_dict.update({u'years': info_years[0].strip()})
                    info_dict.update({u'buildingtype': info_years[1].strip()})
                    info_dict.update({u'floor': info_floor[0].strip()})
                    info_dict.update(
                        {u'buildheight': info_buildheight.strip()})
                    followInfo = name.find("div", {"class": "followInfo"})
                    info_dict.update(
                        {u'followInfo': followInfo.get_text().strip()})
                    # Tax badge is either "taxfree", "five", or absent.
                    taxfree = name.find("span", {"class": "taxfree"})
                    if taxfree is None:
                        five = name.find("span", {"class": "five"})
                        if five is None:
                            info_dict.update({u"taxtype": ""})
                        else:
                            info_dict.update(
                                {u"taxtype": five.get_text().strip()})
                    else:
                        info_dict.update(
                            {u"taxtype": taxfree.get_text().strip()})
                    totalPrice = name.find("div", {"class": "totalPrice"})
                    info_dict.update(
                        {u'totalPrice': totalPrice.span.get_text()})
                    unitPrice = name.find("div", {"class": "unitPrice"})
                    info_dict.update(
                        {u'unitPrice': unitPrice.get("data-price")})
                except Exception:
                    # Malformed card: skip it rather than abort the page.
                    continue
                # Houseinfo insert into mysql
                data_source.append(info_dict)
                hisprice_data_source.append({
                    "houseID": info_dict["houseID"],
                    "totalPrice": info_dict["totalPrice"]
                })
        with model.database.atomic():
            # Guard against empty pages: insert_many([]) errors in peewee.
            if data_source:
                model.Houseinfo.insert_many(data_source).upsert().execute()
            if hisprice_data_source:
                model.Hisprice.insert_many(
                    hisprice_data_source).upsert().execute()
        time.sleep(1)