def get_rent_perregion(city, district):
    """Scrape rental (zufang) listings for one district of a city.

    Collects per-listing detail via get_detail_info and bulk-upserts each
    page of results into model.Rentinfo in a single transaction.

    :param city: lianjia city subdomain, e.g. u'bj'
    :param district: district path segment used in the zufang URL
    :raises RuntimeError: when the total page count cannot be determined
    """
    baseUrl = u"http://%s.lianjia.com/" % (city)
    url = baseUrl + u"zufang/%s/" % district
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')
    if check_block(soup):
        return
    total_pages = misc.get_total_pages(url)
    print(total_pages)
    if total_pages is None:
        row = model.Rentinfo.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)
    for page in range(total_pages):
        if page > 0:
            # pg1 is the same page as the bare district URL fetched above,
            # so zero-based page index N maps to pgN+1 (was pg%d % page,
            # which refetched pg1 and never reached the last page).
            url_page = baseUrl + u"zufang/%s/pg%d/" % (district, page + 1)
            print(url_page)
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')
        i = 0
        log_progress("GetRentByRegionlist", district, page + 1, total_pages)
        data_source = []
        for ultag in soup.findAll("div", {"class": "content__list"}):
            for name in ultag.find_all('div'):
                i = i + 1
                info_dict = {}
                try:
                    housetitle = name.find(
                        "p", {"class": "content__list--item--title"})
                    regionZone = name.find(
                        "p", {"class": "content__list--item--des"})
                    region = regionZone.a.get_text().strip()
                    # region and zone are sibling <a> tags; skip the text
                    # node between them via two next_sibling hops.
                    zone = regionZone.a.next_sibling.next_sibling.get_text(
                    ).strip()
                    info_dict = get_detail_info(city, housetitle.a.get("href"))
                    info_dict.update({u'region': region})
                    info_dict.update({u'zone': zone})
                except Exception:
                    # One malformed card must not abort the whole page;
                    # was a Python-2-only `print '...'` statement.
                    print('traceback.format_exc():\n%s' %
                          traceback.format_exc())
                    continue
                # Rentinfo insert into mysql
                data_source.append(info_dict)
        with model.database.atomic():
            if data_source:
                model.Rentinfo.insert_many(data_source).upsert().execute()
        time.sleep(1)
def get_community_perregion(regionname=u'xicheng'):
    """Scrape the community (xiaoqu) list for one region and upsert per row.

    :param regionname: region path segment of the xiaoqu URL
    :raises RuntimeError: when the total page count cannot be determined
    """
    url = BASE_URL + u"xiaoqu/" + regionname + "/"
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')
    if check_block(soup):
        return
    total_pages = misc.get_total_pages(url)
    if total_pages is None:
        row = model.Community.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)
    for page in range(total_pages):
        if page > 0:
            # Zero-based page index N maps to pgN+1 (pg1 == bare URL).
            url_page = BASE_URL + u"xiaoqu/" + regionname + "/pg%d/" % (
                page + 1, )
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')
        nameList = soup.findAll("li", {"class": "clear"})
        i = 0
        for name in nameList:  # Per community loop
            i = i + 1
            info_dict = {}
            try:
                communitytitle = name.find("div", {"class": "title"})
                info_dict.update(
                    {u'title': communitytitle.get_text().strip('\n')})
                info_dict.update({u'link': communitytitle.a.get('href')})
                district = name.find("a", {"class": "district"})
                info_dict.update({u'district': district.get_text()})
                bizcircle = name.find("a", {"class": "bizcircle"})
                info_dict.update({u'bizcircle': bizcircle.get_text()})
                tagList = name.find("div", {"class": "tagList"})
                info_dict.update({u'tagList': tagList.get_text().strip('\n')})
                onsale = name.find("a", {"class": "totalSellCount"})
                info_dict.update(
                    {u'onsale': onsale.span.get_text().strip('\n')})
                info_dict.update({u'id': name.get('data-housecode')})
            except Exception:
                # Skip cards that do not carry the expected markup
                # (was a bare except, which also hid KeyboardInterrupt).
                continue
            # communityinfo insert into mysql
            model.Community.insert(**info_dict).upsert().execute()
        time.sleep(1)
def get_house_perregion(city, district):
    """Scrape second-hand house (ershoufang) listings for one district.

    Builds one row per listing plus a matching price-history row and
    bulk-upserts both per page in a single transaction.

    :param city: lianjia city subdomain, e.g. u'bj'
    :param district: district path segment of the ershoufang URL
    :raises RuntimeError: when the total page count cannot be determined
    """
    try:
        baseUrl = u"http://%s.lianjia.com/" % (city)
        url = baseUrl + u"ershoufang/%s/" % district
        source_code = misc.get_source_code(url)
        soup = BeautifulSoup(source_code, 'lxml')
        if check_block(soup):
            return
        total_pages = misc.get_total_pages(url)
        if total_pages is None:
            row = model.Houseinfo.select().count()
            raise RuntimeError(
                "Finish at %s because total_pages is None" % row)
        for page in range(total_pages):
            if page > 0:
                # Zero-based page index N maps to pgN+1 (pg1 == bare URL;
                # the original pg%d % page refetched pg1 and dropped the
                # final page).
                url_page = baseUrl + u"ershoufang/%s/pg%d/" % (
                    district, page + 1)
                source_code = misc.get_source_code(url_page)
                soup = BeautifulSoup(source_code, 'lxml')
            i = 0
            logging.info("Progress: %s %s: current page %s total pages %s",
                         "GetHouseByRegionlist", district, page + 1,
                         total_pages)
            data_source = []
            hisprice_data_source = []
            for ultag in soup.findAll("ul", {"class": "sellListContent"}):
                for name in ultag.find_all('li'):
                    i = i + 1
                    info_dict = {}
                    try:
                        housetitle = name.find("div", {"class": "title"})
                        info_dict.update(
                            {u'title': housetitle.a.get_text().strip()})
                        info_dict.update({u'link': housetitle.a.get('href')})
                        houseID = housetitle.a.get('data-housecode')
                        info_dict.update({u'houseID': houseID})
                        houseinfo = name.find("div", {"class": "houseInfo"})
                        info = houseinfo.get_text().split('|')
                        # Positional fields of the '|'-separated summary;
                        # missing trailing fields default to "" (replaces
                        # the original seven copy-pasted if/else branches).
                        field_names = (u'housetype', u'square', u'direction',
                                       u'decoration', u'floor', u'years',
                                       u'towertype')
                        for idx, key in enumerate(field_names):
                            info_dict.update(
                                {key: info[idx] if len(info) > idx else ""})
                        housefloor = name.find("div",
                                               {"class": "positionInfo"})
                        communityInfo = housefloor.get_text().split('-')
                        info_dict.update({u'community': communityInfo[0]})
                        if len(communityInfo) > 1:
                            info_dict.update({u'business': communityInfo[1]})
                        else:
                            info_dict.update({u'business': ""})
                        followInfo = name.find("div", {"class": "followInfo"})
                        info_dict.update(
                            {u'followInfo': followInfo.get_text().strip()})
                        taxfree = name.find("span", {"class": "taxfree"})
                        if taxfree is None:
                            info_dict.update({u"taxtype": ""})
                        else:
                            info_dict.update(
                                {u"taxtype": taxfree.get_text().strip()})
                        totalPrice = name.find("div", {"class": "totalPrice"})
                        info_dict.update(
                            {u'totalPrice': totalPrice.span.get_text()})
                        unitPrice = name.find("div", {"class": "unitPrice"})
                        info_dict.update(
                            {u'unitPrice': unitPrice.get("data-price")})
                    except Exception as e:
                        print(e, traceback.print_exc())
                        continue
                    # Houseinfo insert into mysql
                    data_source.append(info_dict)
                    hisprice_data_source.append(
                        {"houseID": info_dict["houseID"],
                         "totalPrice": info_dict["totalPrice"]})
            with model.database.atomic():
                if data_source:
                    logging.info(
                        "get_house_perregion: insert %d house info to database",
                        len(data_source))
                    model.Houseinfo.insert_many(data_source).upsert().execute()
                if hisprice_data_source:
                    # Log message fixed: these are price-history rows, not
                    # region info (copy-paste of the line above).
                    logging.info(
                        "get_house_perregion: insert %d price history rows to database",
                        len(hisprice_data_source))
                    model.Hisprice.insert_many(
                        hisprice_data_source).upsert().execute()
            # time.sleep(1)
    except Exception as e:
        print(e, traceback.print_exc())
def get_community_perregion(city, regionname=u'xicheng'):
    """Scrape community (xiaoqu) details for one region of a city.

    For every community card, the per-community detail page is fetched via
    get_communityinfo_by_url and merged into the row before a bulk upsert.

    :param city: lianjia city subdomain, e.g. u'bj'
    :param regionname: region path segment of the xiaoqu URL
    :raises RuntimeError: when the total page count cannot be determined
    """
    try:
        baseUrl = u"http://%s.lianjia.com/" % (city)
        url = baseUrl + u"xiaoqu/" + regionname + "/"
        source_code = misc.get_source_code(url)
        soup = BeautifulSoup(source_code, 'lxml')
        if check_block(soup):
            return
        total_pages = misc.get_total_pages(url)
        if total_pages is None:
            row = model.Community.select().count()
            raise RuntimeError(
                "Finish at %s because total_pages is None" % row)
        for page in range(total_pages):
            if page > 0:
                # Zero-based page index N maps to pgN+1 (pg1 == bare URL).
                url_page = baseUrl + u"xiaoqu/" + regionname + "/pg%d/" % (
                    page + 1)
                source_code = misc.get_source_code(url_page)
                soup = BeautifulSoup(source_code, 'lxml')
            nameList = soup.findAll("li", {"class": "clear"})
            i = 0
            logging.info("Progress: %s %s: current page %s total pages %s",
                         "GetCommunityByRegionlist", regionname, page + 1,
                         total_pages)
            data_source = []
            for name in nameList:  # Per community loop
                i = i + 1
                info_dict = {}
                try:
                    communitytitle = name.find("div", {"class": "title"})
                    title = communitytitle.get_text().strip('\n')
                    link = communitytitle.a.get('href')
                    info_dict.update({u'title': title})
                    info_dict.update({u'link': link})
                    district = name.find("a", {"class": "district"})
                    info_dict.update({u'district': district.get_text()})
                    bizcircle = name.find("a", {"class": "bizcircle"})
                    info_dict.update({u'bizcircle': bizcircle.get_text()})
                    tagList = name.find("div", {"class": "tagList"})
                    info_dict.update(
                        {u'tagList': tagList.get_text().strip('\n')})
                    onsale = name.find("a", {"class": "totalSellCount"})
                    info_dict.update(
                        {u'onsale': onsale.span.get_text().strip('\n')})
                    # The on-rent anchor is only addressable by its title text.
                    onrent = name.find("a", {"title": title + u"租房"})
                    info_dict.update(
                        {u'onrent':
                         onrent.get_text().strip('\n').split(u'套')[0]})
                    info_dict.update({u'id': name.get('data-housecode')})
                    price = name.find("div", {"class": "totalPrice"})
                    info_dict.update(
                        {u'price': price.span.get_text().strip('\n')})
                    communityinfo = get_communityinfo_by_url(link)
                    # .items() works on Python 2 and 3; iteritems() was 2-only.
                    for key, value in communityinfo.items():
                        info_dict.update({key: value})
                    info_dict.update({u'city': city})
                except Exception as e:
                    print(e, traceback.print_exc())
                    continue
                # communityinfo insert into mysql
                data_source.append(info_dict)
            with model.database.atomic():
                if data_source:
                    logging.info(
                        "get_community_perregion: insert %d community info to database",
                        len(data_source))
                    model.Community.insert_many(data_source).upsert().execute()
            # time.sleep(1)
    except Exception as e:
        print(e, traceback.print_exc())
def get_rent_percommunity(city, communityname):
    """Scrape rental listings matching one community name and bulk-upsert them.

    :param city: lianjia city subdomain, e.g. u'bj'
    :param communityname: community name used as the rs search term
    :raises RuntimeError: when the total page count cannot be determined
    """
    try:
        baseUrl = u"http://%s.lianjia.com/" % (city)
        url = baseUrl + u"zufang/rs" + \
            urllib2.quote(communityname.encode('utf8')) + "/"
        source_code = misc.get_source_code(url)
        soup = BeautifulSoup(source_code, 'lxml')
        if check_block(soup):
            logging.info('soup error')
            return
        total_pages = misc.get_total_pages(url)
        if total_pages is None:
            row = model.Rentinfo.select().count()
            raise RuntimeError(
                "Finish at %s because total_pages is None" % row)
        for page in range(total_pages):
            if page > 0:
                # NOTE(review): path is u"rent/..." while the first fetch
                # used u"zufang/..." — confirm against the live site.
                url_page = baseUrl + u"rent/pg%drs%s/" % (
                    page, urllib2.quote(communityname.encode('utf8')))
                source_code = misc.get_source_code(url_page)
                soup = BeautifulSoup(source_code, 'lxml')
            i = 0
            data_source = []
            for ultag in soup.findAll("div", {"class": "content__list"}):
                for name in ultag.find_all(
                        "div", {"class": "content__list--item"}):
                    i = i + 1
                    info_dict = {}
                    try:
                        housetitle = name.find(
                            "div", {"class": "content__list--item--main"})
                        info_dict.update({u'title': housetitle.find(
                            'p', {"class": "content__list--item--title"}).find(
                            'a', {'class': "twoline"}).get_text().strip()})
                        subway = name.find(
                            'i',
                            {"class": "content__item__tag--is_subway_house"})
                        if subway is None:
                            info_dict.update({u'subway': ""})
                        else:
                            info_dict.update(
                                {u'subway': subway.get_text().strip()})
                        decoration = name.find(
                            'i', {"class": 'content__item__tag--decoration'})
                        if decoration is None:
                            info_dict.update({u'decoration': ""})
                        else:
                            info_dict.update(
                                {u'decoration': decoration.get_text().strip()})
                        # houseID is the final path component of the href.
                        houseID = housetitle.a.get(
                            'href').split("/")[-1].split(".")[0]
                        info_dict.update({u'houseID': houseID})
                        region = name.find(
                            'p', {"class": "content__list--item--des"}).find('a')
                        if region is None:
                            region = ""
                        else:
                            region = region.get_text().strip()
                        info_dict.update({u'region': region})
                        zone = name.find(
                            'p',
                            {"class": "content__list--item--des"}).find_all(
                            'a')[1]
                        if zone is None:
                            zone = ""
                        else:
                            zone = zone.get_text().strip()
                        info_dict.update({u'zone': zone})
                        price = name.find(
                            "span", {"class": "content__list--item-price"})
                        if price is None:
                            price = ""
                        else:
                            price = price.get_text().strip()
                        info_dict.update({u'price': price})
                        heating = name.find(
                            "i",
                            {"class": "content__item__tag--central_heating"})
                        if heating is None:
                            heating = ""
                        else:
                            heating = heating.get_text().strip()
                        info_dict.update({u'heating': heating})
                        other = name.find(
                            'p', {"class": "content__list--item--des"})
                        if other is not None:
                            other = other.get_text().replace(
                                '\n', '').replace(' ', '').strip()
                        else:
                            other = "-/-/-/-/-"
                        info_dict.update({u'other': other})
                        # meters/pricepre are not present in this markup.
                        info_dict.update({u'meters': ""})
                        pricepre = ''
                        info_dict.update({u'pricepre': pricepre})
                        # Build the link from the requested city instead of
                        # the previously hard-coded 'bj' subdomain.
                        info_dict.update(
                            {u'link': u'https://%s.lianjia.com/zufang' % city
                             + housetitle.a.get('href')})
                    except Exception as e:
                        print(e, traceback.print_exc())
                        # Skip partial rows instead of inserting them
                        # (the original fell through and appended anyway).
                        continue
                    data_source.append(info_dict)
            with model.database.atomic():
                if data_source:
                    logging.info(
                        "get_rent_percommunity: insert %d rent info to database",
                        len(data_source))
                    model.Rentinfo.insert_many(data_source).upsert().execute()
            # time.sleep(1)
    except Exception as e:
        print(e, traceback.print_exc())
def get_rent_perregion(district):
    """Scrape rental listings for one district (legacy house-lst markup).

    :param district: district path segment of the zufang URL
    :raises RuntimeError: when the total page count cannot be determined
    """
    url = BASE_URL + u"zufang/%s/" % district
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')
    if check_block(soup):
        return
    total_pages = misc.get_total_pages(url)
    if total_pages is None:
        row = model.Rentinfo.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)
    for page in range(total_pages):
        if page > 0:
            # Zero-based page index N maps to pgN+1 (pg1 == bare URL).
            url_page = BASE_URL + u"zufang/%s/pg%d/" % (district, page + 1)
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')
        i = 0
        log_progress("GetRentByRegionlist", district, page + 1, total_pages)
        data_source = []
        for ultag in soup.findAll("ul", {"class": "house-lst"}):
            for name in ultag.find_all('li'):
                i = i + 1
                info_dict = {}
                try:
                    housetitle = name.find("div", {"class": "info-panel"})
                    info_dict.update(
                        {u'title': housetitle.h2.a.get_text().strip()})
                    info_dict.update({u'link': housetitle.a.get("href")})
                    # NOTE(review): attribute name looks unusual; other
                    # variants read 'data-housecode' — confirm against the
                    # live markup.
                    houseID = name.get("data_analysis-housecode")
                    info_dict.update({u'houseID': houseID})
                    region = name.find("span", {"class": "region"})
                    info_dict.update({u'region': region.get_text().strip()})
                    zone = name.find("span", {"class": "zone"})
                    info_dict.update({u'zone': zone.get_text().strip()})
                    meters = name.find("span", {"class": "meters"})
                    info_dict.update({u'meters': meters.get_text().strip()})
                    other = name.find("div", {"class": "con"})
                    info_dict.update({u'other': other.get_text().strip()})
                    subway = name.find("span", {"class": "fang-subway-ex"})
                    if subway is None:
                        info_dict.update({u'subway': ""})
                    else:
                        info_dict.update(
                            {u'subway': subway.span.get_text().strip()})
                    decoration = name.find("span", {"class": "decoration-ex"})
                    if decoration is None:
                        info_dict.update({u'decoration': ""})
                    else:
                        info_dict.update({
                            u'decoration':
                            decoration.span.get_text().strip()
                        })
                    heating = name.find("span", {"class": "heating-ex"})
                    # Bug fix: the original tested `decoration` here, so a
                    # listing without decoration info crashed on heating.span
                    # (and one without heating info crashed on None.span).
                    if heating is None:
                        info_dict.update({u'heating': ""})
                    else:
                        info_dict.update(
                            {u'heating': heating.span.get_text().strip()})
                    price = name.find("div", {"class": "price"})
                    info_dict.update(
                        {u'price': int(price.span.get_text().strip())})
                    pricepre = name.find("div", {"class": "price-pre"})
                    info_dict.update(
                        {u'pricepre': pricepre.get_text().strip()})
                except Exception:
                    # Skip listings with unexpected markup (was a bare except).
                    continue
                # Rentinfo insert into mysql
                data_source.append(info_dict)
        with model.database.atomic():
            # Guard added: insert_many with an empty list is invalid.
            if data_source:
                model.Rentinfo.insert_many(data_source).upsert().execute()
        time.sleep(1)
def get_community_perregion(city, regionname=u'xicheng'):
    """Scrape community details for one region, upserting row-by-row.

    Uses an explicit ON CONFLICT upsert keyed on Community.id so that
    re-runs refresh existing rows in place.

    :param city: lianjia city subdomain, e.g. u'bj'
    :param regionname: region path segment of the xiaoqu URL
    :raises RuntimeError: when the total page count cannot be determined
    """
    baseUrl = u"http://%s.lianjia.com/" % (city)
    url = baseUrl + u"xiaoqu/" + regionname + "/"
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')
    if check_block(soup):
        return
    total_pages = misc.get_total_pages(url)
    if total_pages is None:
        row = model.Community.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)
    # Pages are 1-based here; page 1 reuses the soup fetched above.
    for page in range(1, total_pages + 1):
        if page > 1:
            url_page = baseUrl + u"xiaoqu/" + regionname + "/pg%d/" % page
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')
        nameList = soup.findAll("li", {"class": "clear xiaoquListItem"})
        i = 0
        log_progress("GetCommunityByRegionlist", regionname, page,
                     total_pages)
        for name in nameList:  # Per community loop
            i = i + 1
            info_dict = {}
            try:
                communitytitle = name.find("div", {"class": "title"})
                title = communitytitle.get_text().strip('\n')
                link = communitytitle.a.get('href')
                info_dict.update({u'title': title})
                info_dict.update({u'link': link})
                district = name.find("a", {"class": "district"})
                info_dict.update({u'district': district.get_text()})
                bizcircle = name.find("a", {"class": "bizcircle"})
                info_dict.update({u'bizcircle': bizcircle.get_text()})
                tagList = name.find("div", {"class": "tagList"})
                info_dict.update({u'tagList': tagList.get_text().strip('\n')})
                onsale = name.find("a", {"class": "totalSellCount"})
                info_dict.update(
                    {u'onsale': onsale.span.get_text().strip('\n')})
                # The on-rent anchor is only addressable by its title text.
                onrent = name.find("a", {"title": title + u"租房"})
                info_dict.update(
                    {u'onrent':
                     onrent.get_text().strip('\n').split(u'套')[0]})
                info_dict.update({u'id': name.get('data-housecode')})
                price = name.find("div", {"class": "totalPrice"})
                info_dict.update(
                    {u'price': price.span.get_text().strip('\n')})
                communityinfo = get_communityinfo_by_url(link)
                # .items() works on Python 2 and 3; iteritems() was 2-only.
                for key, value in communityinfo.items():
                    info_dict.update({key: value})
                info_dict.update({u'city': city})
                info_dict.update({u'validdate': datetime.datetime.now()})
                model.Community.insert(info_dict).on_conflict(
                    conflict_target=[model.Community.id],
                    preserve=[
                        model.Community.title, model.Community.link,
                        model.Community.district, model.Community.bizcircle,
                        model.Community.tagList, model.Community.onsale,
                        model.Community.onrent, model.Community.year,
                        model.Community.housetype, model.Community.cost,
                        model.Community.service, model.Community.company,
                        model.Community.building_num,
                        model.Community.house_num, model.Community.price,
                        model.Community.city, model.Community.validdate,
                    ],
                    update={}).execute()
            except Exception as e:
                # Python-3-compatible prints (were Py2 print statements).
                print("except~~!!")
                logging.error(e)
                print(info_dict)
                continue
        time.sleep(1)
def get_house_perregion(district):
    """Scrape second-hand house listings for one district (html5lib variant).

    Parses the '/'-separated houseInfo and positionInfo summaries into
    individual fields and bulk-inserts one Houseinfo row plus one Hisprice
    row per listing, page by page.

    :param district: district path segment of the ershoufang URL
    :raises RuntimeError: when the total page count cannot be determined
    """
    url = BASE_URL + u"ershoufang/%s/" % district
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'html5lib')
    if check_block(soup):
        return
    total_pages = misc.get_total_pages(url)
    if total_pages == None:
        row = model.Houseinfo.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)
    for page in range(total_pages):
        # Pages are fetched as pg1..pgN; the initial fetch above is only
        # used to determine total_pages.
        url_page = BASE_URL + u"ershoufang/%s/pg%d/" % (district, page + 1)
        source_code = misc.get_source_code(url_page)
        soup = BeautifulSoup(source_code, 'html5lib')
        i = 0
        log_progress("GetHouseByRegionlist", district, page + 1, total_pages)
        data_source = []
        hisprice_data_source = []
        for ultag in soup.findAll("ul", {"class": "sellListContent"}):
            namearr = ultag.find_all('li', {"class": "clear"})
            for name in namearr:
                i = i + 1
                info_dict = {}
                try:
                    housetitle = name.find("div", {"class": "title"})
                    info_dict.update({u'title': housetitle.get_text().strip()})
                    info_dict.update({u'link': housetitle.a.get('href')})
                    houseID = housetitle.a.get('data-housecode')
                    info_dict.update({u'houseID': houseID})
                    houseinfo = name.find("div", {"class": "houseInfo"})
                    info = houseinfo.get_text().split('/')
                    # The community id is the path segment after 'xiaoqu/'.
                    info_communityid = houseinfo.a.get('href').split('xiaoqu/')
                    communityid = info_communityid[1].strip().rstrip('/')
                    # Strip the trailing '平米' (square-meter) unit.
                    square_info = info[2].encode("utf-8").split('平米')
                    info_dict.update({u'community': info[0]})
                    info_dict.update({u'communityid': communityid})
                    info_dict.update({u'housetype': info[1]})
                    info_dict.update({u'square': square_info[0]})
                    info_dict.update({u'direction': info[3]})
                    info_dict.update({u'decoration': info[4]})
                    housefloor = name.find("div", {"class": "positionInfo"})
                    info_housefloor = housefloor.get_text().split('/')
                    # unicode is Python's intermediate encoding; convert to
                    # utf-8 first (decode: bytes -> unicode, encode:
                    # unicode -> bytes).  '年建' means "year built".
                    info_years = info_housefloor[1].strip(
                    ).encode("utf-8").split(
                        '年建'
                    )
                    # e.g. u'中楼层(共12层)' -> floor label + total height.
                    info_floor = info_housefloor[0].split('(')
                    info_buildheight = info_floor[1].encode("utf-8").rstrip(
                        '层)').lstrip('共')
                    info_dict.update({u'years': info_years[0].strip()})
                    info_dict.update({u'buildingtype': info_years[1].strip()})
                    info_dict.update({u'floor': info_floor[0].strip()})
                    info_dict.update(
                        {u'buildheight': info_buildheight.strip()})
                    followInfo = name.find("div", {"class": "followInfo"})
                    info_dict.update(
                        {u'followInfo': followInfo.get_text().strip()})
                    # Tax status may appear under either 'taxfree' or 'five'.
                    taxfree = name.find("span", {"class": "taxfree"})
                    if taxfree == None:
                        five = name.find("span", {"class": "five"})
                        if five == None:
                            info_dict.update({u"taxtype": ""})
                        else:
                            info_dict.update(
                                {u"taxtype": five.get_text().strip()})
                    else:
                        info_dict.update(
                            {u"taxtype": taxfree.get_text().strip()})
                    totalPrice = name.find("div", {"class": "totalPrice"})
                    info_dict.update(
                        {u'totalPrice': totalPrice.span.get_text()})
                    unitPrice = name.find("div", {"class": "unitPrice"})
                    info_dict.update(
                        {u'unitPrice': unitPrice.get("data-price")})
                except:
                    # Listings with unexpected markup are skipped silently.
                    continue
                # Houseinfo insert into mysql
                data_source.append(info_dict)
                hisprice_data_source.append({
                    "houseID": info_dict["houseID"],
                    "totalPrice": info_dict["totalPrice"]
                })
        #model.Houseinfo.insert(**info_dict).upsert().execute()
        #model.Hisprice.insert(houseID=info_dict['houseID'], totalPrice=info_dict['totalPrice']).upsert().execute()
        with model.database.atomic():
            model.Houseinfo.insert_many(data_source).upsert().execute()
            model.Hisprice.insert_many(hisprice_data_source).upsert().execute()
        time.sleep(1)
def get_house_perregion(city, district):
    """Scrape second-hand house listings for one district of a city.

    Upserts one Houseinfo row (keyed on houseID) and one Hisprice row
    (keyed on houseID + totalPrice) per listing.

    :param city: lianjia city subdomain; 'bj' uses '/'-separated houseInfo,
        other cities use '|'
    :param district: district path segment of the ershoufang URL
    :raises RuntimeError: when the total page count cannot be determined
    """
    baseUrl = u"http://%s.lianjia.com/" % (city)
    url = baseUrl + u"ershoufang/%s/" % district
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')
    if check_block(soup):
        return
    total_pages = misc.get_total_pages(url)
    if total_pages is None:
        row = model.Houseinfo.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)
    for page in range(total_pages):
        if page > 0:
            # Zero-based page index N maps to pgN+1 (pg1 == bare URL).
            url_page = baseUrl + u"ershoufang/%s/pg%d/" % (district, page + 1)
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')
        i = 0
        log_progress("GetHouseByRegionlist", district, page + 1, total_pages)
        for ultag in soup.findAll("ul", {"class": "sellListContent"}):
            for name in ultag.find_all('li'):
                i = i + 1
                info_dict = {}
                try:
                    housetitle = name.find("div", {"class": "title"})
                    info_dict.update(
                        {u'title': housetitle.a.get_text().strip()})
                    info_dict.update({u'link': housetitle.a.get('href')})
                    houseID = housetitle.a.get('data-housecode')
                    info_dict.update({u'houseID': houseID})
                    houseinfo = name.find("div", {"class": "houseInfo"})
                    # Beijing pages separate fields with '/', others with '|'.
                    if city == 'bj':
                        info = houseinfo.get_text().split('/')
                    else:
                        info = houseinfo.get_text().split('|')
                    info_dict.update({u'community': info[0]})
                    info_dict.update({u'housetype': info[1]})
                    info_dict.update({u'square': info[2]})
                    info_dict.update({u'direction': info[3]})
                    info_dict.update({u'decoration': info[4]})
                    housefloor = name.find("div", {"class": "positionInfo"})
                    # NOTE(review): years and floor store the same raw
                    # positionInfo text — confirm this is intended.
                    info_dict.update(
                        {u'years': housefloor.get_text().strip()})
                    info_dict.update(
                        {u'floor': housefloor.get_text().strip()})
                    followInfo = name.find("div", {"class": "followInfo"})
                    info_dict.update(
                        {u'followInfo': followInfo.get_text().strip()})
                    taxfree = name.find("span", {"class": "taxfree"})
                    if taxfree is None:
                        info_dict.update({u"taxtype": ""})
                    else:
                        info_dict.update(
                            {u"taxtype": taxfree.get_text().strip()})
                    totalPrice = name.find("div", {"class": "totalPrice"})
                    info_dict.update(
                        {u'totalPrice': totalPrice.span.get_text()})
                    unitPrice = name.find("div", {"class": "unitPrice"})
                    info_dict.update(
                        {u'unitPrice': unitPrice.get("data-price")})
                    info_dict.update({u'validdate': datetime.datetime.now()})
                    model.Houseinfo.insert(info_dict).on_conflict(
                        conflict_target=[model.Houseinfo.houseID],
                        preserve=[
                            model.Houseinfo.title, model.Houseinfo.link,
                            model.Houseinfo.community, model.Houseinfo.years,
                            model.Houseinfo.housetype, model.Houseinfo.square,
                            model.Houseinfo.direction, model.Houseinfo.floor,
                            model.Houseinfo.taxtype,
                            model.Houseinfo.totalPrice,
                            model.Houseinfo.unitPrice,
                            model.Houseinfo.followInfo,
                            model.Houseinfo.decoration,
                            model.Houseinfo.validdate,
                        ],
                        update={}).execute()
                    hisprice = {"houseID": info_dict["houseID"],
                                "totalPrice": info_dict["totalPrice"]}
                    model.Hisprice.insert(hisprice).on_conflict(
                        conflict_target=[model.Hisprice.houseID,
                                         model.Hisprice.totalPrice],
                        preserve=[model.Hisprice.houseID,
                                  model.Hisprice.totalPrice,
                                  model.Hisprice.date],
                        update={}).execute()
                except Exception as e:
                    # Python-3-compatible prints (were Py2 print statements).
                    print("except~~!!")
                    logging.error(e)
                    print(info_dict)
                    continue
        # The original declared data_source/hisprice_data_source and
        # bulk-inserted them, but never appended to either — rows are
        # written per-listing above, so that dead no-op block was removed.
        time.sleep(1)
def get_lists_perword(keyword):
    """Scrape job-search result pages for one keyword and upsert each item.

    :param keyword: search keyword; '+' in the keyword is stored as a space
        in the row's label
    :raises RuntimeError: when the total page count cannot be determined
    """
    # NOTE(review): pg_no is not defined in this function, so it must be a
    # module-level name — confirm; otherwise this raises NameError.
    url = BASE_URL % (keyword, pg_no)
    total_pages = misc.get_total_pages(url)
    print('The total pages number is: ', total_pages)
    logging.info('The total pages number is: ' + str(total_pages))
    time.sleep(3)
    if total_pages is None:
        row = model.quickinfo.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)
    for page in range(total_pages):
        url_page = BASE_URL % (keyword, page)
        source_code = misc.get_source_code(url_page)
        soup = BeautifulSoup(source_code, 'lxml')
        itemList = soup.findAll("div", {"class": "result-sherlock-cell"})
        print('The items are: ', len(itemList))
        logging.info('The items are: ' + str(len(itemList)))
        for item in itemList:
            info_dict = {}
            try:
                id = item.get('id')
                info_dict.update({'id': id})
                link_title = item.find("h3", {"class": "job-title"})
                info_dict.update({'title': link_title.get_text()})
                info_dict.update({'link': link_title.a.get('href')})
                # A card may carry several description bullets; join them.
                if item.find("li", {"itemprop": "description"}):
                    descriptions = item.findAll(
                        "li", {"itemprop": "description"})
                    summary_list = [desc.get_text() for desc in descriptions]
                    summary = ' | '.join(summary_list)
                    info_dict.update({'summary': summary})
                else:
                    info_dict.update({'summary': 'no description'})
                if item.find('p', {'class': 'job-quickinfo-salary'}):
                    wage = item.find('p', {
                        'class': 'job-quickinfo-salary'
                    }).get_text()
                    info_dict.update({'salary': wage})
                else:
                    info_dict.update({'salary': 'not specified'})
                date = item.find("div", {
                    "class": "job-quickinfo"
                }).meta.get('content')
                info_dict.update({'postdate': date})
                label = keyword.replace('+', ' ')
                info_dict.update({'label': label})
            except Exception:
                # Skip cards with unexpected markup (was a bare except).
                logging.info('The item of searching results is not found')
                continue
            quickinfo.insert(**info_dict).upsert().execute()
        time.sleep(5)
def get_community_perregion(city, regionname=u'xicheng'):
    """Scrape community (xiaoqu) details for one region and bulk-upsert them.

    :param city: lianjia city subdomain, e.g. u'bj'
    :param regionname: region path segment of the xiaoqu URL
    :raises RuntimeError: when the total page count cannot be determined
    """
    baseUrl = u"http://%s.lianjia.com/" % (city)
    url = baseUrl + u"xiaoqu/" + regionname + "/"
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')
    if check_block(soup):
        return
    total_pages = misc.get_total_pages(url)
    if total_pages is None:
        row = model.Community.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)
    for page in range(total_pages):
        if page > 0:
            # Zero-based page index N maps to pgN+1 (pg1 == bare URL; the
            # original pg%d % page refetched pg1 and dropped the last page).
            url_page = baseUrl + u"xiaoqu/" + regionname + "/pg%d/" % (
                page + 1)
            logging.info("fetching from %s", url_page)
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')
        nameList = soup.findAll("li", {"class": "xiaoquListItem"})
        i = 0
        log_progress("GetCommunityByRegionlist", regionname, page + 1,
                     total_pages)
        data_source = []
        for name in nameList:  # Per community loop
            i = i + 1
            info_dict = {}
            try:
                communitytitle = name.find("div", {"class": "title"})
                title = communitytitle.get_text().strip('\n')
                link = communitytitle.a.get('href')
                info_dict.update({u'title': title})
                info_dict.update({u'link': link})
                district = name.find("a", {"class": "district"})
                info_dict.update({u'district': district.get_text()})
                bizcircle = name.find("a", {"class": "bizcircle"})
                info_dict.update({u'bizcircle': bizcircle.get_text()})
                tagList = name.find("div", {"class": "tagList"})
                info_dict.update({u'tagList': tagList.get_text().strip('\n')})
                onsale = name.find("a", {"class": "totalSellCount"})
                info_dict.update(
                    {u'onsale': onsale.span.get_text().strip('\n')})
                # The on-rent anchor is only addressable by its title text.
                onrent = name.find("a", {"title": title + u"租房"})
                info_dict.update(
                    {u'onrent':
                     onrent.get_text().strip('\n').split(u'套')[0]})
                info_dict.update({u'id': name.get('data-housecode')})
                price = name.find("div", {"class": "totalPrice"})
                info_dict.update(
                    {u'price': price.span.get_text().strip('\n')})
                communityinfo = get_communityinfo_by_url(link)
                for key, value in communityinfo.items():
                    info_dict.update({key: value})
                info_dict.update({u'city': city})
            except Exception as e:
                traceback.print_exc()
                continue
            # communityinfo insert into mysql
            data_source.append(info_dict)
        with model.database.atomic():
            if data_source:
                model.Community.insert_many(data_source).upsert().execute()
        time.sleep(1)
def get_sell_percommunity(city, communityname, threads=30):
    """Scrape closed-deal (chengjiao) listings for one community in parallel.

    Each result page's listing nodes are distributed to `threads` worker
    processes via a shared queue; workers append parsed rows to a shared
    Manager list, which is then bulk-upserted into model.Sellinfo.

    :param city: lianjia city subdomain, e.g. u'bj'
    :param communityname: community name used as the rs search term
    :param threads: number of worker *processes* spawned per page
    :raises RuntimeError: when the total page count cannot be determined
    """
    baseUrl = u"http://%s.lianjia.com/" % (city)
    url = baseUrl + u"chengjiao/rs" + \
        urllib2.quote(communityname.encode('utf8')) + "/"
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')
    if check_block(soup):
        return
    total_pages = misc.get_total_pages(url)
    if total_pages == None:
        row = model.Sellinfo.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)
    for page in range(total_pages):
        if page > 0:
            url_page = baseUrl + \
                u"chengjiao/pg%drs%s/" % (page,
                                          urllib2.quote(
                                              communityname.encode('utf8')))
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')
        log_progress("GetSellByCommunitylist", communityname, page + 1,
                     total_pages)
        # logging.info("start")
        data_source = []
        nameList = []
        for ultag in soup.findAll("ul", {"class": "listContent"}):
            for name in ultag.find_all('li'):
                nameList.append(name)
        # Shared result list and work queue of node indices for the workers.
        info_ls_mult = Manager().list()
        nameid_q = Queue()
        for i in range(len(nameList)):
            nameid_q.put(i)
        processes = []
        try:
            for i in range(threads):
                proc = Process(target=get_sell_worker,
                               args=(nameid_q, nameList, info_ls_mult,
                                     communityname, city,))
                processes.append(proc)
                proc.start()
            for proc in processes:
                proc.join()
        except KeyboardInterrupt:
            # User abort: kill any still-running workers before moving on.
            print("Emergency terminate")
            print("killing %d processes" % (len(processes)))
            for proc in processes:
                proc.terminate()
        data_source = list(info_ls_mult)
        if len(data_source) == 0:
            # No worker produced a row: likely blocked or bad markup —
            # give up on this community entirely.
            print(colored("sth is wrong with %s, give up on this one"
                          % communityname, "red"))
            break
        print("Finished with %d at %s" % (len(data_source), communityname))
        with model.database.atomic():
            if data_source:
                model.Sellinfo.insert_many(data_source).upsert().execute()
                logging.info("Writing to database")
        time.sleep(1)
def get_community_perregion(city, regionname=u'xicheng', threads=30):
    """Scrape community (xiaoqu) details for one region using worker processes.

    Listing nodes of each page are distributed to `threads` worker processes
    via a shared queue; workers append parsed rows to a shared Manager list
    which is bulk-upserted into model.Community per page.

    :param city: lianjia city subdomain, e.g. u'bj'
    :param regionname: region path segment of the xiaoqu URL
    :param threads: number of worker *processes* spawned per page
    :raises RuntimeError: when the total page count cannot be determined
    """
    baseUrl = u"http://%s.lianjia.com/" % (city)
    url = baseUrl + u"xiaoqu/" + regionname + "/"
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')
    if check_block(soup):
        return
    total_pages = misc.get_total_pages(url)
    if total_pages is None:
        row = model.Community.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)
    for page in range(total_pages):
        try:
            if page > 0:
                url_page = baseUrl + u"xiaoqu/" + regionname + "/pg%d/" % page
                source_code = misc.get_source_code(url_page)
                soup = BeautifulSoup(source_code, 'lxml')
            nameList = soup.findAll("li", {"class": "clear"})
            i = 0
            log_progress("GetCommunityByRegionlist", regionname, page + 1,
                         total_pages)
            data_source = []
            # Shared result list and work queue of node indices.
            info_ls_mult = Manager().list()
            nameid_q = Queue()
            for i in range(len(nameList)):
                nameid_q.put(i)
            processes = []
            try:
                for i in range(threads):
                    proc = Process(target=community_info_worker,
                                   args=(nameid_q, nameList, info_ls_mult,
                                         regionname, city,))
                    processes.append(proc)
                    proc.start()
                for proc in processes:
                    proc.join()
            except KeyboardInterrupt:
                # User abort: kill still-running workers before moving on.
                print("Emergency terminate")
                print("killing %d processes" % (len(processes)))
                for proc in processes:
                    proc.terminate()
            data_source = list(info_ls_mult)
            if len(data_source) == 0:
                # No worker produced a row: likely blocked — give up on
                # this region entirely.
                print(colored("sth is wrong with %s, give up on this one"
                              % regionname, "red"))
                break
            print("Finished with %d at %s" % (len(data_source), regionname))
            with model.database.atomic():
                print("submitting to dataset")
                if data_source:
                    model.Community.insert_many(data_source).upsert().execute()
            if page % 4 == 0:
                # Periodic progress report: count rows scraped for this city.
                cnt = []
                for community in model.Community.select():
                    if community.city == city:
                        cnt.append(community.title)
                print(" %d Community scraped: %d" % (page, len(cnt)))
            time.sleep(2)
        except Exception:
            # Was a bare `except:`, which also swallowed KeyboardInterrupt
            # and made the whole scrape impossible to abort; narrowed so
            # Ctrl-C propagates while page-level failures are still skipped.
            print(colored("Failed at %d - %s" % (page, regionname), "red"))
            continue
def get_rent_percommunity(communityname):
    """Scrape rental (zufang) listings for one community into Rentinfo.

    Walks every result page of the lianjia zufang search for
    *communityname*, parses each ``content__list--item`` card into a dict
    and writes each page's results in one atomic ``insert_many`` upsert.

    :param communityname: community name used as the ``rs`` search term.
    :raises RuntimeError: when the page count cannot be parsed.
    """
    url = BASE_URL + u"zufang/rs" + urllib.request.quote(
        communityname.encode('utf8')) + "/"
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')

    # Bail out when the anti-crawler block page was served.
    if check_block(soup):
        return
    total_pages = misc.get_total_pages(url)
    if total_pages is None:
        row = model.Rentinfo.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)

    for page in range(total_pages):
        if page > 0:
            # Fix: pagination previously used u"rent/..." while the first
            # page used u"zufang/..."; unified on zufang so later pages hit
            # the same listing index as page one.
            url_page = BASE_URL + u"zufang/pg%drs%s/" % (
                page, urllib.request.quote(communityname.encode('utf8')))
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')
        i = 0
        # Progress is reported once per page (the old per-item call inside
        # the loop was redundant).
        log_progress("GetRentByCommunitylist", communityname, page + 1,
                     total_pages)
        data_source = []
        for ultag in soup.findAll("div", {"class": "content__list"}):
            for name in ultag.find_all("div",
                                       {"class": "content__list--item"}):
                i = i + 1
                info_dict = {}
                tempStr = ''
                try:
                    housetitleOb = name.find(
                        "p", {"class": "content__list--item--title"})
                    # housetitle IS the <a> tag; the old code read
                    # housetitle.a (always None), so every listing fell into
                    # the except branch and nothing was ever stored.
                    housetitle = housetitleOb.find("a")
                    info_dict.update({u'title': housetitle.get_text().strip()})
                    info_dict.update({u'link': housetitle.get('href')})
                    houseID = housetitle.get('href').split("/")[-1].split(
                        ".")[0]
                    info_dict.update({u'houseID': houseID})

                    # region = concatenated <a> crumbs; zone/meters keep the
                    # raw description text (same value, as before).
                    tempInfo = name.find(
                        "p", {"class": "content__list--item--des"})
                    for tempData in tempInfo.find_all("a"):
                        tempStr += tempData.get_text().strip()
                    info_dict.update({u'region': tempStr})
                    info_dict.update({u'zone': tempInfo.get_text().strip()})
                    info_dict.update({u'meters': tempInfo.get_text().strip()})

                    other = name.find("span", {"class": "hide"})
                    info_dict.update({u'other': other.get_text().strip()})

                    # Optional tags: store '' when the badge is absent.
                    subway = name.find(
                        "i", {"class": "content__item__tag--is_subway_house"})
                    if subway is None:
                        info_dict.update({u'subway': ''})
                    else:
                        info_dict.update(
                            {u'subway': subway.span.get_text().strip()})

                    decoration = name.find(
                        "i", {"class": "content__item__tag--decoration"})
                    if decoration is None:
                        info_dict.update({u'decoration': ''})
                    else:
                        info_dict.update({
                            u'decoration':
                            decoration.span.get_text().strip()
                        })

                    heating = name.find(
                        "i", {"class": "content__item__tag--two_bathroom"})
                    # Bug fix: this branch tested `decoration is None`, so
                    # heating.span raised whenever decoration existed but the
                    # heating badge did not, dropping the listing.
                    if heating is None:
                        info_dict.update({u'heating': ''})
                    else:
                        info_dict.update(
                            {u'heating': heating.span.get_text().strip()})

                    priceT = name.find(
                        "span", {"class": "content__list--item-price"})
                    price = priceT.find("em")
                    info_dict.update(
                        {u'price': int(price.span.get_text().strip())})
                    pricepre = name.find(
                        "p", {"class": "content__list--item--time oneline"})
                    info_dict.update(
                        {u'pricepre': pricepre.get_text().strip()})
                except:
                    continue

                # Rentinfo insert into mysql
                data_source.append(info_dict)
                # model.Rentinfo.insert(**info_dict).upsert().execute()

        with model.database.atomic():
            model.Rentinfo.insert_many(data_source).upsert().execute()
        time.sleep(1)
def get_house_percommunity(communityname):
    """Scrape second-hand-sale (ershoufang) listings for one community.

    Parses each <li> of the ``sellListContent`` list into a dict and
    bulk-upserts a page's worth of rows into ``model.Houseinfo`` plus a
    (houseID, totalPrice) snapshot into ``model.Hisprice``.

    :param communityname: community name used as the ``rs`` search term.
    """
    url = BASE_URL + u"ershoufang/rs" + urllib.request.quote(
        communityname.encode('utf8')) + "/"
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')
    # Bail out when the anti-crawler block page was served.
    if check_block(soup):
        return
    total_pages = misc.get_total_pages(url)
    if total_pages == None:
        # Unknown page count: skip this community instead of aborting the run.
        return
        #row = model.Houseinfo.select().count()
        #raise RuntimeError("Finish at %s because total_pages is None" % row)
    for page in range(total_pages):
        if page > 0:
            url_page = BASE_URL + u"ershoufang/pg%drs%s/" % (
                page, urllib.request.quote(communityname.encode('utf8')))
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')
        i = 0
        log_progress("GetHouseByCommunitylist", communityname, page + 1,
                     total_pages)
        data_source = []
        hisprice_data_source = []
        for ultag in soup.findAll("ul", {"class": "sellListContent"}):
            for name in ultag.find_all('li'):
                i = i + 1
                info_dict = {}
                try:
                    housetitle = name.find("div", {"class": "title"})
                    info_dict.update(
                        {u'title': housetitle.a.get_text().strip()})
                    info_dict.update({u'link': housetitle.a.get('href')})
                    # The listing id lives in one of two data attributes
                    # depending on page variant; skip rows exposing neither.
                    houseID = housetitle.a.get('data-lj_action_housedel_id')
                    if houseID == None:
                        houseID = housetitle.a.get('data-housecode')
                    if houseID == None:
                        continue
                    info_dict.update({u'houseID': houseID})
                    # houseInfo is a '|'-separated positional field list;
                    # assumed order: type|square|direction|decoration|floor|
                    # years -- TODO confirm against current page markup.
                    houseinfo = name.find("div", {"class": "houseInfo"})
                    info = houseinfo.get_text().split('|')
                    info_dict.update({u'housetype': info[0]})
                    info_dict.update({u'square': info[1]})
                    info_dict.update({u'direction': info[2]})
                    info_dict.update({u'decoration': info[3]})
                    info_dict.update({u'floor': info[4]})
                    info_dict.update({u'years': info[5]})
                    housefloor = name.find("div", {"class": "positionInfo"})
                    housefloorInfo = housefloor.get_text().split(' - ')
                    info_dict.update({u'community': housefloorInfo[0]})
                    followInfo = name.find("div", {"class": "followInfo"})
                    info_dict.update(
                        {u'followInfo': followInfo.get_text().strip()})
                    taxfree = name.find("span", {"class": "taxfree"})
                    if taxfree == None:
                        info_dict.update({u"taxtype": ""})
                    else:
                        info_dict.update(
                            {u"taxtype": taxfree.get_text().strip()})
                    totalPrice = name.find("div", {"class": "totalPrice"})
                    info_dict.update(
                        {u'totalPrice': totalPrice.span.get_text()})
                    unitPrice = name.find("div", {"class": "unitPrice"})
                    info_dict.update(
                        {u'unitPrice': unitPrice.get("data-price")})
                except:
                    continue
                # Houseinfo insert into mysql
                data_source.append(info_dict)
                hisprice_data_source.append({
                    "houseID": info_dict["houseID"],
                    "totalPrice": info_dict["totalPrice"]
                })
                #model.Houseinfo.insert(**info_dict).upsert().execute()
                #model.Hisprice.insert(houseID=info_dict['houseID'], totalPrice=info_dict['totalPrice']).upsert().execute()
            # NOTE(review): only the first sellListContent <ul> is consumed;
            # presumably there is just one per page -- confirm.
            break
        if data_source == []:
            continue
        with model.database.atomic():
            logging.info("data_source : " + json.dumps(data_source))
            model.Houseinfo.insert_many(data_source).upsert().execute()
            model.Hisprice.insert_many(hisprice_data_source).upsert().execute()
        time.sleep(1)
def get_house_percommunity(city, communityname):
    """Scrape ershoufang listings for one community of *city* (Python 2).

    Each listing is upserted row-by-row into ``model.Houseinfo`` keyed on
    houseID, and a (houseID, totalPrice) snapshot goes into
    ``model.Hisprice``.

    :param city: lianjia city subdomain, e.g. ``"bj"``.
    :param communityname: community name used as the ``rs`` search term.
    :raises RuntimeError: when the page count cannot be parsed.
    """
    baseUrl = u"http://%s.lianjia.com/" % (city)
    url = baseUrl + u"ershoufang/rs" + \
        urllib2.quote(communityname.encode('utf8')) + "/"
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')
    # Bail out when the anti-crawler block page was served.
    if check_block(soup):
        return
    total_pages = misc.get_total_pages(url)
    if total_pages == None:
        row = model.Houseinfo.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)
    # Storage starts from page 2: page 1 was already fetched above, so its
    # URL needs no rewriting. (Original note: 从第二页才开始存入。page1在初
    # 始化的时候就已经放进去了,无须修改链接)
    for page in range(1, total_pages + 1):
        if page > 1:
            url_page = baseUrl + \
                u"ershoufang/pg%drs%s/" % (page,
                                           urllib2.quote(communityname.encode('utf8')))
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')
            print url_page
        nameList = soup.findAll("li", {"class": "clear"})
        i = 0
        log_progress("GetHouseByCommunitylist", communityname, page,
                     total_pages)
        data_source = []
        hisprice_data_source = []
        for name in nameList:  # per house loop
            i = i + 1
            info_dict = {}
            try:
                housetitle = name.find("div", {"class": "title"})
                info_dict.update({u'title': housetitle.a.get_text().strip()})
                info_dict.update({u'link': housetitle.a.get('href')})
                houseaddr = name.find("div", {"class": "address"})
                # Beijing pages separate address fields with '/', other
                # cities with '|'.
                if city == 'bj':
                    info = houseaddr.div.get_text().split('/')
                else:
                    info = houseaddr.div.get_text().split('|')
                info_dict.update({u'community': communityname})
                info_dict.update({u'housetype': info[1].strip()})
                info_dict.update({u'square': info[2].strip()})
                info_dict.update({u'direction': info[3].strip()})
                info_dict.update({u'decoration': info[4].strip()})
                housefloor = name.find("div", {"class": "flood"})
                # "flood" div text looks like "<floor info> - ..."; first
                # space-separated token is the floor, last is the year.
                floor_all = housefloor.div.get_text().split(
                    '-')[0].strip().split(' ')
                info_dict.update({u'floor': floor_all[0].strip()})
                info_dict.update({u'years': floor_all[-1].strip()})
                followInfo = name.find("div", {"class": "followInfo"})
                info_dict.update({u'followInfo': followInfo.get_text()})
                tax = name.find("div", {"class": "tag"})
                info_dict.update({u'taxtype': tax.get_text().strip()})
                totalPrice = name.find("div", {"class": "totalPrice"})
                info_dict.update({u'totalPrice': totalPrice.span.get_text()})
                unitPrice = name.find("div", {"class": "unitPrice"})
                info_dict.update({u'unitPrice': unitPrice.get('data-price')})
                info_dict.update({u'houseID': unitPrice.get('data-hid')})
                info_dict.update({u'validdate': datetime.datetime.now()})
                # Upsert keyed on houseID; on conflict keep latest values of
                # the preserved columns.
                model.Houseinfo.insert(info_dict).on_conflict(conflict_target=[model.Houseinfo.houseID], \
                    preserve=[model.Houseinfo.title, model.Houseinfo.link, model.Houseinfo.community, \
                    model.Houseinfo.years, model.Houseinfo.housetype, model.Houseinfo.square, \
                    model.Houseinfo.direction, model.Houseinfo.floor, model.Houseinfo.taxtype, \
                    model.Houseinfo.totalPrice, model.Houseinfo.unitPrice, model.Houseinfo.followInfo, \
                    model.Houseinfo.decoration, model.Houseinfo.validdate],update={}).execute()
                hisprice = {"houseID": info_dict["houseID"],
                            "totalPrice": info_dict["totalPrice"]}
                model.Hisprice.insert(hisprice).on_conflict(conflict_target = [model.Hisprice.houseID, model.Hisprice.totalPrice],\
                    preserve=[model.Hisprice.houseID,model.Hisprice.totalPrice,model.Hisprice.date],\
                    update={}).execute()
            except Exception as e:
                logging.error(e)
                print info_dict
                continue
            # houseinfo insert into mysql
        time.sleep(1)
def get_house_percommunity(city, communityname):
    """Scrape ershoufang listings for one community of *city*.

    Parses the '|'-separated address field positionally (with length guards
    for the optional years/towertype columns) and bulk-upserts each page into
    ``model.Houseinfo`` / ``model.Hisprice``. The whole crawl is wrapped in a
    top-level try/except so one failing community only logs and returns.

    :param city: lianjia city subdomain, e.g. ``"bj"``.
    :param communityname: community name used as the ``rs`` search term.
    :raises RuntimeError: when the page count cannot be parsed.
    """
    try:
        baseUrl = u"http://%s.lianjia.com/" % (city)
        url = baseUrl + u"ershoufang/rs" + urllib2.quote(communityname.encode('utf8')) + "/"
        source_code = misc.get_source_code(url)
        soup = BeautifulSoup(source_code, 'lxml')
        # Bail out when the anti-crawler block page was served.
        if check_block(soup):
            return
        total_pages = misc.get_total_pages(url)
        if total_pages is None:
            row = model.Houseinfo.select().count()
            raise RuntimeError("Finish at %s because total_pages is None" % row)
        for page in range(total_pages):
            if page > 0:
                url_page = baseUrl + \
                    u"ershoufang/pg%drs%s/" % (page,
                                               urllib2.quote(communityname.encode('utf8')))
                source_code = misc.get_source_code(url_page)
                soup = BeautifulSoup(source_code, 'lxml')
            nameList = soup.findAll("li", {"class": "clear"})
            i = 0
            logging.info("Progress: %s %s: current page %s total pages %s",
                         "GetHouseByCommunitylist", communityname, page + 1,
                         total_pages)
            data_source = []
            hisprice_data_source = []
            for name in nameList:  # per house loop
                i = i + 1
                info_dict = {}
                try:
                    housetitle = name.find("div", {"class": "title"})
                    info_dict.update({u'title': housetitle.a.get_text().strip()})
                    info_dict.update({u'link': housetitle.a.get('href')})
                    houseaddr = name.find("div", {"class": "address"})
                    # Positional '|'-split; indices 5 (years) and 6
                    # (towertype) are optional, hence the len() guards.
                    info = houseaddr.div.get_text().split('|')
                    info_dict.update({u'community': communityname})
                    info_dict.update({u'housetype': info[0].strip()})
                    info_dict.update({u'square': info[1].strip()})
                    if len(info) > 6:
                        info_dict.update({u'direction': info[2].strip() + "-" + info[6].strip()})
                    else:
                        info_dict.update({u'direction': info[2].strip()})
                    info_dict.update({u'decoration': info[3].strip()})
                    if len(info) > 5:
                        info_dict.update({u'years': info[5].strip()})
                    else:
                        info_dict.update({u'years': ''})
                    if len(info) > 6:
                        info_dict.update({u'towertype': info[6]})
                    else:
                        info_dict.update({u'towertype': None})
                    housefloor = name.find("div", {"class": "positionInfo"})
                    communityInfo = housefloor.get_text().split('-')
                    if len(communityInfo) > 1:
                        info_dict.update({u'business': communityInfo[1]})
                    else:
                        info_dict.update({u'business': None})
                    # Reuses the housefloor name for the "flood" div (floor
                    # text before the '-').
                    housefloor = name.find("div", {"class": "flood"})
                    floor_all = housefloor.div.get_text().split(
                        '-')[0].strip().split(' ')
                    if len(info) > 5:
                        info_dict.update({u'floor': floor_all[0].strip() + '-' + info[4].strip()})
                    else:
                        info_dict.update({u'floor': floor_all[0].strip() + '-'})
                    followInfo = name.find("div", {"class": "followInfo"})
                    info_dict.update({u'followInfo': followInfo.get_text()})
                    tax = name.find("div", {"class": "tag"})
                    info_dict.update({u'taxtype': tax.get_text().strip()})
                    totalPrice = name.find("div", {"class": "totalPrice"})
                    info_dict.update({u'totalPrice': totalPrice.span.get_text()})
                    unitPrice = name.find("div", {"class": "unitPrice"})
                    info_dict.update({u'unitPrice': unitPrice.get('data-price')})
                    info_dict.update({u'houseID': unitPrice.get('data-hid')})
                except Exception as e:
                    # Parse failure: log the offending node and move on.
                    print(e, traceback.print_exc())
                    logging.info('parse error: %s', name)
                    continue
                data_source.append(info_dict)
                hisprice_data_source.append(
                    {"houseID": info_dict["houseID"],
                     "totalPrice": info_dict["totalPrice"]})
            with model.database.atomic():
                if data_source:
                    logging.info("get_house_percommunity: insert %d house info to database",
                                 len(data_source))
                    model.Houseinfo.insert_many(data_source).upsert().execute()
                if hisprice_data_source:
                    logging.info("get_house_percommunity: insert %d hisprice data source info to database",
                                 len(hisprice_data_source))
                    model.Hisprice.insert_many(
                        hisprice_data_source).upsert().execute()
    except Exception as e:
        print(e, traceback.print_exc())
def get_house_perregion(city, district):
    """Scrape ershoufang listings for one district of *city*.

    Deduplicates listings across pages via the ``house_ids`` set, then
    per page upserts each row into ``model.Houseinfo`` (keyed on houseID)
    and appends a (houseID, totalPrice) snapshot to ``model.Hisprice``.

    :param city: lianjia city subdomain, e.g. ``"bj"``.
    :param district: district slug appended to ``/ershoufang/``.
    :raises RuntimeError: when the page count cannot be parsed.
    """
    baseUrl = u"http://%s.lianjia.com/" % (city)
    url = baseUrl + u"ershoufang/%s/" % district
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')
    # Bail out when the anti-crawler block page was served.
    if check_block(soup):
        return
    total_pages = misc.get_total_pages(url)
    if total_pages is None:
        row = model.Houseinfo.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)
    #total_pages = 10
    house_ids = set()  # houseIDs already seen this run (cross-page dedup)
    for page in range(total_pages):
        if page > 0:
            url_page = baseUrl + u"ershoufang/%s/pg%d/" % (district, page)
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')
        i = 0
        log_progress("GetHouseByRegionlist", district, page + 1, total_pages)
        data_source = []
        hisprice_data_source = []
        ultags = soup.findAll("ul", {"class": "sellListContent"})
        for ultag in ultags:
            for name in ultag.find_all('li'):
                i = i + 1
                info_dict = {}
                try:
                    housetitle = name.find("div", {"class": "title"})
                    info_dict.update(
                        {u'title': housetitle.a.get_text().strip()})
                    info_dict.update({u'link': housetitle.a.get('href')})
                    houseID = housetitle.a.get('data-housecode')
                    # Deduplicate on houseID (original note: 对houseID进行去重)
                    if houseID in house_ids:
                        continue
                    info_dict.update({u'houseID': houseID})
                    # '|'-separated positional fields: type|square|direction|
                    # decoration|floor|years.
                    houseinfo = name.find("div", {"class": "houseInfo"})
                    info = houseinfo.get_text().split('|')
                    #info_dict.update({u'community': info[0]})
                    info_dict.update({u'housetype': info[0]})
                    info_dict.update({u'square': info[1]})
                    info_dict.update({u'direction': info[2]})
                    info_dict.update({u'decoration': info[3]})
                    info_dict.update({u'floor': info[4]})
                    info_dict.update({u'years': info[5]})
                    housefloor = name.find("div", {"class": "positionInfo"})
                    communityInfo = housefloor.get_text().split('-')
                    info_dict.update({u'community': communityInfo[0]})
                    #info_dict.update({u'years': housefloor.get_text().strip()})
                    #info_dict.update({u'floor': housefloor.get_text().strip()})
                    followInfo = name.find("div", {"class": "followInfo"})
                    info_dict.update(
                        {u'followInfo': followInfo.get_text().strip()})
                    # taxfree badge is optional; store "" when absent.
                    taxfree = name.find("span", {"class": "taxfree"})
                    if taxfree is None:
                        info_dict.update({u"taxtype": ""})
                    else:
                        info_dict.update(
                            {u"taxtype": taxfree.get_text().strip()})
                    totalPrice = name.find("div", {"class": "totalPrice"})
                    info_dict.update(
                        {u'totalPrice': totalPrice.span.get_text()})
                    unitPrice = name.find("div", {"class": "unitPrice"})
                    info_dict.update(
                        {u'unitPrice': unitPrice.get("data-price")})
                    info_dict.update({"validdate": datetime.datetime.now()})
                except:
                    continue
                # Houseinfo insert into mysql
                data_source.append(info_dict)
                hisprice_data_source.append({
                    "houseID": info_dict["houseID"],
                    "totalPrice": info_dict["totalPrice"]
                })
                house_ids.add(info_dict["houseID"])
        with model.database.atomic():
            try:
                # Row-by-row upsert so a conflicting houseID updates in place.
                for data in data_source:
                    model.Houseinfo.insert(data).on_conflict(
                        conflict_target=(model.Houseinfo.houseID, ),
                        update=data,
                        #preserve=(model.Houseinfo.houseID, ),
                    ).execute()
                model.Hisprice.insert_many(hisprice_data_source).execute()
            except Exception as e:
                print("error: %s" % e)
        log_progress("GetHouseByRegionlist inserted", district, page + 1,
                     total_pages)
        time.sleep(0.5)
def get_rent_percommunity(communityname):
    """Scrape rental (zufang) listings for one community into Rentinfo.

    Parses each ``house-lst`` <li> and upserts it row-by-row into
    ``model.Rentinfo``.

    :param communityname: community name used as the ``rs`` search term.
    :raises RuntimeError: when the page count cannot be parsed.
    """
    url = BASE_URL + u"zufang/rs" + urllib.parse.quote(
        communityname.encode('utf8')) + "/"
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')

    # Bail out when the anti-crawler block page was served.
    if check_block(soup):
        return
    total_pages = misc.get_total_pages(url)
    if total_pages is None:
        row = model.Rentinfo.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)

    for page in range(total_pages):
        if page > 0:
            # NOTE(review): the paginated path says u"rent/..." while page
            # one uses u"zufang/..." -- verify against the live site.
            url_page = BASE_URL + u"rent/pg%drs%s/" % (
                page + 1, urllib.parse.quote(communityname.encode('utf8')))
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')
        i = 0
        for ultag in soup.findAll("ul", {"class": "house-lst"}):
            for name in ultag.find_all('li'):
                i = i + 1
                info_dict = {}
                try:
                    housetitle = name.find("div", {"class": "info-panel"})
                    info_dict.update({u'title': housetitle.get_text().strip()})
                    info_dict.update({u'link': housetitle.a.get('href')})
                    houseID = int(
                        housetitle.a.get('href').split("/")[-1].split(".")[0])
                    info_dict.update({u'houseID': houseID})
                    region = name.find("span", {"class": "region"})
                    info_dict.update({u'region': region.get_text().strip()})
                    zone = name.find("span", {"class": "zone"})
                    info_dict.update({u'zone': zone.get_text().strip()})
                    meters = name.find("span", {"class": "meters"})
                    info_dict.update({u'meters': meters.get_text().strip()})
                    other = name.find("div", {"class": "con"})
                    info_dict.update({u'other': other.get_text().strip()})
                    # Fix: subway/decoration/heating badges are optional.
                    # Previously a missing badge raised AttributeError and
                    # the bare except silently dropped the whole listing;
                    # the sibling get_rent_percommunity(city, ...) already
                    # None-checks these -- made this version consistent.
                    subway = name.find("span", {"class": "fang-subway-ex"})
                    if subway is None:
                        info_dict.update({u'subway': ""})
                    else:
                        info_dict.update(
                            {u'subway': subway.span.get_text().strip()})
                    decoration = name.find("span", {"class": "decoration-ex"})
                    if decoration is None:
                        info_dict.update({u'decoration': ""})
                    else:
                        info_dict.update(
                            {u'decoration': decoration.span.get_text().strip()})
                    heating = name.find("span", {"class": "heating-ex"})
                    if heating is None:
                        info_dict.update({u'heating': ""})
                    else:
                        info_dict.update(
                            {u'heating': heating.span.get_text().strip()})
                    price = name.find("div", {"class": "price"})
                    info_dict.update(
                        {u'price': int(price.span.get_text().strip())})
                    pricepre = name.find("div", {"class": "price-pre"})
                    info_dict.update(
                        {u'pricepre': pricepre.get_text().strip()})
                except:
                    continue

                # Rentinfo insert into mysql
                model.Rentinfo.insert(**info_dict).upsert().execute()
        time.sleep(1)
def get_community_perregion(regionname):
    """Scrape community (xiaoqu) cards for one region, with detail pages.

    For each card, also fetches the community detail page via
    ``get_communityinfo_by_url`` and merges those fields in, then
    bulk-upserts one page at a time into ``model.Community``.

    :param regionname: region slug appended to ``/xiaoqu/``.
    :raises RuntimeError: when the page count cannot be parsed.
    """
    url = BASE_URL + u"xiaoqu/" + regionname + "/"
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')
    # Bail out when the anti-crawler block page was served.
    if check_block(soup):
        return
    total_pages = misc.get_total_pages(url)
    if total_pages is None:
        row = model.Community.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)
    for page in range(total_pages):
        if page > 0:
            url_page = BASE_URL + u"xiaoqu/" + regionname + "/pg%d/" % page
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')
        nameList = soup.findAll("li", {"class": "clear"})
        i = 0
        log_progress("GetCommunityByRegionlist", regionname, page + 1,
                     total_pages)
        data_source = []
        for name in nameList:  # Per community card
            i = i + 1
            info_dict = {}
            try:
                communitytitle = name.find("div", {"class": "title"})
                title = communitytitle.get_text().strip('\n')
                link = communitytitle.a.get('href')
                info_dict.update({u'title': title})
                info_dict.update({u'link': link})
                district = name.find("a", {"class": "district"})
                info_dict.update({u'district': district.get_text()})
                bizcircle = name.find("a", {"class": "bizcircle"})
                info_dict.update({u'bizcircle': bizcircle.get_text()})
                tagList = name.find("div", {"class": "tagList"})
                info_dict.update({u'tagList': tagList.get_text().strip('\n')})
                onsale = name.find("a", {"class": "totalSellCount"})
                info_dict.update(
                    {u'onsale': onsale.span.get_text().strip('\n')})
                onrent = name.find("a", {"title": title + u"租房"})
                info_dict.update(
                    {u'onrent': onrent.get_text().strip('\n').split(u'套')[0]})
                # Bug fix: the card's community id lives in "data-housecode"
                # (as read elsewhere in this file); the previous
                # "data_analysis-housecode" attribute does not exist, so
                # every row got id=None.
                info_dict.update({u'id': name.get('data-housecode')})
                price = name.find("div", {"class": "totalPrice"})
                info_dict.update(
                    {u'price': price.span.get_text().strip('\n')})
                # Merge in the per-community detail page fields.
                communityinfo = get_communityinfo_by_url(link)
                for key, value in communityinfo.iteritems():
                    info_dict.update({key: value})
            except:
                continue
            # communityinfo insert into mysql
            data_source.append(info_dict)
            # model.Community.insert(**info_dict).upsert().execute()
        with model.database.atomic():
            model.Community.insert_many(data_source).upsert().execute()
        time.sleep(1)
def get_rent_percommunity(city, communityname):
    """Scrape rental (zufang) listings for one community of *city*.

    Parses each ``house-lst`` <li> into a dict and bulk-upserts one page
    at a time into ``model.Rentinfo``.

    :param city: lianjia city subdomain, e.g. ``"bj"``.
    :param communityname: community name used as the ``rs`` search term.
    :raises RuntimeError: when the page count cannot be parsed.
    """
    baseUrl = u"http://%s.lianjia.com/" % (city)
    url = baseUrl + u"zufang/rs" + \
        urllib.parse.quote(communityname.encode('utf8')) + "/"
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')

    # Bail out when the anti-crawler block page was served.
    if check_block(soup):
        return
    total_pages = misc.get_total_pages(url)
    if total_pages is None:
        row = model.Rentinfo.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)

    for page in range(total_pages):
        if page > 0:
            # NOTE(review): the paginated path says u"rent/..." while page
            # one uses u"zufang/..." -- verify against the live site.
            url_page = baseUrl + \
                u"rent/pg%drs%s/" % (page,
                                     urllib.parse.quote(communityname.encode('utf8')))
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')
        i = 0
        log_progress("GetRentByCommunitylist", communityname, page + 1,
                     total_pages)
        data_source = []
        for ultag in soup.findAll("ul", {"class": "house-lst"}):
            for name in ultag.find_all('li'):
                i = i + 1
                info_dict = {}
                try:
                    housetitle = name.find("div", {"class": "info-panel"})
                    info_dict.update({u'title': housetitle.get_text().strip()})
                    info_dict.update({u'link': housetitle.a.get('href')})
                    houseID = housetitle.a.get('href').split("/")[-1].split(
                        ".")[0]
                    info_dict.update({u'houseID': houseID})
                    region = name.find("span", {"class": "region"})
                    info_dict.update({u'region': region.get_text().strip()})
                    zone = name.find("span", {"class": "zone"})
                    info_dict.update({u'zone': zone.get_text().strip()})
                    meters = name.find("span", {"class": "meters"})
                    info_dict.update({u'meters': meters.get_text().strip()})
                    other = name.find("div", {"class": "con"})
                    info_dict.update({u'other': other.get_text().strip()})
                    # Optional badges: store "" when absent.
                    subway = name.find("span", {"class": "fang-subway-ex"})
                    if subway is None:
                        info_dict.update({u'subway': ""})
                    else:
                        info_dict.update(
                            {u'subway': subway.span.get_text().strip()})
                    decoration = name.find("span", {"class": "decoration-ex"})
                    if decoration is None:
                        info_dict.update({u'decoration': ""})
                    else:
                        info_dict.update({
                            u'decoration':
                            decoration.span.get_text().strip()
                        })
                    heating = name.find("span", {"class": "heating-ex"})
                    # Bug fix: unlike subway/decoration above, heating had no
                    # None-check, so a listing without the heating badge
                    # raised AttributeError and was silently dropped by the
                    # bare except.
                    if heating is None:
                        info_dict.update({u'heating': ""})
                    else:
                        info_dict.update(
                            {u'heating': heating.span.get_text().strip()})
                    price = name.find("div", {"class": "price"})
                    info_dict.update(
                        {u'price': int(price.span.get_text().strip())})
                    pricepre = name.find("div", {"class": "price-pre"})
                    info_dict.update(
                        {u'pricepre': pricepre.get_text().strip()})
                except:
                    continue

                # Rentinfo insert into mysql
                data_source.append(info_dict)
                # model.Rentinfo.insert(**info_dict).upsert().execute()

        with model.database.atomic():
            if data_source:
                model.Rentinfo.insert_many(data_source).upsert().execute()
        time.sleep(1)
def get_rent_percommunity(city, communityname): baseUrl = u"http://%s.lianjia.com/" % (city) url = baseUrl + u"zufang/rs" + \ urllib2.quote(communityname.encode('utf8')) + "/" source_code = misc.get_source_code(url) soup = BeautifulSoup(source_code, 'lxml') if check_block(soup): return total_pages = misc.get_total_pages(url) if total_pages == None: row = model.Rentinfo.select().count() print url raise RuntimeError("Finish at %s because total_pages is None" % row) for page in range(total_pages): if page > 0: url_page = baseUrl + \ u"rent/pg%drs%s/" % (page, urllib2.quote(communityname.encode('utf8'))) source_code = misc.get_source_code(url_page) soup = BeautifulSoup(source_code, 'lxml') i = 0 log_progress("GetRentByCommunitylist", communityname, page + 1, total_pages) data_source = [] for ultag in soup.findAll("ul", {"class": "house-lst"}): for name in ultag.find_all('li'): i = i + 1 info_dict = {} try: housetitle = name.find("div", {"class": "info-panel"}) info_dict.update({u'title': housetitle.get_text().strip()}) info_dict.update({u'link': housetitle.a.get('href')}) houseID = housetitle.a.get( 'href').split("/")[-1].split(".")[0] info_dict.update({u'houseID': houseID}) region = name.find("span", {"class": "region"}) info_dict.update({u'region': region.get_text().strip()}) zone = name.find("span", {"class": "zone"}) info_dict.update({u'zone': zone.get_text().strip()}) meters = name.find("span", {"class": "meters"}) info_dict.update({u'meters': meters.get_text().strip()}) other = name.find("div", {"class": "con"}) info_dict.update({u'other': other.get_text().strip()}) subway = name.find("span", {"class": "fang-subway-ex"}) if subway is None: info_dict.update({u'subway': ""}) else: info_dict.update( {u'subway': subway.span.get_text().strip()}) decoration = name.find("span", {"class": "decoration-ex"}) if decoration is None: info_dict.update({u'decoration': ""}) else: info_dict.update( {u'decoration': decoration.span.get_text().strip()}) heating = name.find("span", 
{"class": "heating-ex"}) info_dict.update( {u'heating': heating.span.get_text().strip()}) price = name.find("div", {"class": "price"}) info_dict.update( {u'price': int(price.span.get_text().strip())}) pricepre = name.find("div", {"class": "price-pre"}) info_dict.update( {u'pricepre': pricepre.get_text().strip()}) info_dict.update({u'updatedate': datetime.datetime.now()}) model.Rentinfo.insert(info_dict).on_conflict(conflict_target=[model.Community.id], \ preserve=[model.Rentinfo.title, model.Rentinfo.link, \ model.Rentinfo.region, model.Rentinfo.zone, model.Rentinfo.meters, \ model.Rentinfo.other, model.Rentinfo.subway, model.Rentinfo.decoration, \ model.Rentinfo.heating, model.Rentinfo.price, model.Rentinfo.pricepre, \ model.Rentinfo.updatedate],update={}).execute() except Exception as e: print "except~~!!" logging.error(e) print info_dict # data_source.append(info_dict) continue time.sleep(1)
def get_sell_percommunity(communityname):
    """Scrape completed-sale (chengjiao) records for one community.

    Parses each ``listContent`` <li> into a dict and bulk-upserts one page
    at a time into ``model.Sellinfo``.

    :param communityname: community name used as the ``rs`` search term.
    :raises RuntimeError: when the page count cannot be parsed.
    """
    url = BASE_URL + u"chengjiao/rs" + urllib2.quote(
        communityname.encode('utf8')) + "/"
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')
    # Bail out when the anti-crawler block page was served.
    if check_block(soup):
        return
    total_pages = misc.get_total_pages(url)
    if total_pages == None:
        row = model.Sellinfo.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)
    for page in range(total_pages):
        if page > 0:
            url_page = BASE_URL + u"chengjiao/pg%drs%s/" % (
                page, urllib2.quote(communityname.encode('utf8')))
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')
        i = 0
        log_progress("GetSellByCommunitylist", communityname, page + 1,
                     total_pages)
        data_source = []
        for ultag in soup.findAll("ul", {"class": "listContent"}):
            for name in ultag.find_all('li'):
                i = i + 1
                info_dict = {}
                try:
                    housetitle = name.find("div", {"class": "title"})
                    info_dict.update({u'title': housetitle.get_text().strip()})
                    info_dict.update({u'link': housetitle.a.get('href')})
                    houseID = housetitle.a.get('href').split("/")[-1].split(
                        ".")[0]
                    info_dict.update({u'houseID': houseID.strip()})
                    # Title text is "community housetype square" separated by
                    # spaces.
                    house = housetitle.get_text().strip().split(' ')
                    info_dict.update({u'community': house[0].strip()})
                    info_dict.update({u'housetype': house[1].strip()})
                    info_dict.update({u'square': house[2].strip()})
                    houseinfo = name.find("div", {"class": "houseInfo"})
                    info = houseinfo.get_text().split('|')
                    info_dict.update({u'direction': info[0].strip()})
                    info_dict.update({u'status': info[1].strip()})
                    housefloor = name.find("div", {"class": "positionInfo"})
                    floor_all = housefloor.get_text().strip().split(' ')
                    info_dict.update({u'floor': floor_all[0].strip()})
                    info_dict.update({u'years': floor_all[-1].strip()})
                    followInfo = name.find("div", {"class": "source"})
                    info_dict.update(
                        {u'source': followInfo.get_text().strip()})
                    # Prices may or may not be wrapped in a <span>; handle
                    # both layouts.
                    totalPrice = name.find("div", {"class": "totalPrice"})
                    if totalPrice.span is None:
                        info_dict.update(
                            {u'totalPrice': totalPrice.get_text().strip()})
                    else:
                        info_dict.update({
                            u'totalPrice':
                            totalPrice.span.get_text().strip()
                        })
                    unitPrice = name.find("div", {"class": "unitPrice"})
                    if unitPrice.span is None:
                        info_dict.update(
                            {u'unitPrice': unitPrice.get_text().strip()})
                    else:
                        info_dict.update(
                            {u'unitPrice': unitPrice.span.get_text().strip()})
                    # Deal date "YYYY.MM.DD" is normalized to "YYYY-MM-DD".
                    dealDate = name.find("div", {"class": "dealDate"})
                    info_dict.update({
                        u'dealdate':
                        dealDate.get_text().strip().replace('.', '-')
                    })
                except:
                    continue
                # Sellinfo insert into mysql
                data_source.append(info_dict)
                # model.Sellinfo.insert(**info_dict).upsert().execute()
        with model.database.atomic():
            model.Sellinfo.insert_many(data_source).upsert().execute()
        time.sleep(1)
def get_house_percommunity(city, communityname):
    """Scrape ershoufang listings (normal housing only) for one community.

    Filters with the ``sf1`` search flag, parses each card and per page
    replaces rows in ``model.Houseinfo`` / ``model.Hisprice``.

    :param city: lianjia city subdomain, e.g. ``"bj"``.
    :param communityname: community name used as the ``rs`` search term.
    :raises RuntimeError: when the page count cannot be parsed.
    """
    baseUrl = u"http://%s.lianjia.com/" % (city)
    # "sf1" search filter -- presumably restricts results to normal
    # (non-auction) housing; confirm against lianjia URL params.
    normal_housing = "sf1"
    url = baseUrl + u"ershoufang/%srs%s/" % (
        normal_housing, urllib.parse.quote(communityname.encode('utf8')))
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')
    # Bail out when the anti-crawler block page was served.
    if check_block(soup):
        return
    total_pages = misc.get_total_pages(url)
    if total_pages == None:
        row = model.Houseinfo.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)
    for page in range(total_pages):
        if page > 0:
            url_page = baseUrl + \
                u"ershoufang/pg%d%srs%s/" % (page, normal_housing,
                                             urllib.parse.quote(communityname.encode('utf8')))
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')
        nameList = soup.findAll("li", {"class": "clear"})
        i = 0
        log_progress("GetHouseByCommunitylist", communityname, page + 1,
                     total_pages)
        data_source = []
        hisprice_data_source = []
        for name in nameList:  # per house loop
            i = i + 1
            info_dict = {}
            try:
                housetitle = name.find("div", {"class": "title"})
                info_dict.update({u'title': housetitle.a.get_text().strip()})
                info_dict.update({u'link': housetitle.a.get('href')})
                houseaddr = name.find("div", {"class": "address"})
                # '|'-separated positional fields: type|square|direction|
                # decoration|floor|years.
                info = houseaddr.div.get_text().split('|')
                info_dict.update({u'community': communityname})
                info_dict.update({u'housetype': info[0].strip()})
                # Drop the trailing unit (e.g. "平米") from the square field.
                info_dict.update({u'square': info[1].strip()[:-2]})
                info_dict.update({u'direction': info[2].strip()})
                info_dict.update({u'decoration': info[3].strip()})
                info_dict.update({u'floor': info[4].strip()})
                # Keep only the digits before the "年" (year) character.
                info_dict.update(
                    {u'years': info[5].strip()[:info[5].strip().index("年")]})
                followInfo = name.find("div", {"class": "followInfo"})
                info_dict.update({u'followInfo': followInfo.get_text()})
                tag = name.find("div", {"class": "tag"})
                info_dict.update({u'tagtype': tag.get_text().strip()})
                totalPrice = name.find("div", {"class": "totalPrice"})
                info_dict.update({u'totalPrice': totalPrice.span.get_text()})
                unitPrice = name.find("div", {"class": "unitPrice"})
                info_dict.update({u'unitPrice': unitPrice.get('data-price')})
                info_dict.update({u'houseID': unitPrice.get('data-hid')})
            except:
                continue
            # houseinfo insert into mysql
            data_source.append(info_dict)
            hisprice_data_source.append({
                "houseID": info_dict["houseID"],
                "totalPrice": info_dict["totalPrice"]
            })
        with model.database.atomic():
            if data_source:
                model.Houseinfo.replace_many(data_source).execute()
            if hisprice_data_source:
                model.Hisprice.replace_many(hisprice_data_source).execute()
        time.sleep(1)
def get_house_perregion(district):
    """Crawl ershoufang (second-hand house) listings for one district.

    Walks every result page of BASE_URL/ershoufang/<district>/, parses the
    listing cards and bulk-upserts them into model.Houseinfo, plus a
    (houseID, totalPrice) snapshot into model.Hisprice.

    :param district: district slug, e.g. u'xicheng'.
    :raises RuntimeError: when the total page count cannot be parsed.
    """
    url = BASE_URL + u"ershoufang/%s/" % district
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')
    if check_block(soup):
        return
    total_pages = misc.get_total_pages(url)
    if total_pages is None:  # identity test — was `== None`
        row = model.Houseinfo.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)
    for page in range(total_pages):
        if page > 0:
            url_page = BASE_URL + u"ershoufang/%s/pg%d/" % (district, page)
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')
        i = 0
        log_progress("GetHouseByRegionlist", district, page + 1, total_pages)
        data_source = []
        hisprice_data_source = []
        for ultag in soup.findAll("ul", {"class": "sellListContent"}):
            for name in ultag.find_all('li'):
                i = i + 1
                info_dict = {}
                try:
                    housetitle = name.find("div", {"class": "title"})
                    info_dict.update(
                        {u'title': housetitle.a.get_text().strip()})
                    info_dict.update({u'link': housetitle.a.get('href')})
                    # HTML5 custom data attribute; was the garbled
                    # 'data_analysis-housecode' while sibling crawlers read
                    # 'data-housecode' from the same card markup.
                    houseID = housetitle.a.get('data-housecode')
                    info_dict.update({u'houseID': houseID})
                    houseinfo = name.find("div", {"class": "houseInfo"})
                    # Beijing pages separate fields with '/', others with '|'.
                    if CITY == 'bj':
                        info = houseinfo.get_text().split('/')
                    else:
                        info = houseinfo.get_text().split('|')
                    info_dict.update({u'community': info[0]})
                    info_dict.update({u'housetype': info[1]})
                    info_dict.update({u'square': info[2]})
                    info_dict.update({u'direction': info[3]})
                    info_dict.update({u'decoration': info[4]})
                    housefloor = name.find("div", {"class": "positionInfo"})
                    # NOTE(review): 'years' and 'floor' both store the whole
                    # positionInfo text — preserved as-is; confirm intent.
                    info_dict.update(
                        {u'years': housefloor.get_text().strip()})
                    info_dict.update(
                        {u'floor': housefloor.get_text().strip()})
                    followInfo = name.find("div", {"class": "followInfo"})
                    info_dict.update(
                        {u'followInfo': followInfo.get_text().strip()})
                    taxfree = name.find("span", {"class": "taxfree"})
                    if taxfree is None:
                        info_dict.update({u"taxtype": ""})
                    else:
                        info_dict.update(
                            {u"taxtype": taxfree.get_text().strip()})
                    totalPrice = name.find("div", {"class": "totalPrice"})
                    info_dict.update(
                        {u'totalPrice': totalPrice.span.get_text()})
                    unitPrice = name.find("div", {"class": "unitPrice"})
                    # Was the garbled 'data_analysis-price'; siblings read
                    # 'data-price' from the same unitPrice div.
                    info_dict.update(
                        {u'unitPrice': unitPrice.get("data-price")})
                except Exception:
                    # Narrowed from a bare `except:`; skip malformed cards.
                    continue
                data_source.append(info_dict)
                hisprice_data_source.append({
                    "houseID": info_dict["houseID"],
                    "totalPrice": info_dict["totalPrice"],
                })
        with model.database.atomic():
            # Guard empty batches, consistent with the sibling crawlers.
            if data_source:
                model.Houseinfo.insert_many(data_source).upsert().execute()
            if hisprice_data_source:
                model.Hisprice.insert_many(
                    hisprice_data_source).upsert().execute()
        time.sleep(1)  # be polite to the server between pages
def get_sell_percommunity(city, communityname):
    """Crawl chengjiao (closed-deal) records for one community.

    Fetches every result page for *communityname* on the given city's
    lianjia site, parses each deal card and bulk-writes the rows into
    model.Sellinfo. Any failure is logged; the function never raises to
    the caller (top-level guard).

    :param city: lianjia city sub-domain, e.g. 'bj'.
    :param communityname: community (xiaoqu) name used as the search key.
    """
    try:
        baseUrl = u"http://%s.lianjia.com/" % (city)
        url = baseUrl + u"chengjiao/rs" + \
            urllib2.quote(communityname.encode('utf8')) + "/"
        source_code = misc.get_source_code(url)
        soup = BeautifulSoup(source_code, 'lxml')
        if check_block(soup):
            return
        total_pages = misc.get_total_pages(url)
        if total_pages is None:
            row = model.Sellinfo.select().count()
            raise RuntimeError(
                "Finish at %s because total_pages is None" % row)
        for page in range(total_pages):
            if page > 0:
                url_page = baseUrl + \
                    u"chengjiao/pg%drs%s/" % (
                        page, urllib2.quote(communityname.encode('utf8')))
                source_code = misc.get_source_code(url_page)
                soup = BeautifulSoup(source_code, 'lxml')
            logging.info("Progress: %s %s: current page %s total pages %s",
                         "GetSellByCommunitylist", communityname, page + 1,
                         total_pages)
            data_source = []
            for ultag in soup.findAll("ul", {"class": "listContent"}):
                for name in ultag.find_all('li'):
                    info_dict = {}
                    try:
                        try:
                            # Listing price and deal cycle; the block is
                            # missing on some cards, hence the inner guard.
                            dealinfo = name.find(
                                'div', {'class': 'dealCycleeInfo'}).find(
                                    'span', {'class': 'dealCycleTxt'})
                            info_dict.update(
                                {u'dealinfo': dealinfo.get_text().strip()})
                        except Exception:
                            info_dict.update({u'dealinfo': ''})
                        housetitle = name.find("div", {"class": "title"})
                        info_dict.update(
                            {u'title': housetitle.get_text().strip()})
                        info_dict.update({u'link': housetitle.a.get('href')})
                        # houseID is the numeric stem of the detail-page URL.
                        houseID = housetitle.a.get(
                            'href').split("/")[-1].split(".")[0]
                        info_dict.update({u'houseID': houseID.strip()})
                        house = housetitle.get_text().strip().split(' ')
                        info_dict.update({u'community': communityname})
                        info_dict.update(
                            {u'housetype':
                             house[1].strip() if len(house) > 1 else ''})
                        info_dict.update(
                            {u'square':
                             house[2].strip() if len(house) > 2 else ''})
                        houseinfo = name.find("div", {"class": "houseInfo"})
                        info = houseinfo.get_text().split('|')
                        info_dict.update({u'direction': info[0].strip()})
                        info_dict.update(
                            {u'status':
                             info[1].strip() if len(info) > 1 else ''})
                        housefloor = name.find(
                            "div", {"class": "positionInfo"})
                        floor_all = housefloor.get_text().strip().split(' ')
                        info_dict.update({u'floor': floor_all[0].strip()})
                        info_dict.update({u'years': floor_all[-1].strip()})
                        followInfo = name.find("div", {"class": "source"})
                        info_dict.update(
                            {u'source': followInfo.get_text().strip()})
                        totalPrice = name.find(
                            "div", {"class": "totalPrice"})
                        if totalPrice.span is None:
                            info_dict.update(
                                {u'totalPrice':
                                 totalPrice.get_text().strip()})
                        else:
                            info_dict.update(
                                {u'totalPrice':
                                 totalPrice.span.get_text().strip()})
                        unitPrice = name.find("div", {"class": "unitPrice"})
                        if unitPrice.span is None:
                            info_dict.update(
                                {u'unitPrice': unitPrice.get_text().strip()})
                        else:
                            info_dict.update(
                                {u'unitPrice':
                                 unitPrice.span.get_text().strip()})
                        dealDate = name.find("div", {"class": "dealDate"})
                        info_dict.update(
                            {u'dealdate':
                             dealDate.get_text().strip().replace('.', '-')})
                    except Exception:
                        # Was `print(e, traceback.print_exc())` — print_exc()
                        # returns None, and the partially-filled dict was
                        # still appended, which breaks insert_many with
                        # heterogeneous rows. Log and skip the card instead.
                        logging.exception(
                            "get_sell_percommunity: failed to parse a card")
                        continue
                    data_source.append(info_dict)
            with model.database.atomic():
                if data_source:
                    logging.info("get_sell_percommunity: insert %d sell info "
                                 "data source info to database",
                                 len(data_source))
                    model.Sellinfo.insert_many(
                        data_source).upsert().execute()
    except Exception:
        # Top-level guard keeps one bad community from killing the run.
        logging.exception(
            "get_sell_percommunity failed for %s", communityname)
def get_house_percommunity(communityname):
    """Crawl ershoufang (second-hand house) listings for one community on
    BASE_URL and bulk-upsert them into model.Houseinfo, plus a
    (houseID, totalPrice) snapshot into model.Hisprice.

    NOTE(review): shadows the earlier two-argument
    get_house_percommunity(city, communityname) defined above.

    :param communityname: community (xiaoqu) name used as the search key.
    :raises RuntimeError: when the total page count cannot be parsed.
    """
    url = BASE_URL + u"ershoufang/rs" + urllib2.quote(
        communityname.encode('utf8')) + "/"
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')
    if check_block(soup):
        return
    total_pages = misc.get_total_pages(url)
    if total_pages is None:  # identity test — was `== None`
        row = model.Houseinfo.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)
    for page in range(total_pages):
        if page > 0:
            url_page = BASE_URL + u"ershoufang/pg%drs%s/" % (
                page, urllib2.quote(communityname.encode('utf8')))
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')
        nameList = soup.findAll("li", {"class": "clear"})
        i = 0
        log_progress("GetHouseByCommunitylist", communityname, page + 1,
                     total_pages)
        data_source = []
        hisprice_data_source = []
        for name in nameList:  # per-house loop
            i = i + 1
            info_dict = {}
            try:
                housetitle = name.find("div", {"class": "title"})
                info_dict.update({u'title': housetitle.a.get_text().strip()})
                info_dict.update({u'link': housetitle.a.get('href')})
                houseaddr = name.find("div", {"class": "address"})
                # Beijing pages separate fields with '/', others with '|'.
                if CITY == 'bj':
                    info = houseaddr.div.get_text().split('/')
                else:
                    info = houseaddr.div.get_text().split('|')
                info_dict.update({u'community': info[0].strip()})
                info_dict.update({u'housetype': info[1].strip()})
                info_dict.update({u'square': info[2].strip()})
                info_dict.update({u'direction': info[3].strip()})
                info_dict.update({u'decoration': info[4].strip()})
                housefloor = name.find("div", {"class": "flood"})
                floor_all = housefloor.div.get_text().split(
                    '-')[0].strip().split(' ')
                info_dict.update({u'floor': floor_all[0].strip()})
                info_dict.update({u'years': floor_all[-1].strip()})
                followInfo = name.find("div", {"class": "followInfo"})
                info_dict.update({u'followInfo': followInfo.get_text()})
                tax = name.find("div", {"class": "tag"})
                info_dict.update({u'taxtype': tax.get_text().strip()})
                totalPrice = name.find("div", {"class": "totalPrice"})
                info_dict.update({u'totalPrice': totalPrice.span.get_text()})
                unitPrice = name.find("div", {"class": "unitPrice"})
                # Was the garbled 'data_analysis-price'/'data_analysis-hid';
                # the sibling crawler reads 'data-price'/'data-hid' from the
                # same unitPrice div.
                info_dict.update({u'unitPrice': unitPrice.get('data-price')})
                info_dict.update({u'houseID': unitPrice.get('data-hid')})
            except Exception:
                # Narrowed from a bare `except:`; skip malformed cards.
                continue
            data_source.append(info_dict)
            hisprice_data_source.append({
                "houseID": info_dict["houseID"],
                "totalPrice": info_dict["totalPrice"],
            })
        with model.database.atomic():
            # Guard empty batches, consistent with the sibling crawlers.
            if data_source:
                model.Houseinfo.insert_many(data_source).upsert().execute()
            if hisprice_data_source:
                model.Hisprice.insert_many(
                    hisprice_data_source).upsert().execute()
        time.sleep(1)  # be polite to the server between pages
def get_sell_percommunity(city, communityname):
    """Crawl chengjiao (closed-deal) records for one community and upsert
    each row into model.Sellinfo keyed on houseID.

    NOTE(review): shadows the earlier get_sell_percommunity variant
    defined above.

    :param city: lianjia city sub-domain, e.g. 'bj'.
    :param communityname: community (xiaoqu) name used as the search key.
    :raises RuntimeError: when the total page count cannot be parsed.
    """
    baseUrl = u"http://%s.lianjia.com/" % (city)
    url = baseUrl + u"chengjiao/rs" + \
        urllib2.quote(communityname.encode('utf8')) + "/"
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')
    if check_block(soup):
        return
    total_pages = misc.get_total_pages(url)
    if total_pages is None:  # identity test — was `== None`
        row = model.Sellinfo.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)
    # Fetch starts from page 2: page 1 was already loaded during the
    # initialization above, so its URL needs no pgN segment.
    for page in range(1, total_pages + 1):
        if page > 1:
            url_page = baseUrl + \
                u"chengjiao/pg%drs%s/" % (
                    page, urllib2.quote(communityname.encode('utf8')))
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')
        log_progress("GetSellByCommunitylist", communityname, page,
                     total_pages)
        for ultag in soup.findAll("ul", {"class": "listContent"}):
            for name in ultag.find_all('li'):
                info_dict = {}
                try:
                    housetitle = name.find("div", {"class": "title"})
                    info_dict.update(
                        {u'title': housetitle.get_text().strip()})
                    info_dict.update({u'link': housetitle.a.get('href')})
                    # houseID is the numeric stem of the detail-page URL.
                    houseID = housetitle.a.get(
                        'href').split("/")[-1].split(".")[0]
                    info_dict.update({u'houseID': houseID.strip()})
                    house = housetitle.get_text().strip().split(' ')
                    info_dict.update({u'community': communityname})
                    info_dict.update(
                        {u'housetype':
                         house[1].strip() if 1 < len(house) else ''})
                    info_dict.update(
                        {u'square':
                         house[2].strip() if 2 < len(house) else ''})
                    houseinfo = name.find("div", {"class": "houseInfo"})
                    info = houseinfo.get_text().split('|')
                    info_dict.update({u'direction': info[0].strip()})
                    info_dict.update(
                        {u'status':
                         info[1].strip() if 1 < len(info) else ''})
                    housefloor = name.find("div", {"class": "positionInfo"})
                    floor_all = housefloor.get_text().strip().split(' ')
                    info_dict.update({u'floor': floor_all[0].strip()})
                    info_dict.update({u'years': floor_all[-1].strip()})
                    followInfo = name.find("div", {"class": "source"})
                    info_dict.update(
                        {u'source': followInfo.get_text().strip()})
                    totalPrice = name.find("div", {"class": "totalPrice"})
                    if totalPrice.span is None:
                        info_dict.update(
                            {u'totalPrice': totalPrice.get_text().strip()})
                    else:
                        info_dict.update(
                            {u'totalPrice':
                             totalPrice.span.get_text().strip()})
                    unitPrice = name.find("div", {"class": "unitPrice"})
                    if unitPrice.span is None:
                        info_dict.update(
                            {u'unitPrice': unitPrice.get_text().strip()})
                    else:
                        info_dict.update(
                            {u'unitPrice':
                             unitPrice.span.get_text().strip()})
                    dealDate = name.find("div", {"class": "dealDate"})
                    info_dict.update(
                        {u'dealdate':
                         dealDate.get_text().strip().replace('.', '-')})
                    info_dict.update({u'updatedate': datetime.datetime.now()})
                    # Upsert: on houseID conflict, refresh the listed columns
                    # with the freshly scraped values.
                    model.Sellinfo.insert(info_dict).on_conflict(
                        conflict_target=[model.Sellinfo.houseID],
                        preserve=[
                            model.Sellinfo.title, model.Sellinfo.link,
                            model.Sellinfo.community, model.Sellinfo.years,
                            model.Sellinfo.housetype, model.Sellinfo.square,
                            model.Sellinfo.direction, model.Sellinfo.floor,
                            model.Sellinfo.status, model.Sellinfo.source,
                            model.Sellinfo.totalPrice,
                            model.Sellinfo.unitPrice,
                            model.Sellinfo.dealdate,
                            model.Sellinfo.updatedate,
                        ],
                        update={}).execute()
                except Exception as e:
                    # Was the Python-2 statement `print info_dict`, a
                    # SyntaxError on Python 3 (the rest of the file uses
                    # print()). Keep the diagnostic, fix the syntax.
                    logging.error(e)
                    print(info_dict)
                    continue
        time.sleep(1)  # be polite to the server between pages