def get_totalpage(url):
    """Return the number of result pages advertised at ``url``.

    The page is fetched with ``misc.get_source_code`` and parsed with
    BeautifulSoup. The count is read from the text of the second ``<a>``
    inside the ``div`` whose class attribute is ``pageSty rf``.

    :param url: address of the listing page to inspect.
    :returns: the page count as an ``int``; ``1`` when the pager ``div`` is
        missing, holds fewer than two links, or the link text is not an
        integer.
    """
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')
    pager = soup.find("div", {"class": "pageSty rf"})
    if pager is None:
        return 1
    links = pager.find_all("a")
    try:
        return int(links[1].get_text().strip())
    except (IndexError, ValueError):
        # Fewer than two links, or a non-numeric label: treat as one page.
        return 1
def _parse_community_panel(name):
    """Build the Community row dict for one ``info-panel`` ``div``.

    :param name: the BeautifulSoup tag of a single community panel.
    :returns: dict with ``title``, ``link``, ``id``, ``district``,
        ``bizcircle``, ``tagList``, ``onsale`` and ``price``, merged with
        whatever ``get_communityinfo_by_url(link)`` returns.
    :raises Exception: any lookup on a missing element (for example
        ``AttributeError`` or ``IndexError``) or any error raised by
        ``get_communityinfo_by_url`` propagates to the caller.
    """
    info_dict = {}

    communitytitle = name.find("a", {"name": "selectDetail"})
    link = communitytitle.get('href')
    info_dict[u'title'] = communitytitle.get_text().strip('\n')
    info_dict[u'link'] = link
    info_dict[u'id'] = communitytitle.get('key')

    district = name.find("a", {"class": "ad"})
    info_dict[u'district'] = district.get_text()

    # The second link inside the "con" div is stored as the bizcircle.
    cons = name.find("div", {"class": "con"})
    info_dict[u'bizcircle'] = cons.findAll("a")[1].get_text().strip()

    # The subway tag is optional; store an empty string when it is absent.
    tag_span = name.find("span", {"class": "fang-subway-ex"})
    if tag_span is not None:
        info_dict[u'tagList'] = tag_span.get_text().strip()
    else:
        info_dict[u'tagList'] = ''

    onsale = name.find("span", {"class": "num"})
    info_dict[u'onsale'] = onsale.get_text().strip()

    price = name.find("div", {"class": "price"})
    info_dict[u'price'] = price.span.get_text().strip()

    # Fetched last so that a malformed panel never triggers the extra lookup.
    info_dict.update(get_communityinfo_by_url(link))
    return info_dict


def get_community_perregion(regionname=u'pudong'):
    """Scrape the community list of one region and upsert it page by page.

    The first page is fetched from ``BASE_URL + "xiaoqu/<regionname>/"``.
    When ``check_block`` reports true for it the function returns without
    doing anything. Otherwise every page is parsed into row dicts which are
    written with ``model.Community.insert_many(...).upsert()`` inside
    ``model.database.atomic()``, followed by a one-second sleep per page.

    :param regionname: region path segment used to build the URLs.
    :returns: ``None``.
    :raises RuntimeError: when ``misc.get_sh_total_pages`` returns ``None``;
        the message carries the current ``model.Community`` row count.
    """
    url = BASE_URL + u"xiaoqu/" + regionname + "/"
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')

    if check_block(soup):
        return

    total_pages = misc.get_sh_total_pages(url)
    if total_pages is None:
        row = model.Community.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)

    for page in range(total_pages):
        # The first iteration reuses the soup that was fetched above.
        if page > 0:
            # NOTE(review): the zero-based index is put straight into /d%d/,
            # so /d1/ is requested on the second iteration and
            # /d<total_pages>/ is never requested. Confirm against the site's
            # page numbering before changing it.
            url_page = BASE_URL + u"xiaoqu/" + regionname + "/d%d/" % page
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')

        panels = soup.findAll("div", {"class": "info-panel"})
        log_progress("GetCommunityByRegionlist",
                     regionname, page + 1, total_pages)

        data_source = []
        for panel in panels:
            try:
                data_source.append(_parse_community_panel(panel))
            except Exception as err:
                # Best effort: one bad panel must not abort the whole region.
                logging.warning("skip community on page %d of %s: %r",
                                page + 1, regionname, err)
                continue

        # NOTE(review): the write and the sleep are placed once per page
        # because data_source is reset once per page.
        if data_source:
            with model.database.atomic():
                model.Community.insert_many(data_source).upsert().execute()
        time.sleep(1)
def _parse_sold_listing(each):
    """Build the Sellinfo row dict for one ``<li>`` of a sold-listing page.

    :param each: the BeautifulSoup ``<li>`` tag of a single listing.
    :returns: dict with the fourteen keys written to ``model.Sellinfo``.
    :raises AttributeError, IndexError: when an expected element or text
        fragment is missing from the listing.
    """
    title = each.find("p", {"class": "sTit"}).strong.get_text().strip()
    paragraphs = each.find("div", {"class": "listCon"}).find_all("p")
    # The second paragraph is split once into housetype, square, direction.
    specs = paragraphs[1].get_text().strip().split(u'·')
    dealdate = paragraphs[2].get_text().strip().split(u':')[1]
    jiage = each.find("div", {"class": "jiage"})
    href = each.a.get("href")
    # years, floor and status are never read from the page; each gets the
    # same constant placeholder string.
    return {
        u'title': title,
        u'houseID': "5i5j%s" % find_between_r(href, '/', '.'),
        u'link': "https://bj.5i5j.com%s" % href,
        u'community': title.split(' ')[0],
        u'years': u"暂无信息",
        u'housetype': specs[0],
        u'square': specs[1],
        u'direction': specs[2],
        u'floor': u"暂无信息",
        u'status': u"暂无信息",
        u'source': u"我爱我家",
        u'totalPrice': jiage.strong.get_text().strip(),
        u'unitPrice': find_between_r(
            jiage.p.get_text().strip(), u'价', u'元'),
        u'dealdate': dealdate,
    }


def GetSellByCommunitylist():
    """Scrape sold listings for every community in ``community_id.txt``.

    Each non-blank line of the file is split on whitespace: the first field
    is the community code used in the URLs, the second is a label used only
    in log messages (the code is used when the label is missing). All pages
    reported by ``get_totalpage`` are fetched and the parsed rows are upserted
    into ``model.Sellinfo`` once per community, followed by a one-second
    sleep.

    :returns: ``None``.
    """
    with open('community_id.txt') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank line: nothing to scrape.
                continue
            code = fields[0]
            communityinfo = fields[1] if len(fields) > 1 else code

            data_source = []
            pages = get_totalpage("https://bj.5i5j.com/sold/%s" % code)
            for page in range(1, pages + 1):
                source_code = misc.get_source_code(
                    "https://bj.5i5j.com/sold/%s/n%d/" % (code, page))
                soup = BeautifulSoup(source_code, 'lxml')
                content = soup.find('ul', class_="pList zu")
                if content is None:
                    # No listing container on this page.
                    continue
                for each in content.find_all('li'):
                    try:
                        data_source.append(_parse_sold_listing(each))
                    except (AttributeError, IndexError, TypeError) as err:
                        # One malformed listing must not abort the whole run.
                        logging.warning(
                            "skip listing on page %d of %s: %r",
                            page, communityinfo, err)

            # NOTE(review): the write, the log line and the sleep are placed
            # once per community because data_source is reset once per line.
            if data_source:
                with model.database.atomic():
                    try:
                        model.Sellinfo.insert_many(
                            data_source).upsert().execute()
                    except Exception:
                        # Still best effort, but the failure is now visible.
                        logging.exception(
                            "failed to store sold listings for %s",
                            communityinfo)
            logging.info("%s finish" % communityinfo)
            time.sleep(1)