def get_house_perregion(link):
    url = BASE_URL + link
    source_code = crawlcore.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')
    total_pages = crawlcore.get_total_pages(soup)
    if total_pages is None:
        raise RuntimeError('url:【%s】total_pages not found' % url)
    for page in range(1, total_pages + 1):
        if page > 1:
            url_page = url + u"/pg%d/" % page
            source_code = crawlcore.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')
        logging.info("GetHouseByRegionlist district:%s page:%s totalpage:%s" %
                     (link, page, total_pages))
        data_source = []
        hisprice_data_source = []
        for ultag in soup.findAll("ul", {"class": "sellListContent"}):
            for name in ultag.find_all('li'):
                info_dict = {}
                try:
                    housetitle = name.find("div", {"class": "title"})
                    info_dict.update({'title': housetitle.get_text().strip()})
                    info_dict.update({'link': housetitle.a.get('href')})
                    houseID = housetitle.a.get('data-housecode')
                    if houseID is None:
                        # Some list pages expose the id via a different attribute.
                        houseID = housetitle.a.get(
                            'data-lj_action_housedel_id')
                    info_dict.update({'houseID': houseID})
                    houseinfo = name.find("div", {"class": "houseInfo"})
                    info = houseinfo.get_text().split('|')
                    # logging.info('houseID: %s houseinfo: %s' % (houseID, info))
                    info_dict.update({'community': info[0]})
                    info_dict.update({'housetype': info[1]})
                    info_dict.update({'square': info[2]})
                    info_dict.update({'direction': info[3]})
                    info_dict.update({'decoration': info[4]})
                    housefloor = name.find("div", {"class": "positionInfo"})
                    info_dict.update({'years': housefloor.get_text().strip()})
                    info_dict.update({'floor': housefloor.get_text().strip()})
                    followInfo = name.find("div", {"class": "followInfo"})
                    info_dict.update(
                        {'followInfo': followInfo.get_text().strip()})
                    taxfree = name.find("span", {"class": "taxfree"})
                    if taxfree is None:
                        info_dict.update({"taxtype": ""})
                    else:
                        info_dict.update(
                            {"taxtype": taxfree.get_text().strip()})
                    totalPrice = name.find("div", {"class": "totalPrice"})
                    info_dict.update(
                        {'totalPrice': totalPrice.span.get_text()})
                    unitPrice = name.find("div", {"class": "unitPrice"})
                    info_dict.update(
                        {'unitPrice': unitPrice.get("data-price")})
                    info_dict.update({'version': setting.DB_VERSION})
                except Exception:
                    # Skip listings whose markup does not match the expected layout.
                    continue
                # Queue the record for the batch MySQL insert below.
                data_source.append(info_dict)
                hisprice_data_source.append({
                    "houseID": info_dict["houseID"],
                    "totalPrice": info_dict["totalPrice"],
                    'version': setting.DB_VERSION
                })
        with model.database.atomic():
            model.Houseinfo.insert_many(data_source).execute()
            model.Hisprice.insert_many(hisprice_data_source).execute()
        time.sleep(1)
def get_community_from_default_all(thread_name, part_no):
    starttime = datetime.datetime.now()
    logging.info('[%s]get_community_from_default_all fetch community info from the full listing' % thread_name)
    url = BASE_URL + "xiaoqu/?from=rec"
    source_code = crawlcore.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')
    total_pages = crawlcore.get_total_pages(soup)
    i = 0
    if total_pages is None:
        raise RuntimeError('url:【%s】total_pages not found' % url)
    # Split the page range across ALL_THREAD_NUMBER workers; the last worker
    # takes any remainder.
    start_no = int(total_pages / ALL_THREAD_NUMBER) * part_no + 1
    if part_no == ALL_THREAD_NUMBER - 1:
        end_no = total_pages + 1
    else:
        end_no = int(total_pages / ALL_THREAD_NUMBER) * (part_no + 1) + 1
    for page in range(start_no, end_no):
        if page > 1:
            sub_url = BASE_URL + "xiaoqu/pg%s" % page
            source_code = crawlcore.get_source_code(sub_url)
            soup = BeautifulSoup(source_code, 'lxml')
        name_list = soup.findAll('li', {'class': 'clear xiaoquListItem'})
        logging.info(
            '[%s] get_community_from_default_all page:%s total_page:%s' %
            (thread_name, page, total_pages))
        for name in name_list:
            info_dict = {}
            try:
                communitytitle = name.find("div", {"class": "title"})
                title = communitytitle.get_text().strip('\n')
                # Deduplicate across threads via a shared Redis set.
                if redis_server.sismember(REDIS_COMMUNITY_SET_NAME, title):
                    continue
                i = i + 1
                redis_server.sadd(REDIS_COMMUNITY_SET_NAME, title)
                link = communitytitle.a.get('href')
                info_dict.update({'title': title})
                info_dict.update({'link': link})
                district = name.find("a", {"class": "district"})
                info_dict.update({'district': district.get_text()})
                bizcircle = name.find("a", {"class": "bizcircle"})
                info_dict.update({'bizcircle': bizcircle.get_text()})
                tagList = name.find("div", {"class": "tagList"})
                info_dict.update({'tagList': tagList.get_text().strip('\n')})
                onsale = name.find("a", {"class": "totalSellCount"})
                info_dict.update(
                    {'onsale': onsale.span.get_text().strip('\n')})
                onrent = name.find("a", {"title": title + u"租房"})
                info_dict.update(
                    {'onrent': onrent.get_text().strip('\n').split(u'套')[0]})
                info_dict.update({'communityid': name.get('data-housecode')})
                price = name.find("div", {"class": "totalPrice"})
                info_dict.update({'price': price.span.get_text().strip('\n')})
                info_dict.update({'version': setting.DB_VERSION})
                communityinfo = get_communityinfo_by_url(link)
                for key, value in communityinfo.items():
                    info_dict.update({key: value})
                with model.database.atomic():
                    model.Community.insert(info_dict).execute()
                time.sleep(1)
            except Exception:
                # Skip communities whose markup does not match the expected layout.
                continue
    endtime = datetime.datetime.now()
    # Thread finished: bump the done counter.
    redis_server.incr(REDIS_COMMUNITY_DONE_COUNT_KEY, 1)
    logging.info("[%s] " % thread_name +
                 "get_community_from_default_all Run time: " +
                 str(endtime - starttime) + " Total:" + str(i))
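# NOTE: ObjectForNone is not defined in this section; it is assumed to live
# elsewhere in the project. The definition below is only a hypothetical
# reconstruction inferred from the call sites in the threaded functions that
# follow: when BeautifulSoup returned None it writes the given default into
# the target dict, otherwise it evaluates the attribute-path string on the tag.
def ObjectForNone(obj, info_dict, key, path, default):
    if obj is None:
        info_dict[key] = default
    else:
        # Evaluate e.g. 'get_text().strip()' or 'span.get_text()' against obj.
        info_dict[key] = eval('obj.' + path)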
def get_house_perregion(thread_name, part_no, link):
    url = BASE_URL + link
    source_code = crawlcore.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')
    total_pages = crawlcore.get_total_pages(soup)
    if total_pages is None:
        raise RuntimeError('url:【%s】total_pages not found' % url)
    # Split the page range across REGION_THREAD_PAGE_NUMBER workers; the last
    # worker takes any remainder.
    start_no = int(total_pages / REGION_THREAD_PAGE_NUMBER) * part_no + 1
    if part_no == REGION_THREAD_PAGE_NUMBER - 1:
        end_no = total_pages + 1
    else:
        end_no = int(
            total_pages / REGION_THREAD_PAGE_NUMBER) * (part_no + 1) + 1
    for page in range(start_no, end_no):
        if page > 1:
            url_page = url + u"/pg%d/" % page
            source_code = crawlcore.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')
        logging.info(
            "[%s]GetHouseByRegionlist district:%s page:%s totalpage:%s" %
            (thread_name, link, page, total_pages))
        data_source = []
        hisprice_data_source = []
        for ultag in soup.findAll("ul", {"class": "sellListContent"}):
            for name in ultag.find_all('li'):
                info_dict = {}
                try:
                    housetitle = name.find("div", {"class": "title"})
                    info_dict.update({'title': housetitle.get_text().strip()})
                    info_dict.update({'link': housetitle.a.get('href')})
                    houseID = housetitle.a.get('data-housecode')
                    if houseID is None:
                        # Some list pages expose the id via a different attribute.
                        houseID = housetitle.a.get(
                            'data-lj_action_housedel_id')
                    info_dict.update({'houseID': houseID})
                    houseinfo = name.find("div", {"class": "houseInfo"})
                    info = houseinfo.get_text().split('|')
                    # logging.info('houseID: %s houseinfo: %s' % (houseID, info))
                    info_dict.update({'community': info[0]})
                    info_dict.update({'housetype': info[1]})
                    info_dict.update({'square': info[2]})
                    info_dict.update({'direction': info[3]})
                    info_dict.update({'decoration': info[4]})
                    housefloor = name.find("div", {"class": "positionInfo"})
                    # Null-safe extraction: fall back to the default when the
                    # tag is missing.
                    ObjectForNone(housefloor, info_dict, 'years',
                                  'get_text().strip()', '')
                    ObjectForNone(housefloor, info_dict, 'floor',
                                  'get_text().strip()', '')
                    followInfo = name.find("div", {"class": "followInfo"})
                    ObjectForNone(followInfo, info_dict, 'followInfo',
                                  'get_text().strip()', '')
                    taxfree = name.find("span", {"class": "taxfree"})
                    ObjectForNone(taxfree, info_dict, 'taxtype',
                                  'get_text().strip()', '')
                    totalPrice = name.find("div", {"class": "totalPrice"})
                    ObjectForNone(totalPrice, info_dict, 'totalPrice',
                                  'span.get_text()', '0')
                    unitPrice = name.find("div", {"class": "unitPrice"})
                    ObjectForNone(unitPrice, info_dict, 'unitPrice',
                                  'get("data-price")', '0')
                    info_dict.update({'version': setting.DB_VERSION})
                except Exception:
                    # Skip listings whose markup does not match the expected layout.
                    continue
                # Queue the record for the batch MySQL insert below.
                data_source.append(info_dict)
                hisprice_data_source.append({
                    "houseID": info_dict["houseID"],
                    "totalPrice": info_dict["totalPrice"],
                    'version': setting.DB_VERSION
                })
        with model.database.atomic():
            model.Houseinfo.insert_many(data_source).execute()
            model.Hisprice.insert_many(hisprice_data_source).execute()
        time.sleep(1)
    # Thread finished: bump the done counter.
    redis_server.incr(REDIS_ERSHOUFANG_DONE_COUNT_KEY, 1)
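# The threaded functions in this module all use the same page-partition
# scheme: total_pages is divided into equal chunks, one per worker, and the
# last worker (part_no == parts - 1) also takes the remainder. A minimal
# standalone illustration of that arithmetic follows; the helper name is made
# up for this sketch and is not used by the crawler itself.
def _example_page_range(total_pages, parts, part_no):
    size = int(total_pages / parts)
    start_no = size * part_no + 1
    if part_no == parts - 1:
        end_no = total_pages + 1
    else:
        end_no = size * (part_no + 1) + 1
    return range(start_no, end_no)

# e.g. with total_pages=100 and parts=3 the workers get pages 1-33, 34-66
# and 67-100 respectively.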
def get_community_from_default_all():
    starttime = datetime.datetime.now()
    logging.info('get_community_from_default_all fetch community info from the full listing')
    url = BASE_URL + "xiaoqu/?from=rec"
    source_code = crawlcore.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')
    total_pages = crawlcore.get_total_pages(soup)
    if total_pages is None:
        raise RuntimeError('url:【%s】total_pages not found' % url)
    for page in range(1, total_pages + 1):
        if page > 1:
            sub_url = BASE_URL + "xiaoqu/pg%s" % page
            source_code = crawlcore.get_source_code(sub_url)
            soup = BeautifulSoup(source_code, 'lxml')
        name_list = soup.findAll('li', {'class': 'clear xiaoquListItem'})
        logging.info('get_community_from_default_all page:%s total_page:%s' %
                     (page, total_pages))
        for name in name_list:
            info_dict = {}
            try:
                communitytitle = name.find("div", {"class": "title"})
                title = communitytitle.get_text().strip('\n')
                # Skip communities that have already been crawled.
                if title in Community_Key_Pool:
                    continue
                Community_Key_Pool.append(title)
                link = communitytitle.a.get('href')
                info_dict.update({'title': title})
                info_dict.update({'link': link})
                district = name.find("a", {"class": "district"})
                info_dict.update({'district': district.get_text()})
                bizcircle = name.find("a", {"class": "bizcircle"})
                info_dict.update({'bizcircle': bizcircle.get_text()})
                tagList = name.find("div", {"class": "tagList"})
                info_dict.update({'tagList': tagList.get_text().strip('\n')})
                onsale = name.find("a", {"class": "totalSellCount"})
                info_dict.update(
                    {'onsale': onsale.span.get_text().strip('\n')})
                onrent = name.find("a", {"title": title + u"租房"})
                info_dict.update(
                    {'onrent': onrent.get_text().strip('\n').split(u'套')[0]})
                info_dict.update({'communityid': name.get('data-housecode')})
                price = name.find("div", {"class": "totalPrice"})
                info_dict.update({'price': price.span.get_text().strip('\n')})
                info_dict.update({'version': setting.DB_VERSION})
                communityinfo = get_communityinfo_by_url(link)
                for key, value in communityinfo.items():
                    info_dict.update({key: value})
                with model.database.atomic():
                    model.Community.insert(info_dict).execute()
                time.sleep(1)
            except Exception:
                # Skip communities whose markup does not match the expected layout.
                continue
    endtime = datetime.datetime.now()
    logging.info("get_community_from_default_all Run time: " +
                 str(endtime - starttime) +
                 " Total:" + str(len(Community_Key_Pool)))
def get_rent_perregion(link):
    url = BASE_URL + link
    source_code = crawlcore.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')
    total_pages = crawlcore.get_total_pages(soup)
    if total_pages is None:
        raise RuntimeError('url:【%s】total_pages not found' % url)
    for page in range(1, total_pages + 1):
        if page > 1:
            url_page = url + u"/pg%d/" % page
            source_code = crawlcore.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')
        logging.info("GetRentByRegionlist district:%s page:%s totalpage:%s" %
                     (link, page, total_pages))
        data_source = []
        for ultag in soup.findAll("ul", {"class": "house-lst"}):
            for name in ultag.find_all('li'):
                info_dict = {}
                try:
                    housetitle = name.find("div", {"class": "info-panel"})
                    info_dict.update(
                        {'title': housetitle.h2.a.get_text().strip()})
                    info_dict.update({'link': housetitle.a.get("href")})
                    houseID = name.get("data-housecode")
                    info_dict.update({'houseID': houseID})
                    region = name.find("span", {"class": "region"})
                    info_dict.update({'region': region.get_text().strip()})
                    zone = name.find("span", {"class": "zone"})
                    info_dict.update({'zone': zone.get_text().strip()})
                    meters = name.find("span", {"class": "meters"})
                    info_dict.update({'meters': meters.get_text().strip()})
                    other = name.find("div", {"class": "con"})
                    info_dict.update({'other': other.get_text().strip()})
                    subway = name.find("span", {"class": "fang-subway-ex"})
                    if subway is None:
                        info_dict.update({'subway': ""})
                    else:
                        info_dict.update(
                            {'subway': subway.span.get_text().strip()})
                    decoration = name.find("span", {"class": "decoration-ex"})
                    if decoration is None:
                        info_dict.update({'decoration': ""})
                    else:
                        info_dict.update(
                            {'decoration': decoration.span.get_text().strip()})
                    heating = name.find("span", {"class": "heating-ex"})
                    if heating is None:
                        info_dict.update({'heating': ""})
                    else:
                        info_dict.update(
                            {'heating': heating.span.get_text().strip()})
                    price = name.find("div", {"class": "price"})
                    info_dict.update(
                        {'price': int(float(price.span.get_text().strip()))})
                    pricepre = name.find("div", {"class": "price-pre"})
                    info_dict.update({'pricepre': pricepre.get_text().strip()})
                    info_dict.update({'version': setting.DB_VERSION})
                except Exception:
                    # Skip listings whose markup does not match the expected layout.
                    continue
                # Queue the record for the batch MySQL insert below.
                data_source.append(info_dict)
        with model.database.atomic():
            model.Rentinfo.insert_many(data_source).execute()
        time.sleep(1)
def get_rent_perregion(thread_name, part_no, link):
    url = BASE_URL + link
    source_code = crawlcore.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')
    total_pages = crawlcore.get_total_pages(soup)
    if total_pages is None:
        raise RuntimeError('url:【%s】total_pages not found' % url)
    # Split the page range across REGION_THREAD_PAGE_NUMBER workers; the last
    # worker takes any remainder.
    start_no = int(total_pages / REGION_THREAD_PAGE_NUMBER) * part_no + 1
    if part_no == REGION_THREAD_PAGE_NUMBER - 1:
        end_no = total_pages + 1
    else:
        end_no = int(
            total_pages / REGION_THREAD_PAGE_NUMBER) * (part_no + 1) + 1
    for page in range(start_no, end_no):
        if page > 1:
            url_page = url + u"/pg%d/" % page
            source_code = crawlcore.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')
        logging.info(
            "[%s]GetRentByRegionlist district:%s page:%s totalpage:%s" %
            (thread_name, link, page, total_pages))
        data_source = []
        for ultag in soup.findAll("ul", {"class": "house-lst"}):
            for name in ultag.find_all('li'):
                info_dict = {}
                try:
                    housetitle = name.find("div", {"class": "info-panel"})
                    info_dict.update(
                        {'title': housetitle.h2.a.get_text().strip()})
                    info_dict.update({'link': housetitle.a.get("href")})
                    houseID = name.get("data-housecode")
                    info_dict.update({'houseID': houseID})
                    region = name.find("span", {"class": "region"})
                    # Null-safe extraction: fall back to the default when the
                    # tag is missing.
                    ObjectForNone(region, info_dict, 'region',
                                  'get_text().strip()', '')
                    zone = name.find("span", {"class": "zone"})
                    ObjectForNone(zone, info_dict, 'zone',
                                  'get_text().strip()', '')
                    meters = name.find("span", {"class": "meters"})
                    ObjectForNone(meters, info_dict, 'meters',
                                  'get_text().strip()', '')
                    other = name.find("div", {"class": "con"})
                    ObjectForNone(other, info_dict, 'other',
                                  'get_text().strip()', '')
                    subway = name.find("span", {"class": "fang-subway-ex"})
                    if subway is None:
                        info_dict.update({'subway': ""})
                    else:
                        info_dict.update(
                            {'subway': subway.span.get_text().strip()})
                    decoration = name.find("span", {"class": "decoration-ex"})
                    if decoration is None:
                        info_dict.update({'decoration': ""})
                    else:
                        info_dict.update(
                            {'decoration': decoration.span.get_text().strip()})
                    heating = name.find("span", {"class": "heating-ex"})
                    if heating is None:
                        info_dict.update({'heating': ""})
                    else:
                        info_dict.update(
                            {'heating': heating.span.get_text().strip()})
                    price = name.find("div", {"class": "price"})
                    info_dict.update(
                        {'price': int(float(price.span.get_text().strip()))})
                    pricepre = name.find("div", {"class": "price-pre"})
                    ObjectForNone(pricepre, info_dict, 'pricepre',
                                  'get_text().strip()', '0')
                    info_dict.update({'version': setting.DB_VERSION})
                except Exception:
                    # Skip listings whose markup does not match the expected layout.
                    continue
                # Queue the record for the batch MySQL insert below.
                data_source.append(info_dict)
        with model.database.atomic():
            model.Rentinfo.insert_many(data_source).execute()
        time.sleep(1)
    # Thread finished: bump the done counter.
    redis_server.incr(REDIS_RENT_DONE_COUNT_KEY, 1)
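# A minimal, hypothetical driver showing how the threaded rent worker above
# could be fanned out with the standard threading module. The real dispatch
# code is not part of this section, so region_links, the helper name and the
# thread naming below are assumptions, not the project's actual entry point.
import threading

def _example_start_rent_threads(region_links):
    threads = []
    for link in region_links:
        for part_no in range(REGION_THREAD_PAGE_NUMBER):
            worker = threading.Thread(
                target=get_rent_perregion,
                args=('rent-%s-%d' % (link, part_no), part_no, link))
            worker.start()
            threads.append(worker)
    for worker in threads:
        worker.join()
    # After join(), REDIS_RENT_DONE_COUNT_KEY should equal len(threads),
    # since each worker calls redis_server.incr(...) once on completion.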