def crawGanji(url):
    myLogger.logger.info("\n\n=========== Listings from Ganji ==============")
    r = requests.get(url)
    contents = BeautifulSoup(r.content, "html.parser")  # parse the list page
    listings = contents.find_all("div", class_="ershoufang-list")
    global second
    second = listings[0].find("dd", class_="title").find("a").text
    infoList = []
    for e in listings:
        ele = e.find("dd", class_="title").find("a")
        # listing description on Ganji
        description = deleteallspecialcharacters(ele.text)
        # after the Ganji redesign the href carries a long trailing string; keep only the valid part
        url = getvalidhref(ele["href"])
        price = e.find("dd", class_="info").find("div", class_="price").find(
            "span", class_="js-price").text
        # strip the boilerplate "徐东二手房出售" suffix from the location text
        location = deleteallspecialcharacters(
            e.find("dd", class_="address").find("span", class_="area").text).replace(
                "徐东二手房出售", "")
        myLogger.logger.info(description + " , url: " + url + " , price: " + price + " , location: " + location)
        info = Info(description=description,
                    url=url,
                    price=price,
                    website_id=2,
                    time=currentTime(),
                    location=location)
        infoList.append(info)
    batchInsertInfo(infoList)
def crawl58(url):
    """
    :param url: URL of the 58.com list page to crawl
    :return: None

    Remaining work for this module:
    1. Optimize the check for whether the data has been refreshed (already solved for SouFun and Ganji)
    2. Rework the MySQL storage format (done)
    3. Add the fields missing from the database (done)
    4. Handle site redesigns promptly; ideally make the selectors dynamically configurable
       (see the selector-config sketch after this function)
    5. Improve the matching algorithm; the old global matching was outdated and its accuracy
       was close to zero (optimized)
    """
    myLogger.logger.info("\n\n=========== Listings from 58.com ==============")
    r = requests.get(url)
    contents = BeautifulSoup(r.content, "html.parser")  # parse the list page
    tds = contents.find_all("td", class_="t")  # every <td class="t"> holds one listing
    global first
    first = tds[0].find("a").text
    infoList = []
    for e in tds:
        a = e.find("a")
        desc = deleteallspecialcharacters(a.text)
        left = e.find("div", class_="qj-listleft")
        position = left.a.text
        myLogger.logger.info("URL: " + a["href"] + " description: " + desc)
        right = e.find("div", class_="qj-listright")
        price = right.b.text
        dealingtime = currentTime()
        # publication time of the listing: strip every non-digit character
        sourcetime = re.sub(
            "[^0-9]", "",
            deleteallspecialcharacters(
                left.find("span", class_="qj-listjjr").text))
        # some sellers list a price range instead of a number; not handled yet, set those to 0
        try:
            price = int(price)
        except ValueError:
            price = 0
        myLogger.logger.info("price: " + right.b.text)
        info = Info(description=desc,
                    url=deletedomain(a["href"]),
                    price=price,
                    website_id=1,
                    time=dealingtime,
                    sourcetime=sourcetime,
                    location=position)
        # single-row inserts were too slow, so collect everything in a list and batch-insert below
        infoList.append(info)
    batchInsertInfo(infoList)
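# The fourth TODO in crawl58's docstring asks for a way to absorb site redesigns without
# editing crawler code. Below is a minimal sketch of one possible approach, assuming the
# selectors for each site are kept in a plain dict so that only configuration changes when
# a page layout changes. SELECTORS, its keys, and crawl_with_config() are hypothetical names
# introduced only for illustration; the crawlers above do not use them.
SELECTORS = {
    # assumed selector set for the 58.com list page; adjust when the markup changes
    "58": {
        "item": ("td", "t"),        # (tag, class) of one listing container
        "title_link": ("a", None),  # element whose text is the description and whose href is the URL
    },
}


def crawl_with_config(url, site):
    """Hypothetical sketch: fetch a list page and yield (description, href) pairs
    using configured selectors instead of hard-coded ones."""
    conf = SELECTORS[site]
    page = BeautifulSoup(requests.get(url).content, "html.parser")
    item_tag, item_class = conf["item"]
    for item in page.find_all(item_tag, class_=item_class):
        link_tag, link_class = conf["title_link"]
        link = item.find(link_tag, class_=link_class) if link_class else item.find(link_tag)
        if link is not None:
            yield link.text.strip(), link.get("href", "")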
def crawyifang(url):
    r = requests.get(url)
    contents = BeautifulSoup(r.content, "html.parser")
    allhouses = contents.find_all("div", class_="ohcon-list")
    global first
    first = deleteallspecialcharacters(
        allhouses[0].find("div", class_="ohclist-ctx").find("a").text)
    yifanginfolist = list()
    for h in allhouses:
        basic = h.find("div", class_="ohclist-ctx").find("a")
        description = deleteallspecialcharacters(basic.text)
        url = basic["href"]
        location = h.find("div", class_="ohclist-ctx").find("p", class_="addr").text
        sourcetime = deleteallspecialcharacters(
            h.find("div", class_="ohclist-ctx").find("span", class_="time").text)
        myLogger.logger.info("description: " + description + "\turl: " + url +
                             "\tlocation: " + location + "\ttime: " + sourcetime)
        # the price is given in units of 万 (10,000 yuan); strip the unit character
        price = h.find("div", class_="ohclist-price").find("span", class_="pnum").text.replace("万", "")
        # timestamp of this crawl
        currenttime = datetime.now()
        info = Info(description=description,
                    url=deletedomain(url),
                    price=price,
                    website_id=4,
                    time=currenttime,
                    sourcetime=sourcetime,
                    location=location)
        yifanginfolist.append(info)
    # batch-insert the Info entities
    batchInsertInfo(yifanginfolist)
def crawSouFang(url):
    myLogger.logger.info("\n\n=========== Listings from SouFun ==============")
    r = requests.get(url)
    contents = BeautifulSoup(r.content, "html.parser")  # parse the list page
    parent = contents.find("div", class_="build_list")
    # promoted listings
    commercial = parent.find("div", class_="backColor")
    # detail blocks of every promoted listing
    allCommercialHouses = commercial.find_all("dl")
    # only the <dl> elements that are direct children of the parent div (non-promoted listings)
    allNonCommercialHouses = parent.find_all("dl", recursive=False)
    global third
    infoList = []
    # third holds a pair: the newest promoted listing and the newest non-promoted one
    third_one = deleteallspecialcharacters(allCommercialHouses[0].find(
        "dd", class_="margin_l").find("p", class_="build_name").find("a").text)
    third_two = deleteallspecialcharacters(allNonCommercialHouses[0].find(
        "dd", class_="margin_l").find("p", class_="build_name").find("a").text)
    third = [third_one, third_two]
    myLogger.logger.info("promoted listings: " + str(len(allCommercialHouses)) +
                         " , non-promoted private listings: " + str(len(allNonCommercialHouses)))
    myLogger.logger.info("\t\t========= all promoted private listings on SouFun =============")
    # all the promoted listings
    for a in allCommercialHouses:
        basic = a.find("dd", class_="margin_l").find("p", class_="build_name").find("a")
        description = deleteallspecialcharacters(basic.text)
        # getDomain is not finished yet, so the raw href is stored
        url = basic["href"]
        price = a.find("dd", class_="right price_r").find(
            "p", class_="build_price").find("span").text
        position = deleteallspecialcharacters(
            a.find("dd", class_="margin_l").find("p", class_="finish_data").text)
        myLogger.logger.info(description + " , url: " + url + " , price: " + price + " , location: " + position)
        info = Info(description=description,
                    url=url,
                    price=price,
                    website_id=3,
                    time=currentTime(),
                    location=position)
        infoList.append(info)
    myLogger.logger.info("\t\t========= all non-promoted private listings =============")
    # all the non-promoted private listings
    for a in allNonCommercialHouses:
        basic = a.find("dd", class_="margin_l").find("p", class_="build_name").find("a")
        description = deleteallspecialcharacters(basic.text)
        position = deleteallspecialcharacters(
            a.find("dd", class_="margin_l").find("p", class_="finish_data").text)
        # getDomain is not finished yet, so the raw href is stored
        url = basic["href"]
        price = a.find("dd", class_="right price_r").find(
            "p", class_="build_price").find("span").text
        myLogger.logger.info(description + " , url: " + url + " , price: " + price + " , location: " + position)
        info = Info(description=description,
                    url=url,
                    price=price,
                    website_id=3,
                    time=currentTime(),
                    location=position)
        infoList.append(info)
    batchInsertInfo(infoList)
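# A minimal usage sketch, assuming the four crawlers above are invoked from this module's
# entry point. The URLs below are placeholders, not real list-page endpoints from the project;
# the actual URLs (and any city or district filters) live in the project's own configuration.
if __name__ == "__main__":
    # hypothetical example URLs for illustration only
    crawl58("https://example.com/58/ershoufang/")
    crawGanji("https://example.com/ganji/fang5/")
    crawSouFang("https://example.com/soufun/esf/")
    crawyifang("https://example.com/yifang/list/")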