def parseHouse(self, url):
    """Download one 58 listing page, parse it and store the listing if valid.

    Args:
        url: Address of the listing detail page to download.

    Returns:
        False when the response URL indicates a firewall page (in that case
        ``self.runSpider`` is also set to False); True in every other case,
        which tells the caller to keep this spider running.

    Raises:
        ValueError: If the extracted listing date is not in ``%Y-%m-%d`` form.
    """
    response = self.downloader.download_html_response(url)
    if response is None:
        return True

    # Best-effort probe for the "page does not exist" banner in the first
    # <h1>. Pages without an <h1> (IndexError) or with an unexpected selector
    # result are simply treated as "banner not present".
    try:
        heading = response.xpath('//h1/text()').extract()[0].strip()
    except Exception:
        heading = ''
    if re.search(r'不在这个星球上', heading):
        print('%s 58网页不存在 %s' % (Utils.getCurrentTime(), url))
        Utils.write_error('58网页不存在 %s' % url)
        return True

    # Being redirected to a firewall URL stops the whole spider.
    try:
        blocked = re.search(r'firewall', response.url) is not None
    except (AttributeError, TypeError):
        # Missing or non-string URL: nothing to match against.
        blocked = False
    if blocked:
        self.runSpider = False
        return False

    if response.status == 404:
        print('%s 58网页不存在 %s' % (Utils.getCurrentTime(), url))
        Utils.write_error('58网页不存在 %s' % url)
        # A missing page is not fatal; report "keep running" like the other
        # non-fatal exits instead of falling through with None.
        return True

    house = House()
    house.url = self.getMobileUrl(response)
    house.title = self.getTitle(response)
    house.image_url = self.getImageUrl(response)
    house.city = self.getCity(response)
    house.district = self.getDistrict(response)
    house.rental = self.getRental(response)
    house.campus = self.getCampus(response)
    house.date = self.getDate(response)
    house.address = self.getAddress(response)
    house.source = self.source
    house.house_type = self.getHouseType(response)
    # The room count is derived from the already extracted house type.
    house.rooms = self.getRooms(house.house_type)
    house.area = self.getArea(response)
    house.floor = self.getFloor(response)
    house.contact = self.getContact(response)
    house.phone = self.getPhone(response)
    house.rent_type = self.getRentType(response)
    # getLatLon receives the partially filled house, so it must run after
    # the fields above have been set.
    house.lat, house.lon = self.getLatLon(house)
    house.md5 = Utils.generateMD5(house.url)
    house.time = Utils.getCurrentTime()

    # Listings dated more than 7 days ago are only reported, never stored.
    age = datetime.datetime.now() - datetime.datetime.strptime(
        house.date, '%Y-%m-%d')
    if age.days > 7:
        print('%s 58过期房源:%s;%s;%s' %
              (Utils.getCurrentTime(), house.date, house.title, house.url))
    elif house.isValidHouse():
        DB_Manager(house)
    return True  # keep this spider running
def parseHouse(self):
    """Download ``self.url``, parse the listing and pass it to ``DB_Manager``.

    The page is fetched with the HTTPS downloader when ``self.https`` is
    truthy and with the plain HTML downloader otherwise. Parsing stops early
    for invalid pages, firewall pages (the URL is queued in redis and
    ``self.runSpider`` is cleared), 404 responses and listings dated more
    than 10 days ago.

    Returns:
        None in every case.
    """
    if self.https:
        response = self.downloader.download_https_response(self.url)
    else:
        response = self.downloader.download_html_response(self.url)

    if self.isInvalidPage(response):
        return

    if self.isMeetFireWall(response):
        # Stop the spider and remember the URL under this spider's name.
        self.runSpider = False
        RedisUtils().add_to_redis(self.name, self.url)
        return

    if response.status == 404:
        missing = '%s>%s网页不存在 %s'
        print(missing % (Utils.getCurrentTime(), self.source, self.url))
        Utils.write_error(
            missing % (Utils.getCurrentTime(), self.source, self.url))
        return

    house = House()

    # Fields filled straight from the response, in their required order.
    for field, getter in (
        ('url', 'getMobileUrl'),
        ('title', 'getTitle'),
        ('image_url', 'getImageUrl'),
        ('city', 'getCity'),
        ('district', 'getDistrict'),
        ('rental', 'getRental'),
        ('campus', 'getCampus'),
        ('date', 'getDate'),
        ('address', 'getAddress'),
    ):
        setattr(house, field, getattr(self, getter)(response))

    house.source = self.source
    house.house_type = self.getHouseType(response)
    # The room count is derived from the house type extracted just above.
    house.rooms = self.getRooms(house.house_type)

    for field, getter in (
        ('area', 'getArea'),
        ('floor', 'getFloor'),
        ('contact', 'getContact'),
        ('phone', 'getPhone'),
        ('rent_type', 'getRentType'),
    ):
        setattr(house, field, getattr(self, getter)(response))

    # getLatLon is given the partially populated house object.
    latitude, longitude = self.getLatLon(house)
    house.lat = latitude
    house.lon = longitude
    house.md5 = Utils.generateMD5(house.url)
    house.time = Utils.getCurrentTime()

    now = datetime.datetime.now()
    published = datetime.datetime.strptime(house.date, '%Y-%m-%d')
    if (now - published).days > 10:
        print('%s 58过期房源:%s;%s;%s' %
              (Utils.getCurrentTime(), house.date, house.title, house.url))
        return

    DB_Manager(house)
def parseHouse(self, url):
    """Download one Ganji listing page, parse it and store the listing if valid.

    Args:
        url: Address of the listing detail page to download.

    Returns:
        False when ``self.checkVeriCode`` reports a verification-code page
        (``self.runSpider`` is cleared as well); True in every other case,
        including download failures, 404 pages and expired listings.
    """
    response = self.downloader.download_html_response(url)

    if response is None:
        Utils.write_error('赶集网下载网页失败 %s' % url)
        print('%s 赶集网下载网页失败%s' % (Utils.getCurrentTime(), url))
        return True

    if self.checkVeriCode(response):
        # A verification-code page stops the whole spider.
        Utils.write_error('赶集网遇到验证码')
        print('%s 赶集网遇到验证码' % Utils.getCurrentTime())
        self.runSpider = False
        return False

    if response.status == 404:
        Utils.write_error('赶集网网页不存在 %s' % url)
        print('%s 赶集网网页不存在 %s' % (Utils.getCurrentTime(), url))
        return True

    house = House()
    house.url = self.getMobileUrl(response)
    print(house.url)

    # Fields filled straight from the response, in their required order.
    for field, getter in (
        ('title', 'getTitle'),
        ('image_url', 'getImageUrl'),
        ('city', 'getCity'),
        ('district', 'getDistrict'),
        ('rental', 'getRental'),
        ('campus', 'getCampus'),
        ('date', 'getDate'),
    ):
        setattr(house, field, getattr(self, getter)(response))

    house.source = self.source
    house.house_type = self.getHouseType(response)
    # The room count is derived from the house type extracted just above.
    house.rooms = self.getRooms(house.house_type)

    for field, getter in (
        ('area', 'getArea'),
        ('floor', 'getFloor'),
        ('contact', 'getContact'),
        ('phone', 'getPhone'),
        ('rent_type', 'getRentType'),
    ):
        setattr(house, field, getattr(self, getter)(response))

    # Here getLatLon also supplies the address, not only the coordinates.
    latitude, longitude, address = self.getLatLon(house)
    house.lat = latitude
    house.lon = longitude
    house.address = address
    house.md5 = Utils.generateMD5(house.url)
    house.time = Utils.getCurrentTime()

    now = datetime.datetime.now()
    published = datetime.datetime.strptime(house.date, '%Y-%m-%d')
    if (now - published).days > 7:
        # Listings dated more than 7 days ago are reported but not stored.
        print('%s 赶集网过期房源:%s;%s;%s' %
              (Utils.getCurrentTime(), house.date, house.title, house.url))
        return True

    if house.isValidHouse():
        DB_Manager(house)
    return True
def parseHouse(self, url):
    """Download one Anjuke listing page, parse it and store the listing if valid.

    Args:
        url: Address of the listing detail page to download.

    Returns:
        False when the page body contains the access-verification marker
        (the response URL is queued in redis under "SpiderAnjuke" and
        ``self.runSpider`` is cleared); True in every other case.

    Raises:
        ValueError: If the extracted listing date is not in ``%Y-%m-%d`` form.
    """
    response = self.downloader.download_https_response(url)
    if response is None:
        return True

    # Decode leniently: a single mis-encoded byte must not abort the parse
    # with UnicodeDecodeError before the verification check can run.
    page_text = response.body.decode('utf-8', errors='replace')
    if re.search(r'访问验证', page_text):
        self.runSpider = False
        self.redis.add_to_redis("SpiderAnjuke", response.url)
        return False

    if response.status == 404:
        print('%s 安居客网页不存在 %s' % (Utils.getCurrentTime(), url))
        Utils.write_error('安居客网页不存在 %s' % url)
        return True

    house = House()
    house.url = self.getMobileUrl(response)
    house.title = self.getTitle(response)
    house.image_url = self.getImageUrl(response)
    house.city = self.getCity(response)
    house.district = self.getDistrict(response)
    house.rental = self.getRental(response)
    house.campus = self.getCampus(response)
    house.date = self.getDate(response)
    # NOTE(review): this address is overwritten by getLatLon below;
    # presumably getLatLon reads it from the house first - confirm.
    house.address = self.getAddress(response)
    house.source = self.source
    house.house_type = self.getHouseType(response)
    # The room count is derived from the already extracted house type.
    house.rooms = self.getRooms(house.house_type)
    house.area = self.getArea(response)
    house.floor = self.getFloor(response)
    house.contact = self.getContact(response)
    house.phone = self.getPhone(response)
    house.rent_type = self.getRentType(response)
    house.lat, house.lon, house.address = self.getLatLon(house)
    house.md5 = Utils.generateMD5(house.url)
    house.time = Utils.getCurrentTime()

    # Listings dated more than 7 days ago are only reported, never stored.
    age = datetime.datetime.now() - datetime.datetime.strptime(
        house.date, '%Y-%m-%d')
    if age.days > 7:
        print('%s 安居客过期房源:%s;%s;%s' %
              (Utils.getCurrentTime(), house.date, house.title, house.url))
    elif house.isValidHouse():
        DB_Manager(house)
    return True
def write_house(self, house):
    """Insert one listing into its city's table, or refresh it if it exists.

    Listings whose date is more than ``self.expire_days`` days in the past
    are skipped. On a duplicate key only ``time``, ``date`` and ``rental``
    are updated.

    Args:
        house: Listing record; ``title`` is read as an attribute and every
            other field by key (``house['campus']`` etc.).

    Returns:
        None.

    Raises:
        ValueError: If ``house['date']`` is not in ``%Y-%m-%d`` form.
    """
    import datetime

    title = house.title
    campus = house['campus']
    url = house['url']
    image_url = house['image_url']
    rental = house['rental']
    if isinstance(rental, float):
        # Float rentals are truncated to an integer before being written.
        rental = int(rental)
    area = house['area']
    house_type = house['house_type']
    source = house['source']
    date = house['date']
    rent_type = house['rent_type']
    floor = house['floor']
    address = house['address']
    district = self.check_guangzhou_district(house['district'])
    city = house['city']
    lat = house['lat']
    lon = house['lon']
    md5 = Utils.generateMD5(url)
    try:
        rooms = house['rooms']
    except Exception:
        # No usable room count on the record: derive it from the house type.
        rooms = Utils.get_rooms(house_type)
    timestamp = Utils.getCurrentTime()

    age = datetime.datetime.now() - datetime.datetime.strptime(
        date, '%Y-%m-%d')
    if age.days > self.expire_days:
        return

    table = self.get_city_pinyin_table(city)
    # NOTE(review): the table name is concatenated into the statement;
    # presumably get_city_pinyin_table only returns trusted names - confirm.
    # All values go through driver placeholders.
    sql = (
        "insert into " + table +
        "(rental, title, campus, house_type, date, rent_type, area, floor, "
        "district, address, url,image_url, source, city, lat, lon, time, "
        "rooms, md5) "
        "values( %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, "
        "%s, %s, %s, %s) "
        "on duplicate key update time = %s, date = %s, rental = %s"
    )
    self.cursor.execute(
        sql,
        (rental, title, campus, house_type, date, rent_type, area, floor,
         district, address, url, image_url, source, city, lat, lon,
         timestamp, rooms, md5,
         # Values for the "on duplicate key update" clause.
         timestamp, date, rental))
    self.conn.commit()
    print(Utils.getCurrentTime(), '在云端数据库中写入新房源:' + city, campus, url)
def sadd(self, name, url):
    """Add the MD5 of ``url`` to the filter set kept for spider ``name``.

    Args:
        name: Spider name used to build the set key.
        url: URL whose MD5 digest is stored as the set member.

    Returns:
        Whatever ``self.conn.sadd`` returns for the key/member pair.
    """
    filter_key = 'HouseMasterSpider:%s_filter' % name
    fingerprint = Utils.generateMD5(url)
    result = self.conn.sadd(filter_key, fingerprint)
    return result