Esempio n. 1
0
    def parseHouse(self, url):
        """Fetch one 58 listing page, parse it and store it if it is recent.

        Args:
            url: Address of the listing page to download.

        Returns:
            bool: False when the response URL contains ``firewall`` (in which
            case ``self.runSpider`` is also set to False); True in every other
            case, meaning the caller may keep the spider running.
        """
        response = self.downloader.download_html_response(url)
        if response is None:
            return True

        # Best-effort read of the first <h1>: a page without one (or a
        # response that cannot be queried) is simply not treated as removed.
        try:
            heading = response.xpath('//h1/text()').extract()[0].strip()
        except Exception:
            heading = ''
        # The pattern is the site's "not on this planet" notice.
        if re.search(r'不在这个星球上', heading):
            print('%s 58网页不存在 %s' % (Utils.getCurrentTime(), url))
            Utils.write_error('58网页不存在 %s' % url)
            return True

        # A response URL containing "firewall" stops the whole spider.
        # The lookup stays best-effort, as in the original code.
        try:
            hit_firewall = re.search(r'firewall', response.url) is not None
        except Exception:
            hit_firewall = False
        if hit_firewall:
            self.runSpider = False
            return False

        if response.status == 404:
            print('%s 58网页不存在 %s' % (Utils.getCurrentTime(), url))
            Utils.write_error('58网页不存在 %s' % url)
            # Return True like every other "skip this page" exit; a bare
            # return (None) is falsy and could be mistaken for "stop".
            return True

        house = House()
        house.url = self.getMobileUrl(response)
        house.title = self.getTitle(response)
        house.image_url = self.getImageUrl(response)
        house.city = self.getCity(response)
        house.district = self.getDistrict(response)
        house.rental = self.getRental(response)
        house.campus = self.getCampus(response)
        house.date = self.getDate(response)
        house.address = self.getAddress(response)
        house.source = self.source
        house.house_type = self.getHouseType(response)
        # Room count is derived from the already parsed house type.
        house.rooms = self.getRooms(house.house_type)
        house.area = self.getArea(response)
        house.floor = self.getFloor(response)
        house.contact = self.getContact(response)
        house.phone = self.getPhone(response)
        house.rent_type = self.getRentType(response)
        # getLatLon receives the partially filled house, so it must run after
        # the fields above have been set.
        house.lat, house.lon = self.getLatLon(house)
        house.md5 = Utils.generateMD5(house.url)
        house.time = Utils.getCurrentTime()

        # Listings older than 7 days are only reported, never stored.
        age = datetime.datetime.now() - datetime.datetime.strptime(
            house.date, '%Y-%m-%d')
        if age.days > 7:
            print('%s 58过期房源:%s;%s;%s' %
                  (Utils.getCurrentTime(), house.date, house.title, house.url))
        elif house.isValidHouse():
            DB_Manager(house)
        return True  # keep this spider running
Esempio n. 2
0
    def parseHouse(self):
        """Download ``self.url``, build a House from it and hand it to DB_Manager.

        The HTTPS downloader is used when ``self.https`` is truthy, the plain
        HTML downloader otherwise. Nothing is stored when the page is reported
        invalid by ``self.isInvalidPage``, when ``self.isMeetFireWall`` fires,
        when the status is 404, or when the listing date is more than 10 days
        old.

        Returns:
            None on every path.
        """
        fetch = (self.downloader.download_https_response
                 if self.https else self.downloader.download_html_response)
        response = fetch(self.url)

        if self.isInvalidPage(response):
            return

        if self.isMeetFireWall(response):
            # Stop the spider and record the URL under this spider's name
            # (presumably so it can be retried later -- confirm with
            # RedisUtils).
            self.runSpider = False
            RedisUtils().add_to_redis(self.name, self.url)
            return

        if response.status == 404:
            # Message text: "<time>><source> page does not exist <url>".
            missing_fmt = '%s>%s网页不存在 %s'
            print(missing_fmt % (Utils.getCurrentTime(), self.source, self.url))
            Utils.write_error(
                missing_fmt % (Utils.getCurrentTime(), self.source, self.url))
            return

        listing = House()
        listing.url = self.getMobileUrl(response)
        listing.title = self.getTitle(response)
        listing.image_url = self.getImageUrl(response)
        listing.city = self.getCity(response)
        listing.district = self.getDistrict(response)
        listing.rental = self.getRental(response)
        listing.campus = self.getCampus(response)
        listing.date = self.getDate(response)
        listing.address = self.getAddress(response)
        listing.source = self.source
        listing.house_type = self.getHouseType(response)
        # Room count comes from the house type parsed just above.
        listing.rooms = self.getRooms(listing.house_type)
        listing.area = self.getArea(response)
        listing.floor = self.getFloor(response)
        listing.contact = self.getContact(response)
        listing.phone = self.getPhone(response)
        listing.rent_type = self.getRentType(response)
        # getLatLon is given the partially populated listing.
        listing.lat, listing.lon = self.getLatLon(listing)
        listing.md5 = Utils.generateMD5(listing.url)
        listing.time = Utils.getCurrentTime()

        current = datetime.datetime.now()
        published = datetime.datetime.strptime(listing.date, '%Y-%m-%d')
        if (current - published).days <= 10:
            DB_Manager(listing)
            return

        # Older than 10 days: report the expired listing instead of storing it.
        print('%s 58过期房源:%s;%s;%s' % (
            Utils.getCurrentTime(), listing.date, listing.title, listing.url))
        return
    def parseHouse(self, url):
        """Download a Ganji listing page, parse it and store it if recent.

        Args:
            url: Address of the listing page to download.

        Returns:
            bool: False when ``self.checkVeriCode`` reports a verification
            page (``self.runSpider`` is then set to False); True on every
            other path (download failure, 404, expired or processed listing).
        """
        page = self.downloader.download_html_response(url)

        if page is None:
            # Message text: "Ganji page download failed".
            Utils.write_error('赶集网下载网页失败 %s' % url)
            print('%s 赶集网下载网页失败%s' % (Utils.getCurrentTime(), url))
            return True

        if self.checkVeriCode(page):
            # Message text: "Ganji hit a verification code".
            Utils.write_error('赶集网遇到验证码')
            print('%s 赶集网遇到验证码' % Utils.getCurrentTime())
            self.runSpider = False
            return False

        if page.status == 404:
            # Message text: "Ganji page does not exist".
            Utils.write_error('赶集网网页不存在 %s' % url)
            print('%s 赶集网网页不存在 %s' % (Utils.getCurrentTime(), url))
            return True

        listing = House()
        listing.url = self.getMobileUrl(page)
        print(listing.url)
        listing.title = self.getTitle(page)
        listing.image_url = self.getImageUrl(page)
        listing.city = self.getCity(page)
        listing.district = self.getDistrict(page)
        listing.rental = self.getRental(page)
        listing.campus = self.getCampus(page)
        listing.date = self.getDate(page)
        listing.source = self.source
        listing.house_type = self.getHouseType(page)
        # Room count comes from the house type parsed just above.
        listing.rooms = self.getRooms(listing.house_type)
        listing.area = self.getArea(page)
        listing.floor = self.getFloor(page)
        listing.contact = self.getContact(page)
        listing.phone = self.getPhone(page)
        listing.rent_type = self.getRentType(page)
        # Here getLatLon also supplies the address, unlike the page parsers.
        listing.lat, listing.lon, listing.address = self.getLatLon(listing)
        listing.md5 = Utils.generateMD5(listing.url)
        listing.time = Utils.getCurrentTime()

        current = datetime.datetime.now()
        published = datetime.datetime.strptime(listing.date, '%Y-%m-%d')
        age_in_days = (current - published).days
        if age_in_days <= 7:
            if listing.isValidHouse():
                DB_Manager(listing)
        else:
            # Older than a week: report the expired listing, do not store it.
            print('%s 赶集网过期房源:%s;%s;%s' % (
                Utils.getCurrentTime(), listing.date, listing.title,
                listing.url))
        return True
    def parseHouse(self, url):
        """Download an Anjuke listing page, parse it and store it if recent.

        Args:
            url: Address of the listing page to download over HTTPS.

        Returns:
            bool: False when the page body contains the access-verification
            marker (``self.runSpider`` is set to False and the response URL is
            passed to ``self.redis.add_to_redis``); True on every other path.
        """
        response = self.downloader.download_https_response(url)
        if response is None:
            return True

        # Decode leniently: one invalid byte in the page must not raise
        # UnicodeDecodeError and abort the crawl before the check below.
        page_text = response.body.decode('utf-8', errors='replace')
        # The marker is the site's "access verification" notice.
        if '访问验证' in page_text:
            self.runSpider = False
            self.redis.add_to_redis("SpiderAnjuke", response.url)
            return False

        if response.status == 404:
            # Message text: "Anjuke page does not exist".
            print('%s 安居客网页不存在 %s' % (Utils.getCurrentTime(), url))
            Utils.write_error('安居客网页不存在 %s' % url)
            return True

        house = House()
        house.url = self.getMobileUrl(response)
        house.title = self.getTitle(response)
        house.image_url = self.getImageUrl(response)
        house.city = self.getCity(response)
        house.district = self.getDistrict(response)
        house.rental = self.getRental(response)
        house.campus = self.getCampus(response)
        house.date = self.getDate(response)
        house.address = self.getAddress(response)
        house.source = self.source
        house.house_type = self.getHouseType(response)
        # Room count is derived from the already parsed house type.
        house.rooms = self.getRooms(house.house_type)
        house.area = self.getArea(response)
        house.floor = self.getFloor(response)
        house.contact = self.getContact(response)
        house.phone = self.getPhone(response)
        house.rent_type = self.getRentType(response)
        # getLatLon receives the partially filled house and its third result
        # replaces the address parsed from the page above.
        house.lat, house.lon, house.address = self.getLatLon(house)
        house.md5 = Utils.generateMD5(house.url)
        house.time = Utils.getCurrentTime()

        # Listings older than 7 days are only reported, never stored.
        age = datetime.datetime.now() - datetime.datetime.strptime(
            house.date, '%Y-%m-%d')
        if age.days > 7:
            print('%s 安居客过期房源:%s;%s;%s' %
                  (Utils.getCurrentTime(), house.date, house.title, house.url))
        elif house.isValidHouse():
            DB_Manager(house)
        return True
Esempio n. 5
0
 def write_house(self, house):
     """Insert one listing into its city table, or refresh an existing row.

     Listings whose ``date`` is more than ``self.expire_days`` days old are
     skipped before any other work is done. Otherwise a row is written to
     the table returned by ``self.get_city_pinyin_table(city)``; on a
     duplicate key only ``time``, ``date`` and ``rental`` are updated. The
     statement is committed immediately.

     Args:
         house: Listing record. Fields are read by subscript
             (``house['campus']`` etc.) except ``title``, which is read as
             an attribute.

     Returns:
         None.

     Raises:
         ValueError: If ``house['date']`` is not in ``%Y-%m-%d`` format.
     """
     # Function-scope import kept from the original block; the file-level
     # imports are not visible from here.
     import datetime

     date = house['date']
     age = datetime.datetime.now() - datetime.datetime.strptime(
         date, '%Y-%m-%d')
     if age.days > self.expire_days:
         # Expired: return before hashing, district lookup or DB access.
         return

     # NOTE(review): title is the only field read as an attribute; confirm
     # the house type really supports both access styles.
     title = house.title
     campus = house['campus']
     url = house['url']
     image_url = house['image_url']
     rental = house['rental']
     if isinstance(rental, float):
         # Float rentals are truncated to whole numbers before storing.
         rental = int(rental)
     area = house['area']
     house_type = house['house_type']
     source = house['source']
     rent_type = house['rent_type']
     floor = house['floor']
     address = house['address']
     district = self.check_guangzhou_district(house['district'])
     city = house['city']
     lat = house['lat']
     lon = house['lon']
     md5 = Utils.generateMD5(url)
     try:
         rooms = house['rooms']
     except (LookupError, AttributeError):
         # Record carries no room count: derive it from the house type.
         rooms = Utils.get_rooms(house_type)
     written_at = Utils.getCurrentTime()

     # The table name is concatenated because SQL placeholders cannot bind
     # identifiers; every value goes through placeholders.
     # NOTE(review): confirm get_city_pinyin_table only returns known names.
     table = self.get_city_pinyin_table(city)
     sql = "insert into " + table + "(rental, title, campus, house_type, date, rent_type, area, floor, district, address, url,image_url, source, city, lat, lon, time, rooms, md5) values( %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) on duplicate key update time = %s, date = %s, rental = %s"
     self.cursor.execute(
         sql, (rental, title, campus, house_type, date, rent_type, area,
               floor, district, address, url, image_url, source, city, lat,
               lon, written_at, rooms, md5, written_at, date, rental))
     self.conn.commit()
     # Message text: "wrote new listing to the cloud database".
     print(Utils.getCurrentTime(), '在云端数据库中写入新房源:' + city, campus, url)
 def sadd(self, name, url):
     """Add the MD5 of ``url`` to the ``HouseMasterSpider:<name>_filter`` set.

     Args:
         name: Value substituted into the set key (presumably the spider
             name -- confirm with callers).
         url: URL whose ``Utils.generateMD5`` digest is stored.

     Returns:
         Whatever ``self.conn.sadd`` returns for the key/value pair.
     """
     filter_key = 'HouseMasterSpider:%s_filter' % name
     fingerprint = Utils.generateMD5(url)
     return self.conn.sadd(filter_key, fingerprint)