コード例 #1
0
ファイル: liferealty.py プロジェクト: olegarch/hodim
    def parse_item_page(self, response):
        """Parse a liferealty listing page and yield one loaded item.

        The room count is derived from the listing heading; areas, floors,
        address and building parameters are pulled out of the ``card_block``
        sections with regular expressions; the price text is converted to a
        number and multiplied by 1000 before it is stored.

        Args:
            response: the downloaded listing page; ``response.url`` and
                ``response.xpath()`` are used.

        Yields:
            The item returned by ``ApartmentLoader.load_item()``.

        Raises:
            ValueError: if the heading is missing, the heading names no known
                apartment type, or the price text cannot be parsed as a number.
        """
        loader = ApartmentLoader(Apartment(), response)
        loader.add_value('url', response.url)

        headings = response.xpath('//div[@id="list_sale"]/div[@class="fav"]/following-sibling::h1/text()').extract()
        if not headings:
            raise ValueError("listing heading not found on %s" % response.url)
        heading = headings[0].lower()

        # Checked in this order; the first marker found in the heading wins.
        room_markers = (
            (u'гостинка', 0),
            (u'комната', 0),
            (u'однокомн', 1),
            (u'двухком', 2),
            (u'трехкомн', 3),
            (u'четырехко', 4),
            (u'пятикомнатна', 5),
        )
        for marker, rooms in room_markers:
            if marker in heading:
                break
        else:
            # Explicit raise instead of ``assert False``: asserts are stripped
            # under ``python -O`` and would let ``rooms`` stay undefined.
            self.logger.error("ERROR type %s", headings[0])
            raise ValueError("unknown apartment type: %r" % headings[0])
        loader.add_value('rooms', rooms)

        # "Общая площадь: N м" carries only the total area, while
        # "Площадь: a/b/c м" carries three numbers mapped to m2/restm2/kitchenm2.
        area_xpath = u'//div[@id="list_sale"]/div[@class="card_block"]/p[contains(.,"Площадь")]/text()'
        loader.add_xpath('m2', u'//div[@id="list_sale"]/div[@class="card_block"]/p[contains(.,"Общая площадь")]/text()', re=u'Общая площадь: (\d*\.\d+|\d+) м')
        loader.add_xpath('m2', area_xpath, re=u'Площадь: (\d*\.\d+|\d+)/(?:\d*\.\d+|\d+)/(?:\d*\.\d+|\d+) м')
        loader.add_xpath('kitchenm2', area_xpath, re=u'Площадь: (?:\d*\.\d+|\d+)/(?:\d*\.\d+|\d+)/(\d*\.\d+|\d+) м')
        loader.add_xpath('restm2', area_xpath, re=u'Площадь: (?:\d*\.\d+|\d+)/(\d*\.\d+|\d+)/(?:\d*\.\d+|\d+) м')

        # "Этаж: floor/total"
        floor_xpath = u'//div[@id="list_sale"]/div[@class="card_block"]/p[contains(text(),"Этаж")]/text()'
        loader.add_xpath('floor', floor_xpath, re=u'Этаж: (\d+)/\d+')
        loader.add_xpath('totfloors', floor_xpath, re=u'Этаж: \d+/(\d+)')

        # Location block: city, district and street address.
        location_xpath = u'//div[@id="list_sale"]/div[@class="card_block" and contains(.,"Местонахождение")]/text()'
        loader.add_xpath('city', location_xpath, re=u'Населенный пункт:\s+([\w-]+)')
        loader.add_xpath('district', location_xpath, re=u'Район:\s+(\w+)')
        loader.add_xpath('street', location_xpath, re=u'Адрес:\s+(.*)')

        # Parameters block: condition, building type, balcony, bathroom.
        params_xpath = u'//div[@id="list_sale"]/div[@class="card_block" and contains(.,"Параметры")]/descendant-or-self::*/text()'
        loader.add_xpath('rennovation', params_xpath, re=u'Состояние помещения:\s+(.*)')
        loader.add_xpath('walls', params_xpath, re=u'Тип дома:\s+(.*)')
        loader.add_xpath('balcony', params_xpath, re=u'Балкон:\s+(.*)')
        loader.add_xpath('wc', params_xpath, re=u'Санузел:\s+(.*)')

        loader.add_xpath('description', u'//div[@id="list_sale"]/div[@class="card_block" and contains(.,"Дополнительная")]/descendant-or-self::*/text()')

        loader.add_value('updated', datetime.utcnow().isoformat())
        loader.add_xpath('postDate', u'//div[@id="list_sale"]//div[@class="card_date" and contains(.,"добавлено")]/descendant-or-self::*/text()', re=u'добавлено\s+(.*)')

        price_text = ''.join(response.xpath(u'//div[@id="list_sale"]//div[@class="card_price" and contains(.,"руб")]/text()').extract())
        # Lazy %-formatting: the text is unicode, and ``str()`` on it can raise
        # UnicodeEncodeError on Python 2.
        self.logger.debug("1. price %s", price_text)
        price_text = price_text.replace(',', '.')
        self.logger.debug("3. price %s", price_text)
        # Presumably the site quotes prices in thousands - TODO confirm.
        # round() keeps binary float error from truncating the result by one.
        price = str(int(round(float(price_text) * 1000)))
        self.logger.debug("4. price %s", price)
        loader.add_value('price', price)

        yield loader.load_item()
コード例 #2
0
ファイル: irr.py プロジェクト: olegarch/hodim
    def parse_item_page(self, response):
        """Build an item from an irr listing page and yield it.

        Fixed XPath expressions supply the description, street, price and post
        date; the remaining fields are looked up by their Russian property
        label through ``self.extract_property_string``.

        Args:
            response: the downloaded listing page handed to the loader.

        Yields:
            The item returned by ``ApartmentLoader.load_item()``.
        """
        loader = ApartmentLoader(Apartment(), response)
        loader.add_value('url', response.url)

        # Fields read straight from fixed locations. 'street' has two
        # candidate selectors; both are registered, in this order.
        fixed_xpaths = (
            ('description', '//meta[@name="description"]/@content'),
            ('street', '//i[contains(@class,"irri-map")]/following-sibling::span/text()'),
            ('street', '//i[contains(@class,"icon_spot")]/following-sibling::div/text()'),
            ('price', '//div[contains(@class,"productPagePrice")]/text()'),
        )
        for field, xpath in fixed_xpaths:
            loader.add_xpath(field, xpath)

        loader.add_value('updated', datetime.utcnow().isoformat())

        # Two candidate selectors for the creation date as well.
        post_date_xpaths = (
            '//div[@class="advertHeader"]/div[@class="createDate"]/text()',
            '//div[@class="productPage_headerColumn"]/div[@class="productPage__createDate"]/text()',
        )
        for xpath in post_date_xpaths:
            loader.add_xpath('postDate', xpath)

        # Item field -> label of the property on the page. The helper's result
        # is passed to add_xpath, so it presumably returns the XPath of the
        # value shown next to that label - confirm against its definition.
        labelled_properties = (
            ('m2', u"Общая площадь:"),
            ('kitchenm2', u"Площадь кухни:"),
            ('restm2', u"Жилая площадь:"),
            ('floor', u"Этаж:"),
            ('totfloors', u"Этажей в здании:"),
            ('rooms', u"Комнат в квартире:"),
            ('district', u"Район города:"),
            ('rennovation', u"Ремонт:"),
            ('builtDate', u"Год постройки:"),
            ('water', u"Система водоснабжения:"),
            ('heating', u"Система отопления:"),
            ('wc', u"Санузел:"),
            ('walls', u"Материал стен:"),
            ('ceilings', u"Высота потолков:"),
            ('balcony', u"Балкон"),
            ('security', u"Охрана"),
        )
        for field, label in labelled_properties:
            loader.add_xpath(field, self.extract_property_string(label))

        yield loader.load_item()
コード例 #3
0
ファイル: avito.py プロジェクト: olegarch/hodim
 def parse_item_page(self, response):
     """Build an item from an avito listing page and yield it.

     Rooms, area and floor numbers are all cut out of the page heading with
     regular expressions; price, address, description and post date come
     from their own elements.

     Args:
         response: the downloaded listing page; ``response.url`` and
             ``response.xpath()`` are used.

     Yields:
         The item returned by ``ApartmentLoader.load_item()``.
     """
     loader = ApartmentLoader(Apartment(), response)
     loader.add_value('url', response.url)

     # The heading packs type, area and floor into one line, e.g.
     #   "1-к квартира, 42 м², 12/18 эт."  (1-room flat, 42 m2, floor 12 of 18)
     #   "Студия, 28 м², 2/5 эт."          (studio, 28 m2, floor 2 of 5)
     title_xpath = '//h1[@itemprop="name" and @class="h1"]/text()'
     loader.add_xpath('rooms', title_xpath, re=u'(Студия)')
     loader.add_xpath('rooms', title_xpath, re=u'(?:(\d+)-к квартира)')
     loader.add_xpath('m2', title_xpath, re=u',\s+(\d*\.\d+|\d+)\s+м²,')
     loader.add_xpath('floor', title_xpath, re=u'\s+(\d+)/\d+\s+эт')
     loader.add_xpath('totfloors', title_xpath, re=u'\s+\d+/(\d+)\s+эт')
     loader.add_xpath('price', '//span[@itemprop="price"]/text()')

     loader.add_xpath('city', '//meta[@itemprop="addressLocality"]/@content')
     # District and street are both fed from the same streetAddress text.
     address_xpath = '//span[@itemprop="streetAddress"]/text()'
     loader.add_xpath('district', address_xpath)
     loader.add_xpath('street', address_xpath)

     description_xpath = '//div[@class="description description-text"]/descendant::*/text()'
     # Diagnostic dump goes to the spider's debug log rather than stdout.
     description = ' '.join(response.xpath(description_xpath).extract())
     self.logger.debug("description %s", description)
     loader.add_xpath('description', description_xpath)

     loader.add_value('updated', datetime.utcnow().isoformat())
     # NOTE(review): the contains() selector also matches every element the
     # exact-class selector matches, so a date can be collected twice; whether
     # that matters depends on the loader's output processor - confirm.
     loader.add_xpath('postDate', '//div[@class="item-subtitle"]/text()')
     loader.add_xpath('postDate', '//div[contains(@class,"item-subtitle")]/text()')
     yield loader.load_item()