def processItem(self, response):
    """Extract the listing title and price from the page and yield an OlxItem."""
    item = OlxItem()
    item['title'] = response.xpath('//span[@id="titletextonly"]/text()').get()
    item['price'] = response.xpath('//span[@class="price"]/text()').get()
    yield item
def parse_detail_page(self, response):
    """Yield an OlxItem carrying the title, price and URL of a detail page."""
    item = OlxItem()
    # extract()[0] intentionally raises IndexError if the node is missing.
    item['title'] = response.css('h1::text').extract()[0].strip()
    item['price'] = response.css('.pricelabel > strong::text').extract()[0]
    item['url'] = response.url
    yield item
def parse_detail_page(self, response):
    """Scrape title, price and URL from an ad detail page into an OlxItem."""
    heading = response.css("h1::text").extract()[0]
    cost = response.css(".pricelabel > strong::text").extract()[0]
    item = OlxItem()
    item["title"] = heading.strip()
    item["price"] = cost
    item["url"] = response.url
    yield item
def parse_detail_page(self, response):
    """Parse an OLX ad detail page and yield an OlxItem.

    Fix: removed leftover debug ``print`` calls ('--******__ entrou no
    detalhe', 'ah caraio'); use ``self.logger.debug`` for tracing instead.

    NOTE(review): the price selector has no ``::text`` pseudo-element, so
    ``item['price']`` holds the raw '.xxxx-large' HTML node — presumably
    cleaned downstream; confirm.
    """
    title = response.css('h1::text').extract()[0].strip()
    price = response.css('.pricelabel > .xxxx-large').extract()[0]
    item = OlxItem()
    item['title'] = title
    item['price'] = price
    item['url'] = response.url
    yield item
def parse_item(self, response):
    """Parse one OLX offer page into an OlxItem, stopping the crawl once
    ``OlxSpider.TARGET`` items have been scraped.

    Fix: the image fallback selector was missing ``.getall()``, so
    ``item['pics_urls']`` could contain a SelectorList of Selector objects
    instead of URL strings.
    """
    if OlxSpider.TARGET == OlxSpider.count:
        # Abort the crawl when the configured item target is reached.
        raise CloseSpider('Exceeded maximum items')

    # --- Data cleaning ---
    # Price text looks like "<amount> <currency>"; split on the last space.
    cost = response.css('strong.pricelabel__value::text').get()
    price = cost[:cost.rfind(' ')]
    currency = cost[cost.rfind(' ') + 1:]

    # Pair each parameter label with its value.
    params = dict(zip(
        response.css('a.offer-details__param--url span::text').getall(),
        response.css('a.offer-details__param--url strong::text').getall()))

    description = response.xpath('//div[@id="textContent"]//text()').extract()
    # NOTE(review): these replace the literal two-character sequences
    # "\n"/"\r"/"\t", not actual control characters — confirm intended.
    res_desc = ''.join(
        part.strip().replace('\\n', ' ').replace('\\r', ' ').replace('\\t', ' ')
        for part in description)

    pics = response.css('ul#descGallery li a::attr(href)').getall()
    if not pics:
        # BUGFIX: original fallback lacked .getall() and stored selectors.
        pics = response.css('div#descImage img::attr(src)').getall()

    delivery = response.css('span.olx-delivery-badge-icon-wrapper').get()
    if delivery is not None:
        delivery = 'Есть Доставка OLX'
    else:
        delivery = 'Нет Доставка OLX'

    # --- Populate the item ---
    item = OlxItem()
    item['product_url'] = response.url
    # Fixed-offset slice strips decorative prefix/suffix from the <h1> text;
    # presumably site-specific markup — verify against a live page.
    item['name'] = (response.css('div h1::text').get())[9:-3]
    item['category'] = response.css('td.middle ul li span::text').getall()
    item['price'] = float(price)
    item['price_currency'] = currency
    item['date_of_creation'] = response.css('em strong::text').get()
    item['count_views'] = int(response.css('span.offer-bottombar__counter strong::text').get())
    item['prod_id'] = response.css('ul.offer-bottombar__items li>strong::text').get()
    item['seller_name'] = response.css('div.offer-user__actions a::text').get().strip()
    item['seller_adress'] = response.css('address p::text').get()
    item['state'] = params.get('Состояние', 'Not Value')
    item['description'] = res_desc
    item['pics_urls'] = pics
    item['olx_delivery'] = delivery
    OlxSpider.count += 1
    yield item
def parse_detail_page(self, response):
    """Parse an OLX item detail page and yield an OlxItem with title,
    price, URL and the first gallery image.

    Fix: removed the leftover debug banner ``print`` — use
    ``self.logger.debug`` for tracing instead.
    """
    title = response.css('h1[data-aut-id="itemTitle"]::text').extract()[0].strip()
    price = response.css('span[data-aut-id="itemPrice"]::text').extract()[0]
    image = response.css('figure *::attr("src")').extract_first()
    item = OlxItem()
    item['title'] = title
    item['price'] = price
    item['url'] = response.url
    item['image'] = image
    yield item
def parse_item_page(self, response):
    """Build and return an OlxItem for a single listing page.

    The uid is the trailing dash-separated token of the URL path; the
    has_backyard flag is a substring test on the ad description.
    """
    parsed = urlparse(response.url)
    uid = parsed.path.split('-')[-1]
    text_nodes = response.xpath("//div[@id='textContent']//text()").extract()
    description = ' '.join(text_nodes)
    return OlxItem(
        uid=uid,
        title=response.xpath('//title/text()').extract_first(),
        price=response.xpath(
            "//strong[contains(@class, 'pricelabel')]//text()").extract_first(),
        url=response.url,
        description=description,
        has_backyard='curte' in description,
    )
def parse(self, response):
    """Walk the listing table, build a partial OlxItem per row, then request
    the detail page with the item attached via ``meta``.

    Fix: renamed local ``id`` — it shadowed the builtin ``id()``.
    """
    rows = response.css('table[summary="Anunt"]')
    for row in rows:
        # Promoted listings use a different anchor class; fall back to it.
        title = (row.css('h3>a.detailsLink>strong::text').extract_first()
                 or row.css('h3>a.detailsLinkPromoted>strong::text').extract_first()).strip()
        price = row.css('.price > strong::text').extract_first().strip()
        href = row.css(
            'h3>a.detailsLinkPromoted::attr(href),a.detailsLink::attr(href)'
        ).extract_first()
        # URL ends in "...-<id>.html"; pull out the trailing id token.
        ad_id = href.split('-')[-1].split('.')[0]
        item = OlxItem()
        item['id'] = ad_id
        item['title'] = title
        item['price'] = price
        item['url'] = href
        yield scrapy.Request(href, callback=self.parse_detail_page,
                             meta={'item': item})
def parse_detail_page(self, response):
    """Scrape product name, discount percentage and old/new prices from a
    product detail page and yield them as an OlxItem.

    Fix: removed seven leftover debug ``print`` calls and the
    commented-out dead selector code.
    """
    item = OlxItem()
    item['title'] = response.css('.product-productname ::text').extract()[0].strip()
    item['percentage'] = response.css('.view-percent-price > strong ::text').extract()[0].strip()
    item['old_price'] = response.css('.old-price > .price ::text').extract()[0].strip()
    item['new_price'] = response.css('.special-price > .price ::text').extract()[0].strip()
    item['url'] = response.url
    yield item
def grab(self, response):
    """Scrape a car-ad detail page into an OlxItem.

    Single-valued fields default to the string 'NaN' when the selector
    matches nothing (via extract_first('NaN')); multi-valued fields
    (options, condition, multimedia, security, other) become lists, or
    'NaN' when empty.  The tr:contains(...) labels are the Russian field
    names used by this OLX market.
    """
    item = OlxItem()
    item['title'] = response.css(
        'div.offer-titlebox h1::text').extract_first().strip()
    item['address'] = response.css(
        'a.show-map-link > strong::text').extract_first().strip()
    # Date is the second-to-last comma-separated chunk of the joined <em> text.
    item['pub_date'] = ''.join(
        response.css('em::text').extract()).strip().split(',')[-2].strip()
    item['mark'] = response.css('table.item > tr:contains("Марка") a::text'
                                ).extract_first('NaN').strip()
    item['model'] = response.css(
        'table.item > tr:contains("Модель") a::text').extract_first(
            'NaN').strip()
    item['year'] = response.css(
        'table.item > tr:contains("Год выпуска") strong::text'
    ).extract_first('NaN').strip()
    # Drop the trailing unit token (e.g. "км") and re-join the digit groups.
    item['mileage'] = ''.join(
        response.css('table.item > tr:contains("Пробег") strong::text').
        extract_first('NaN').strip().split(' ')[:-1])
    item['body_type'] = response.css(
        'table.item > tr:contains("Тип кузова") strong > a::text'
    ).extract_first('NaN').strip()
    item['color'] = response.css(
        'table.item > tr:contains("Цвет") strong > a::text').extract_first(
            'NaN').strip()
    # Extra options: list of stripped anchor texts, or 'NaN' when absent.
    opt = [
        i.strip() for i in response.css(
            'table.item > tr:contains("Доп. опции") strong > a::text').
        extract()
    ]
    if opt:
        item['add_opt'] = opt
    else:
        item['add_opt'] = 'NaN'
    item['fuel'] = response.css(
        'table.item > tr:contains("Вид топлива") strong > a::text'
    ).extract_first('NaN').strip()
    # Same trailing-unit strip as mileage, but split on any whitespace.
    item['engine_vol'] = ''.join(
        response.css(
            'table.item > tr:contains("Объем двигателя") strong::text').
        extract_first('NaN').strip().split()[:-1])
    item['gearbox'] = response.css(
        'table.item > tr:contains("Коробка передач") strong > a::text'
    ).extract_first('NaN').strip()
    cond = [
        i.strip() for i in response.css(
            'table.item > tr:contains("Состояние машины") strong > a::text'
        ).extract()
    ]
    if cond:
        item['condition'] = cond
    else:
        item['condition'] = 'NaN'
    item['cleared'] = response.css(
        'table.item > tr:contains("Растаможена") strong > a::text'
    ).extract_first('NaN').strip()
    mult = [
        i.strip() for i in response.css(
            'table.item > tr:contains("Мультимедиа") strong > a::text').
        extract()
    ]
    if mult:
        item['multimedia'] = mult
    else:
        item['multimedia'] = 'NaN'
    sec = [
        i.strip() for i in response.css(
            'table.item > tr:contains("Безопасность") strong > a::text').
        extract()
    ]
    if sec:
        item['security'] = sec
    else:
        item['security'] = 'NaN'
    oth = [
        i.strip() for i in response.css(
            'table.item > tr:contains("Прочее") strong > a::text').extract(
            )
    ]
    if oth:
        item['other'] = oth
    else:
        item['other'] = 'NaN'
    item['owner_note'] = ' '.join(
        [i.strip() for i in response.css('#textContent::text').extract()])
    item['views'] = response.css(
        'div.pdingtop10 > strong::text').extract_first('NaN').strip()
    # Price text is "<digit groups> <currency>": drop the last token for the
    # amount, keep it for the currency.
    # NOTE(review): unlike the fields above, these two extract_first() calls
    # have no 'NaN' default — a missing price node raises AttributeError.
    item['price'] = ''.join(
        response.css('strong.xxxx-large::text').extract_first().strip().
        split()[:-1])
    item['currency'] = response.css(
        'strong.xxxx-large::text').extract_first().strip().split()[-1]
    yield item
def get_item_data(self, response):
    """Scrape an OLX.ua ad page into an OlxItem and yield it; when a phone
    token can be extracted, also yield an AJAX request that resolves the
    seller's phone number via ``self.get_phone_numbers``.

    Fixes: replaced the bare ``except:`` with the exceptions the token/uid
    extraction can actually raise; guarded the ``[2:]`` slice on a
    possibly-None date string; removed the dead ``phone_numbers``
    accumulator; made the text concatenation None-safe.
    """
    item = OlxItem()
    try:
        # The page embeds "var phoneToken = '<token>'"; [18:] skips the prefix.
        token = re.search("var phoneToken = '[a-zA-Z0-9]+",
                          response.text).group(0)[18:]
        data = response.xpath(
            '//ul[@id="contact_methods_below"]/li/@class').get()
        uid = data.strip(
            'link-phone clr rel atClickTracking contact-a activated')
        uid = json.loads(uid.replace("'", '"'))['id']
    except (AttributeError, KeyError, ValueError):
        # re.search -> None, missing node, bad JSON or missing 'id' key.
        item['phone_number'] = None
    else:
        url = f'https://www.olx.ua/uk/ajax/misc/contact/phone/{uid}/?pt={token}'
        yield scrapy.Request(url=url,
                             callback=self.get_phone_numbers,
                             cb_kwargs=dict(item_obj=item))

    try:
        photo_urls = []
        for node in response.xpath('//ul[@id="descGallery"]/li'):
            photo_urls.append(node.xpath('./a/@href').get())
        item['photo_urls'] = photo_urls
    except AttributeError:
        item['photo_urls'] = None

    # Price: arranged layout first, then the title-box fallback.
    price = get_item_or_none(
        response.xpath(
            '//strong[@class="pricelabel__value arranged"]/text()').get())
    if not price:
        price = get_item_or_none(
            response.xpath(
                '//div[@class="offer-titlebox__price"]/div/strong/text()').
            get())
    item['price'] = price

    # Seller name: linked profile first, then plain text.
    user_name = get_item_or_none(
        response.xpath(
            '//div[@class="offer-user__actions"]/h4/a/text()').get())
    if not user_name:
        user_name = get_item_or_none(
            response.xpath(
                '//div[@class="offer-user__actions"]/h4/text()').get())
    item['user_name'] = user_name

    user_url = get_item_or_none(
        response.xpath(
            '//div[@class="offer-user__actions"]/h4/a/@href').get())
    if not user_url:
        user_url = get_item_or_none(
            response.xpath('//ul[@id="contact_methods"]/li/a/@href').get())
    item['user_url'] = user_url

    item['ad_url'] = response.url
    item['description'] = get_item_or_none(
        response.xpath('//div[@class="clr lheight20 large"]/text()').get())
    item['title'] = get_item_or_none(
        response.xpath('//div[@class="offer-titlebox"]/h1/text()').get())
    item['address'] = get_item_or_none(
        response.xpath(
            '//div[@class="offer-user__address"]/address/p/text()').get())

    # [2:] strips a two-character decorative prefix from the date text;
    # guard against the node being absent (helper may return None).
    date_time = get_item_or_none(
        response.xpath(
            '//li[@class="offer-bottombar__item"]/em/strong/text()').get())
    item['date_time'] = date_time[2:] if date_time else date_time

    item['ad_number'] = get_item_or_none(
        response.xpath(
            '//li[@class="offer-bottombar__item"]/strong/text()').get())

    # Fallback: mine a phone number from the free-text fields when the
    # AJAX path above did not run (or yielded nothing).
    if 'phone_number' not in item or not item['phone_number']:
        all_text_data = ''.join(
            s for s in (item['user_url'], item['user_name'],
                        item['description']) if s)
        numbers = find_phone_number(all_text_data)
        item['phone_number'] = [n for n in numbers] if numbers else None
    yield item
def parse_detail(self, response):
    """Parse a Brazilian OLX real-estate ad into an OlxItem (Portuguese
    field names preserved: titulo, preco, descricao, ...).

    Fix: regex patterns are now raw strings — ``"\\d"``/``"\\w"``/``"R\\$"``
    in plain strings are invalid escape sequences (DeprecationWarning,
    future SyntaxError).  Also dropped the unused ``enumerate`` indexes.
    """
    imovel = OlxItem()
    imovel['url'] = response.url
    imovel['titulo'] = response.xpath(
        'normalize-space(//h1[contains(@id,"ad_title")]//.)'
    ).extract_first()

    # "Inserido em: <day> <month-name>" -> build a date in the current year
    # via the converteMes month-name lookup table.
    data = response.xpath(
        'normalize-space(//div[contains(@class,"OLXad-date")]//p)').re(
            r"Inserido em: (\d*) (\w*)")
    imovel['data'] = date(date.today().year, self.converteMes[data[1]],
                          int(data[0]))

    # Price: "R$ <amount>"; keep 0 when absent, otherwise digits only.
    preco = response.xpath(
        'normalize-space(//span[contains(@class,"actual-price")])').re(
            r"R\$ (.*)")
    preco = (preco and preco[0]) or 0
    if preco != 0:
        imovel['preco'] = int(re.sub('[^0-9]', '', preco))
    else:
        imovel['preco'] = preco

    imovel['descricao'] = response.xpath(
        'normalize-space(//div[contains(@class,"OLXad-description")]//p)'
    ).extract_first()

    # Details list: term/description pairs dispatched by the term label.
    detalhes = response.xpath(
        '//div[contains(@class, "OLXad-details")]//li[contains(@class, "item")]'
    )
    for detalhe in detalhes:
        atributo = detalhe.xpath(
            'normalize-space(.//span[contains(@class, "term")]/text())'
        ).extract_first()
        valor = detalhe.xpath(
            'normalize-space(.//strong[contains(@class, "description")]/text())'
        ).extract_first()
        if atributo == 'Tipo:':
            imovel['tipo'] = valor
        elif atributo == 'Área útil:':
            imovel['area_util'] = int(re.sub('[^0-9]', '', valor))
        elif atributo == 'Área construída:':
            imovel['area_construida'] = int(re.sub('[^0-9]', '', valor))
        elif atributo == 'Quartos:':
            imovel['n_quartos'] = valor
        elif atributo == 'Vagas na garagem:':
            imovel['vagas_garagem'] = valor
        elif atributo == 'Condomínio:':
            imovel['condominio'] = valor

    # Location list: same term/description structure.
    localizacao = response.xpath(
        '//div[contains(@class, "OLXad-location")]//li[contains(@class, "item")]'
    )
    for loc in localizacao:
        atributo = loc.xpath(
            'normalize-space(.//span[contains(@class, "term")]/text())'
        ).extract_first()
        valor = loc.xpath(
            'normalize-space(.//strong[contains(@class, "description")]/text())'
        ).extract_first()
        if atributo == 'Município:':
            imovel['municipio'] = valor
        elif atributo == 'CEP do imóvel:':
            imovel['cep'] = valor
        elif atributo == 'Bairro:':
            imovel['bairro'] = valor

    imovel['id'] = response.xpath(
        'normalize-space(//div[contains(@class, "OLXad-id")]//p//strong)'
    ).extract_first()
    yield imovel
class MySpider(CrawlSpider):
    """Crawl OLX Mumbai real-estate listings and emit one OlxItem per ad.

    NOTE(review): this is Python 2 code (``print`` statements below).
    NOTE(review): ``item`` is a single class-level OlxItem shared by every
    request; with concurrent responses, fields from different ads can
    overwrite each other — should be a fresh per-response item. Confirm
    before relying on the output.
    """
    name = "olxMumbai"
    allowed_domains = ['www.olx.in']
    start_urls = ['https://www.olx.in/mumbai/real-estate/']
    item = OlxItem()

    def parse(self, response):
        """List page: follow Apartment/Shop/House ads, then the next page."""
        hxs = Selector(response)
        data = hxs.xpath(
            '//*[@id="offers_table"]/tbody/tr/td[contains(@class,"offer")]')
        for i in data:
            # Category label shown under each offer row.
            typ = i.xpath('table/tbody/tr/td[@valign="top"]/p/small/text()'
                          ).extract_first().strip()
            if (('Apartments' in typ) or ('Shops' in typ)
                    or ('Houses' in typ)):
                url = i.xpath('table/tbody/tr/td[@valign="top"]/h3/a/@href'
                              ).extract_first()
                yield Request(url, callback=self.parse1, dont_filter=True)
        # Paginate while the last pager cell still reads "Next page".
        if 'Next page' in response.xpath(
                '//div[@class="pager rel clr"]/span[last()]/a/span/text()'
        ).extract_first():
            next_url = response.xpath(
                '//div[@class="pager rel clr"]/span[last()]/a/@href'
            ).extract_first()
            yield Request(next_url, callback=self.parse)

    def parse1(self, response):
        """Detail page: fill the shared item and yield it.

        Sets string defaults first, then overwrites from the page, then
        derives quality1..quality4 completeness scores.
        """
        hxs = Selector(response)
        ''' Assigning default value '''
        self.item['Selling_price'] = '0'
        self.item['Monthly_Rent'] = '0'
        self.item['lat'] = '0'
        self.item['longt'] = '0'
        self.item['Bua_sqft'] = '0'
        self.item['carpet_area'] = '0'
        self.item['price_per_sqft'] = '0'
        self.item['management_by_landlord'] = 'None'
        self.item['areacode'] = 'None'
        self.item['mobile_lister'] = 'None'
        self.item['google_place_id'] = 'None'
        self.item['Launch_date'] = 'None'
        self.item['Possession'] = '0'
        self.item['age'] = 'None'
        self.item['address'] = 'None'
        self.item['price_on_req'] = 'false'
        self.item['sublocality'] = 'None'
        self.item['config_type'] = 'None'
        self.item['listing_date'] = dt.now().strftime('%m/%d/%Y %H:%M:%S')
        self.item['updated_date'] = self.item['listing_date']
        self.item['txn_type'] = 'None'
        self.item['property_type'] = 'None'
        self.item['Building_name'] = 'None'
        self.item['locality'] = 'None'
        self.item['price_per_sqft'] = '0'
        self.item['Bua_sqft'] = '0'
        self.item['Status'] = 'None'
        self.item['listing_by'] = 'None'
        self.item['name_lister'] = 'None'
        self.item['Details'] = 'None'
        self.item['city'] = 'mumbai'
        self.item['platform'] = 'olx'
        self.item['data_id'] = response.xpath(
            '//span[@class="rel inlblk"]/text()').extract_first().strip()
        # Coordinates are embedded in the map container's class attribute.
        lat_long = response.xpath(
            '//div[@id="mapcontainer"]/@class').extract_first()
        self.item['lat'] = re.findall(" lat: '([0-9.]+)'", lat_long)[0]
        self.item['longt'] = re.findall(" lon: '([0-9.]+)'", lat_long)[0]
        self.item['locality'] = response.xpath(
            '//strong[@class="c2b small"]/text()').extract_first().strip()
        # Breadcrumb item 3 distinguishes Sale vs Rent ("...ale"/"...ent").
        typ = response.xpath(
            '//*[@id="breadcrumbTop"]/tr/td/ul/li[3]/a/span/text()'
        ).extract_first().strip()
        if 'ale' in typ:
            self.item['txn_type'] = 'Sale'
        if 'ent' in typ:
            self.item['txn_type'] = 'Rent'
        # The same price node feeds either Selling_price or Monthly_Rent.
        if 'ale' in self.item['txn_type']:
            self.item['Selling_price'] = response.xpath(
                '//strong[@class="xxxx-large margintop7 inlblk not-arranged"]/text()'
            ).extract_first()
            self.item['Monthly_Rent'] = '0'
        if 'ent' in self.item['txn_type']:
            self.item['Monthly_Rent'] = response.xpath(
                '//strong[@class="xxxx-large margintop7 inlblk not-arranged"]/text()'
            ).extract_first()
            self.item['Selling_price'] = '0'
        # Breadcrumb item 4 maps to Residential/Commercial.
        prp_typ = response.xpath(
            '//*[@id="breadcrumbTop"]/tr/td/ul/li[4]/a/span/text()'
        ).extract_first().strip()
        if (('Apartments' in prp_typ) or ('Houses' in prp_typ)):
            self.item['property_type'] = 'Residential'
        if ('Shops' in prp_typ):
            self.item['property_type'] = 'Commercial'
        # BHK config: try the "...room" anchor, then the "...more" anchor.
        # NOTE(review): bare excepts mask any failure here, not just a
        # missing anchor.
        try:
            conf = response.xpath(
                '//a[contains(@title,"room")]/text()').extract_first().strip()
            if (not conf == None):
                self.item['config_type'] = re.findall('[0-9]',
                                                      conf)[0] + 'BHK'
        except:
            try:
                conf1 = response.xpath('//a[contains(@title,"more")]/text()'
                                       ).extract_first().strip()
                if (not conf1 == None):
                    self.item['config_type'] = re.findall('[0-9]',
                                                          conf1)[0] + 'BHK'
            except:
                print 'No config ' + ' -->>' + str(response.url)
                self.item['config_type'] = 'None'
        # Normalize the posted-date text ("Added ... on/at <date>").
        dates = response.xpath(
            '//span[@class="pdingleft10 brlefte5"]/text()').extract()
        date1 = ' '.join(
            re.findall('[\S]+',
                       [date for date in dates if re.findall('[\w]', date)
                        ][0])).replace(',', '').replace('on ', '').replace(
                            'at ', '').replace('Added ', '')
        # listing_date as m/d/Y H:M:S from the fuzzy date text.
        if 'terday' in date1:
            self.item['listing_date'] = str(
                (d.today() - timedelta(days=1)).month) + "/" + str(
                    (d.today() - timedelta(days=1)).day) + "/" + str(
                        (d.today() - timedelta(days=1)).year) + ' 00:00:00'
        elif 'oday' in date1:
            self.item['listing_date'] = str(d.today().month) + '/' + str(
                d.today().day) + '/' + str(d.today().year) + ' 00:00:00'
        elif ((' am' in date1) or (' pm' in date1)):
            self.item['listing_date'] = str(d.today().month) + '/' + str(
                d.today().day) + '/' + str(
                    d.today().year) + ' ' + date1.replace(' am', '').replace(
                        ' pm', '') + ':00'
        else:
            self.item['listing_date'] = dt.strftime(
                dt.strptime(date1, '%d %b'), '%m/%d') + '/' + str(
                    d.today().year) + ' 00:00:00'
        self.item['updated_date'] = self.item['listing_date']
        # Built-up area: first "<n> ft" entry, comma-stripped.
        try:
            area = response.xpath('//strong[@class="block"]/text()').extract()
            get_area = [sqf for sqf in area if ' ft' in sqf]
            if get_area:
                self.item['Bua_sqft'] = re.findall('[0-9,]+',
                                                   get_area[0].strip())[0]
                if ',' in self.item['Bua_sqft']:
                    self.item['Bua_sqft'] = self.item['Bua_sqft'].replace(
                        ',', '')
        except:
            print 'No Sqft -->>' + str(response.url)
        self.item['scraped_time'] = dt.now().strftime('%m/%d/%Y %H:%M:%S')
        # quality4: 1 when price-ish + sqft + building + lat are all present,
        # 0.5 when only some triples are, else 0.
        if (((not self.item['Monthly_Rent'] == '0') and
             (not self.item['Bua_sqft'] == '0') and
             (not self.item['Building_name'] == 'None') and
             (not self.item['lat'] == '0')) or
            ((not self.item['Selling_price'] == '0') and
             (not self.item['Bua_sqft'] == '0') and
             (not self.item['Building_name'] == 'None') and
             (not self.item['lat'] == '0')) or
            ((not self.item['price_per_sqft'] == '0') and
             (not self.item['Bua_sqft'] == '0') and
             (not self.item['Building_name'] == 'None') and
             (not self.item['lat'] == '0'))):
            self.item['quality4'] = 1
        elif (((not self.item['price_per_sqft'] == '0') and
               (not self.item['Building_name'] == 'None') and
               (not self.item['lat'] == '0')) or
              ((not self.item['Selling_price'] == '0') and
               (not self.item['Bua_sqft'] == '0') and
               (not self.item['lat'] == '0')) or
              ((not self.item['Monthly_Rent'] == '0') and
               (not self.item['Bua_sqft'] == '0') and
               (not self.item['lat'] == '0')) or
              ((not self.item['Selling_price'] == '0') and
               (not self.item['Bua_sqft'] == '0') and
               (not self.item['Building_name'] == 'None')) or
              ((not self.item['Monthly_Rent'] == '0') and
               (not self.item['Bua_sqft'] == '0') and
               (not self.item['Building_name'] == 'None'))):
            self.item['quality4'] = 0.5
        else:
            self.item['quality4'] = 0
        # quality1: core fields (building, date, txn, type, some price).
        if ((not self.item['Building_name'] == 'None') and
            (not self.item['listing_date'] == '0') and
            (not self.item['txn_type'] == 'None') and
            (not self.item['property_type'] == 'None') and
            ((not self.item['Selling_price'] == '0') or
             (not self.item['Monthly_Rent'] == '0'))):
            self.item['quality1'] = 1
        else:
            self.item['quality1'] = 0
        # quality2: any launch/possession info.
        if ((not self.item['Launch_date'] == '0') or
            (not self.item['Possession'] == '0')):
            self.item['quality2'] = 1
        else:
            self.item['quality2'] = 0
        # quality3: any lister contact info.
        if ((not self.item['mobile_lister'] == 'None') or
            (not self.item['listing_by'] == 'None') or
            (not self.item['name_lister'] == 'None')):
            self.item['quality3'] = 1
        else:
            self.item['quality3'] = 0
        yield self.item
def parse(self, response):
    """Parse a product-gallery page: yield one OlxItem per SKU card, then
    follow the pagination link if it differs from the previous page.

    Fixes: removed ~15 leftover debug ``print`` calls and commented-out
    dead code; each CSS selector is now evaluated once instead of twice;
    inverted the ``if ...: pass / else:`` pagination guard.
    """
    for card in response.css('.sku.-gallery'):
        item = OlxItem()
        item['title'] = '-'.join(card.css('.link>.title ::text').extract())
        item['url'] = card.css('.link ::attr(href)').extract()[0]

        # Discount badge is optional; 'percentage' stays unset when absent
        # (matches the original behavior).
        percent = card.css(
            '.link>.price-container.clearfix>.sale-flag-percent ::text'
        ).extract()
        if percent:
            item['percentage'] = percent[0]

        # The third text node of the price box holds the figure; the amount
        # is split into two digit groups (e.g. "1,234") that are re-joined.
        old_text = card.css(
            '.link>.price-container.clearfix>.price-box.ri>.price.-old ::text'
        ).extract()[2].strip()
        groups = re.findall(r'\d+', old_text)
        item['old_price'] = groups[0] + groups[1]

        new_text = card.css(
            '.link>.price-container.clearfix>.price-box.ri>.price ::text'
        ).extract()[2].strip()
        groups = re.findall(r'\d+', new_text)
        item['new_price'] = groups[0] + groups[1]

        yield item

    # Follow pagination only when the last href advances past the page we
    # just handled; otherwise we would loop forever on the final page.
    next_page = response.css(
        '.pagination>.osh-pagination.-horizontal ::attr(href)').extract()[-1]
    if self.prev_page != next_page:
        self.prev_page = next_page
        yield scrapy.Request(next_page, self.parse, dont_filter=True)