Example 1
class LivrosItem(scrapy.Item):
    titulo = scrapy.Field()
    preco = scrapy.Field()
    codigo = scrapy.Field()
    pagina = scrapy.Field()
    editora = scrapy.Field()
    autor = scrapy.Field()
    link = scrapy.Field()
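Item fields are declared with scrapy.Field(), as above. A spider callback can then instantiate and fill the item; a minimal sketch, with XPath selectors that are illustrative assumptions rather than the original project's:

 def parse_livro(self, response):
     # Populate the declared LivrosItem; these selectors are assumed
     # for a typical book page, not taken from the original spider.
     item = LivrosItem()
     item['titulo'] = response.xpath('//h1/text()').get()
     item['preco'] = response.xpath('//p[@class="price_color"]/text()').get()
     item['link'] = response.url
     yield item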
Example 2
 def parse_item(self, response):
     self.logger.info('Hi, this is an item page! %s', response.url)
     item = scrapy.Item()
     item['id'] = response.xpath('//td[@id="item_id"]/text()').re(
         r'ID: (\d+)')
     item['name'] = response.xpath('//td[@id="item_name"]/text()').get()
     item['description'] = response.xpath(
         '//td[@id="item_description"]/text()').get()
     item['link_text'] = response.meta['link_text']
     return item
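Note that assigning to a field on a bare scrapy.Item(), as this and several later excerpts do, raises KeyError at runtime ("Item does not support field"), because Item only accepts keys that have been declared. A declaration matching the keys used above would look like the following sketch (the class name is an assumption):

import scrapy

class ItemPage(scrapy.Item):
    # Declare every field the callback assigns to.
    id = scrapy.Field()
    name = scrapy.Field()
    description = scrapy.Field()
    link_text = scrapy.Field()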
Example 3
 def parse_item(self, response):
     self.logger.info('Hi, this is an item page! %s', response.url)
     item = scrapy.Item()
     item['id'] = response.xpath('//td[@id="item_id"]/text()').re(
         r'ID: (\d+)')
     item['name'] = response.xpath(
         '//*[@id="ContentPlaceHolder_Header_HeadingBread_TagH1"]').get()
     item['description'] = response.xpath(
         '//td[@id="item_description"]/text()').get()
     return item
Example 4
    def parse(self, response):
        # print(response.css('.module-typeD .list-bullet').extract())
        # print(response.css('.module-typeD').extract())
        # titles = response.xpath('//li[@class="item-title"]/a/text()').extract()

        item = scrapy.Item()
        items = []
        divOB = response.xpath('.//div[@id="rss-outbreaksUS"]').extract()
        for p in divOB:
Example 5
    def parse_item(self, response):
        self.log('Hi, this is an item page! %s' % response.url)

        item = scrapy.Item()
        item['id'] = response.xpath('//td[@id="item_id"]/text()').re(
            r'ID: (\d+)')
        item['name'] = response.xpath('//td[@id="item_name"]/text()').extract()
        item['description'] = response.xpath(
            '//td[@id="item_description"]/text()').extract()
        return item
Example 6
 def parse_item(self, response):
     # TODO normal parser
     self.logger.info("Hi, this is an item page! %s", response.url)
     item = scrapy.Item()
     item["url"] = response.url
     item["title"] = response.xpath(
         '//h1[@data-qaid="title-h1"]/text()').extract_first().strip()
     item["price"] = response.xpath(
         '//span[@data-qaid="product-price"]/text()').extract_first().strip()
     return item
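extract_first() returns None when nothing matches, so the chained .strip() calls above raise AttributeError on pages missing the title or price node. A defensive variant of the same callback, using the selector's documented default argument (a sketch):

 def parse_item(self, response):
     item = scrapy.Item()
     item["url"] = response.url
     # default='' keeps .strip() safe when the node is absent.
     item["title"] = response.xpath(
         '//h1[@data-qaid="title-h1"]/text()').extract_first(default='').strip()
     item["price"] = response.xpath(
         '//span[@data-qaid="product-price"]/text()').extract_first(default='').strip()
     return item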
Example 7
 def parse_item(self, response):
     # i = {}
     # i['domain_id'] = response.xpath('//input[@id="sid"]/@value').extract()
     # i['name'] = response.xpath('//div[@id="name"]').extract()
     # i['description'] = response.xpath('//div[@id="description"]').extract()
     # return i
     self.logger.info('Hi, this is an item page! %s', response.url)
     item = scrapy.Item()
     item['id'] = response.xpath('//td[@id="item_id"]/text()').re(r'ID: (\d+)')
     item['name'] = response.xpath('//td[@id="item_name"]/text()').extract()
     item['description'] = response.xpath('//td[@id="item_description"]/text()').extract()
     return item
Example 8
 def parse_item(self, response):
     self.logger.info('Hi, this is an item page! %s', response.url)
     print(response)
     item = scrapy.Item()
     #item['id'] = response.xpath('//td[@id="item_id"]/text()').re(r'ID: (\d+)')
     #item['name'] = response.xpath('//td[@id="item_name"]/text()').extract()
     #item['description'] = response.xpath('//td[@id="item_description"]/text()').extract()
     #for c in response.xpath('//li'):
     #    item["content"] = c.text()
     #    print(item)
     #    yield item
     item["title"] = response.xpath('//title/text()').get()
     return item
Example 9
 def parse_item(self, response):
     self.logger.info('Hi, this is an item page! %s', response.url)
     item = scrapy.Item()
     item['id'] = response.xpath('//td[@id="usage-link-sentry"]/text()')
     item['name'] = response.xpath(
         '//td[@id="usage-image-sentry"]/text()').get()
     item['description'] = response.xpath(
         '//td[@class="package-githubcommits"]/text()').get()
     item['link_text'] = response.meta['/packages/p/sentry/']
     url = response.xpath('//td[@id="additional_data"]/@href').get()
     return response.follow(url,
                            self.parse_additional_page,
                            cb_kwargs=dict(item=item))
Example 10
 def parse_item(self, response):
     self.logger.info('Hi, this is an item page! %s', response.url)
     item = scrapy.Item()
     item['id'] = response.xpath('//td[@id="item_id"]/text()').re(
         r'ID: (\d+)')
     item['name'] = response.xpath('//td[@id="item_name"]/text()').get()
     item['description'] = response.xpath(
         '//td[@id="item_description"]/text()').get()
     item['link_text'] = response.meta['link_text']
     url = response.xpath('//td[@id="additional_data"]/@href').get()
     return response.follow(url,
                            self.parse_additional_page,
                            cb_kwargs=dict(item=item))
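Both callbacks above hand the partially built item to self.parse_additional_page via cb_kwargs, but that callback is not part of the excerpts. Following the CrawlSpider example in the Scrapy documentation, it typically looks like this (the XPath comes from that documentation, not necessarily from these projects):

 def parse_additional_page(self, response, item):
     # The item arrives through cb_kwargs from the Request above.
     item['additional_data'] = response.xpath(
         '//p[@id="additional_data"]/text()').get()
     return item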
Example 11
    def parse_item(self, response):
        item = scrapy.Item()
        yield item

        for url in response.xpath('//a/@href').extract():
            if url and not url.startswith('#'):
                url = urljoin(response.url, url)

                if url.lower().endswith(tuple(IGNORED_EXTENSIONS)):
                    continue

                url = url.split("?")[0]
                url = url.split("#")[0]

                if any(x in url for x in self.UNWANTED):
                    continue

                elif self.parsed_url.netloc in url and url not in self.LINKS:
                    try:
                        scrapy.http.Request(url,
                                            meta={
                                                'dont_redirect': True,
                                                'download_timeout': 20
                                            })
                    except Exception:  # pragma: no cover
                        continue

                    domain, subdomain, path = MySpider.format_url(url)

                    # partitionKey is agency+org+domain+subdomain+path
                    db_id = (f'{self.agency}+{self.organization}+'
                             f'{domain}+{subdomain}+{path}')

                    msg_body = json.dumps(
                        dict(Agency=self.agency,
                             Organization=self.organization,
                             domain=domain,
                             subdomain=subdomain,
                             tld='gov',
                             routeable_url=url,
                             db_id=db_id))

                    entry = {'Id': '1', 'MessageBody': msg_body}
                    send_message(entry)

                    self.LINKS.add(url)
Example 12
    def test_process_unknown_item(self, pipeline, spider):
        item = scrapy.Item()

        processed_item = pipeline.process_item(item, spider)

        assert processed_item == item
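The test above expects the pipeline to return items of types it does not handle unchanged. A minimal sketch of a process_item with that contract (the pipeline and item classes here are hypothetical):

import scrapy

class HandledItem(scrapy.Item):
    name = scrapy.Field()

class FilteringPipeline:
    def process_item(self, item, spider):
        # Unknown item types pass through untouched, which is exactly
        # what test_process_unknown_item asserts.
        if not isinstance(item, HandledItem):
            return item
        item['name'] = (item.get('name') or '').strip()
        return item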
Example 13
 def parse_item(self, response):
     item = scrapy.Item()
     return item
Example 14
 def parse_item(self, response):
     item = scrapy.Item()
     item['name'] = response.xpath('.//@data-name').extract()[0]
     item['tweet_count'] = response.css(
         '.ProfileNav-value::text').extract()[0]
     return item
Example 15
    def parse_rooms(self, response):
        res = requests.get(response.url, headers={'User-Agent': 'Mozilla/5.0'}).text
        soup = BeautifulSoup(res, 'html.parser')

        # price
        price = soup.find('span', {'class': 'room__sidebar--rate-base'}).text.strip()

        # building name
        building_name = soup.find('h5', {'class': 'room__location--title'}).text.strip()

        # room name
        room_name = soup.find('h1', { 'class': 'room__title'}).text.strip()

        # room features
        room_features = soup.find('div', {'class': 'room__features'}).get_text().split('\n')
        self.remove_empty_strings(room_features)

        print('crawling -> '+ response.url)

        # capacity
        capacity = soup.select('#body > div.global-wrapper > main > section.section-room > div.row > div.columns.small-12.medium-4.room-sidebar > div > div.room__sidebar--form-wrapper.loader-wrapper > div.room__sidebar--icons > ul > li:nth-child(1)')[0].text.strip()

        # location
        location = soup.find('div', { 'class': 'address'}).get_text().strip()

        # city of location
        cityOfLocation = self.get_city_of_location(location)

        item = scrapy.Item()
        item.fields['city'] = cityOfLocation
        item.fields['buildingName'] = building_name
        item.fields['roomName'] = room_name
        item.fields['price'] = float(price)
        item.fields['capacity'] = int(capacity)
        item.fields['roomFeatures'] = room_features
        item.fields['location'] = location
        self.visited_url[response.url] = None

        yield item.fields
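Writing through item.fields[...] as above stores the values in the Item class's shared field-definition dict rather than on the item instance, which is why the method has to yield item.fields instead of the item itself. The idiomatic equivalent declares the fields and assigns on the instance; a sketch with an assumed class name:

import scrapy

class RoomItem(scrapy.Item):
    city = scrapy.Field()
    buildingName = scrapy.Field()
    roomName = scrapy.Field()
    price = scrapy.Field()
    capacity = scrapy.Field()
    roomFeatures = scrapy.Field()
    location = scrapy.Field()

# parse_rooms would then populate and yield an instance:
#     item = RoomItem()
#     item['city'] = cityOfLocation
#     item['price'] = float(price)
#     yield item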
Example 16
 def parse(self, response):
     self.log('get response size: %s' % len(response.body))
     item = scrapy.Item()
     return item
Example 17
 def parse_item(self, response):
     self.logger.info('Hi, this is an item page! %s', response.url)
     item = scrapy.Item()
Example 18
class MercadolibreItem(scrapy.Item):
    titulo = scrapy.Field()
    descripcion = scrapy.Field()
    condiciones = scrapy.Field()
    precio = scrapy.Field()
    color = scrapy.Field()
    disponible = scrapy.Field()
    imagen_url = scrapy.Field()
    ubicacion = scrapy.Field()
    reputacion = scrapy.Field()
    antiguedad_mercadolibre = scrapy.Field()
    ventas_concretadas = scrapy.Field()
    url = scrapy.Field()