def parse(self, response):
        productos = response.css('div.product-tile-inner')

        print(f"\n\n{len(productos)}\n\n")

        for producto in productos:
            existe_producto = len(producto.css('div.detail'))
            if existe_producto > 0:  # only parse tiles that have a detail block
                producto_loader = ItemLoader(item=ProductoFybeca(),
                                             selector=producto)
                producto_loader.add_css('titulo', 'a.name::text')
                producto_loader.add_xpath(
                    'precio_1',
                    'div[contains(@class,"detail")]/div[@class="side"]/div[@class="price-member"]/div/@data-bind'
                )
                producto_loader.add_xpath(
                    'precio_0',
                    'div[contains(@class,"detail")]/div[@class="side"]/div[@class="price"]/@data-bind'
                )
                producto_loader.add_xpath(
                    'imagen',
                    'div[contains(@class,"detail")]/a[contains(@class,"image")]/img[contains(@id,"gImg")]/@src'
                )
                yield producto_loader.load_item()
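The precio_1/precio_0 fields above capture the raw data-bind attribute, not a number. A minimal sketch of an input processor that pulls the price out, assuming the attribute embeds a decimal number (extraer_precio and the exact attribute format are hypothetical):

import re

import scrapy
from itemloaders.processors import MapCompose, TakeFirst

def extraer_precio(valor):
    # pull the first decimal number out of the raw data-bind string
    m = re.search(r'\d+(?:\.\d+)?', valor)
    return float(m.group()) if m else None

class ProductoFybeca(scrapy.Item):
    titulo = scrapy.Field(output_processor=TakeFirst())
    imagen = scrapy.Field(output_processor=TakeFirst())
    precio_0 = scrapy.Field(input_processor=MapCompose(extraer_precio),
                            output_processor=TakeFirst())
    precio_1 = scrapy.Field(input_processor=MapCompose(extraer_precio),
                            output_processor=TakeFirst())

With the processors declared on the item's fields, the spider code above needs no changes; the loader picks them up automatically.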
Example 2
 def parse_row(self, response, row):
     print(row)
     il = ItemLoader(item=IlHospitalLicensesSpiderItem())
     # il.default_input_processor = MapCompose(lambda v: v.strip(), remove_tags, replace_escape_chars)
     il.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
     il.add_value(
         'url',
         'https://data.illinois.gov/dataset/410idph_hospital_directory/resource/9bdedb85-77f3-490a-9bbd-2f3f5f227981'
     )
     il.add_value('sourceName', 'IL_Home_Nursing_Agency_Licenses')
     il.add_value('permit_type', "nursing_home_license")
     name = self._getDBA(row['Home Nursing Agencies'])
     # str.replace is a no-op when ' -' is absent, so no conditional is needed
     company_name = str(name[0]).replace(' -', '')
     # drop the float artifact from zip codes read in as e.g. 62701.0
     zip_code = str(row['Zip'])
     if '.' in zip_code:
         zip_code = zip_code[:zip_code.rfind('.')]
     address = self.format__address_4(row['Address'], row['City'], 'IL', zip_code)
     il.add_value('dba_name', name[1])
     il.add_value('permit_lic_no', row.get('License #', ''))
     il.add_value(
         'permit_lic_exp',
         self.format_date(row.get('Exp. Date', ''))
         if row.get('Exp. Date') else '')
     il.add_value('company_name', company_name)
     il.add_value('location_address_string', address)
     il.add_value('county', row.get('County', ''))
     il.add_value(
         'permit_lic_desc', "Medical License for " +
         company_name if company_name else "Medical License")
     il.add_value('company_phone', row.get('Phone', ''))
     il.add_value(
         'company_subtype',
         row.get('Type', '')
         if row.get('Type', '') else 'Nursing Home License')
     yield il.load_item()
Example 3
    def parse_busi_art(self, res):
        tag = res.meta['tag']

        url = res.url
        main = res.css('.container.js-social-anchor-start')
        ci = ItemLoader(item=CNN(), selector=main)

        ci.add_value('tag', tag)
        ci.add_value('crawled_at', self.crawled_at)
        ci.add_value('url', url)

        ci.add_css('title', 'h1.article-title.speakable::text')

        ci.add_xpath('timestamp', './/span[@class="cnnDateStamp"]/text()')

        img_ = main.xpath('.//div[@id="storytext"]//img/@src').extract()

        ci.add_value('image_urls', img_)
        ci.add_css('summary', 'h2.speakable::text')

        ci.add_xpath('text', './/p/text()')
        ci.add_value('source', self.source)

        return ci.load_item()
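Since the loader was created with selector=main, the manual extract()/add_value round-trip for the images could equally be a single relative add_xpath call; a hedged equivalent:

        # equivalent to the img_ extraction above, run by the loader itself
        ci.add_xpath('image_urls', './/div[@id="storytext"]//img/@src')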
Example 4
    def parse_detail(self, response):
        # extract the question_item fields from the request_url
        item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
        item_loader.add_css('title', '.QuestionHeader-title::text')
        item_loader.add_css('content', '.QuestionHeader-detail')
        item_loader.add_value('url', response.url)
        question_id = response.meta.get('question_id', '')
        item_loader.add_value('question_id', question_id)
        item_loader.add_css('answer_num', 'h4.List-headerText span::text')
        item_loader.add_css('click_num', '.NumberBoard-itemValue::attr(title)')
        item_loader.add_css('comment_num',
                            '.QuestionHeader-Comment button::text')
        item_loader.add_css('watch_user_num',
                            '.NumberBoard-itemValue::attr(title)')
        item_loader.add_css('topics', '.Popover div::text')
        item_loader.add_value(
            'crawl_time',
            datetime.datetime.now().strftime(SQL_DATETIME_FORMAT))

        question_item = item_loader.load_item()
        yield scrapy.Request(url=self.start_answer_url.format(
            question_id, 20, 0),
                             callback=self.parse_answer)
        yield question_item
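A hedged refinement: on Scrapy 1.7+, cb_kwargs can forward question_id to the answer callback explicitly instead of relying on it being baked into the URL (this assumes parse_answer is adjusted to accept the keyword argument):

        yield scrapy.Request(
            url=self.start_answer_url.format(question_id, 20, 0),
            callback=self.parse_answer,
            cb_kwargs={'question_id': question_id},  # parse_answer must accept this
        )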
Example 5
    def parse(self, response):
        productos = response.css('div.product-tile-inner')
        for producto in productos:
            detalles = producto.css('div.detail')
            tiene_detalles = len(detalles) > 0
            if tiene_detalles:  # only parse tiles that have a detail block
                producto_loader = ItemLoader(  # loader that populates the item's fields
                    item=ProductoFybeca(),  # item class
                    selector=producto  # default selector
                )

                # return scalars instead of lists
                producto_loader.default_output_processor = TakeFirst()

                producto_loader.add_css(
                    'titulo',  # item field name
                    'a.name::text'  # CSS selector holding the field's value
                )
                producto_loader.add_xpath(
                    'imagen',
                    'div[contains(@class,"detail")]/a[contains(@class,"image")]/img[contains(@id,"gImg")]/@src'  # XPath holding the field's value
                )

                yield producto_loader.load_item()
Example 6
    def parse(self, response):

        # Get the list of products from html response
        products_list = response.xpath('//div[@data-search-results=""]/div//li//a/@href').extract()
        products_id_list = [product_href.split("/")[4] for product_href in products_list]

        # Report how many products were found
        print(f"#### FOUND {len(products_id_list)} PRODUCTS")

        if self.URLS_ONLY:
            for product_id in products_id_list:

                # Create the ItemLoader object that stores each product information
                l = ItemLoader(item=ProductItem(), response=response)

                product_url = f'https://www.etsy.com/listing/{product_id}'
                l.add_value('url', product_url)
                yield l.load_item()

        else:
            for product_id in products_id_list:
                product_url = f'https://www.etsy.com/listing/{product_id}'
                # Stops if the COUNTER reaches the maximum set value
                if self.COUNTER < self.COUNT_MAX:
                    # Go to the product's page to get the data
                    yield scrapy.Request(product_url, callback=self.parse_product, dont_filter=True)

        # Pagination - Go to the next page
        current_page_number = int(response.url.split('=')[-1])
        next_page_number = current_page_number + 1
        # Build the next page URL
        next_page_url = '='.join(response.url.split('=')[:-1]) + '=' + str(next_page_number)

        # If the current list is not empty
        if len(products_id_list) > 0:
            yield scrapy.Request(next_page_url)
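Rebuilding the next-page URL by splitting on '=' breaks as soon as the URL carries a second query parameter. A sketch using w3lib (a Scrapy dependency), assuming the paging parameter is literally named 'page':

        from w3lib.url import add_or_replace_parameter

        # replaces (or appends) the 'page' query parameter in place
        next_page_url = add_or_replace_parameter(response.url, 'page', str(next_page_number))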
Example 7
 def parse(self, response):
     json_data = json.loads(response.text)
     twitters = json_data[0]['card_group']
     for twitter in twitters:
         if twitter.get('mblog'):
             loader = ItemLoader(item=WeiboItem())
             loader.default_output_processor = Join()
             try:
                 loader.add_value('user_name',
                                  twitter['mblog']['user']['screen_name'])
                 loader.add_value('time', twitter['mblog']['created_at'])
                 loader.add_value('comments',
                                  str(twitter['mblog']['comments_count']))
                 loader.add_value('likes',
                                  str(twitter['mblog']['attitudes_count']))
                 loader.add_value('text', twitter['mblog']['text'])
                 if twitter['mblog'].get('retweeted_status'):
                     loader.add_value('type', '转发')  # repost
                 elif twitter['mblog'].get('page_info'):
                     if twitter['mblog']['page_info'].get('video_details'):
                         loader.add_value('type', '视频')  # video
                     else:
                         loader.add_value('type', '原创')  # original
                 else:
                     loader.add_value('type', '原创')  # original
                 yield loader.load_item()
             except KeyError:
                 self.logger.error('KeyError')
         else:
             self.logger.info('No mblog key')
     # go to next link
     next_link = json_data[0]['next_cursor']
     yield scrapy.Request(
         'https://m.weibo.cn/feed/friends?version=v4&next_cursor={}&page=1'.
         format(str(next_link)),
         callback=self.parse)
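The nested type logic could be factored into a small helper; a sketch (the labels mean repost, video, and original, respectively):

def classify_mblog(mblog):
    """Map a Weibo mblog record to the type labels used above."""
    if mblog.get('retweeted_status'):
        return '转发'  # repost
    if (mblog.get('page_info') or {}).get('video_details'):
        return '视频'  # video
    return '原创'  # original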
Example 8
    def parse_article(self, response, date):
        if 'pdf' in response.url:
            return

        item = ItemLoader(Article())
        item.default_output_processor = TakeFirst()

        title = response.xpath('//h3[@class="bronze"]/text()').get()
        if title:
            title = title.strip()

        content = response.xpath(
            '//div[contains(@class,"content")][h3]//text()').getall()
        content = [
            # keep non-blank text; '{' filters inline CSS/JS fragments
            text for text in content if text.strip() and '{' not in text
        ]
        content = "\n".join(content).strip()

        item.add_value('title', title)
        item.add_value('date', date)
        item.add_value('link', response.url)
        item.add_value('content', content)

        return item.load_item()
Example 9
    def parse_item(self, response):
        """ This function parses a property page.

        @url http://scrapybook.s3.amazonaws.com/properties/property_000000.html
        @returns items 1
        @scrapes title price description address image_urls
        @scrapes url project spider server date
        """

        # Create the loader using the response
        l = ItemLoader(item=PropertiesItem(), response=response)

        # Load fields using XPath expressions
        l.add_xpath('title', '//*[@itemprop="name"][1]/text()',
                    MapCompose(str.strip, str.title))
        l.add_xpath('price',
                    './/*[@itemprop="price"][1]/text()',
                    MapCompose(lambda i: i.replace(',', ''), float),
                    re='[,.0-9]+')
        l.add_xpath('description', '//*[@itemprop="description"][1]/text()',
                    MapCompose(str.strip), Join())
        l.add_xpath('address',
                    '//*[@itemtype="http://schema.org/Place"][1]/text()',
                    MapCompose(str.strip))
        l.add_xpath('image_urls', '//*[@itemprop="image"][1]/@src',
                    MapCompose(response.urljoin))

        # Housekeeping fields
        l.add_value('url', response.url)
        l.add_value('project', self.settings.get('BOT_NAME'))
        l.add_value('spider', self.name)
        l.add_value('server', socket.gethostname())
        l.add_value('date', datetime.datetime.now())

        return l.load_item()
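The @url, @returns, and @scrapes lines in the docstring above are Scrapy contracts: lightweight built-in spider tests. Running scrapy check <spider_name> downloads the @url page, runs this callback on it, and fails if fewer items than the @returns minimum come back or if an item is missing a @scrapes field.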
Example 10
    def parse(self, response):
        # with open('initialResp.json', 'wb') as f:
        #     f.write(response.body)
        current_page = response.request.meta['currentPage']
        json_resp = json.loads(response.body)
        houses = json_resp.get('searchResults').get('listResults')
        for house in houses:
            loader = ItemLoader(item=ZillowItem())
            loader.add_value('id', house.get('id'))
            loader.add_value('image_urls', house.get('imgSrc'))
            loader.add_value('detail_url', house.get('detailUrl'))
            loader.add_value('status_type', house.get('statusType'))
            loader.add_value('status_text', house.get('statusText'))
            loader.add_value('price', house.get('price'))
            loader.add_value('address', house.get('address'))
            loader.add_value('beds', house.get('beds'))
            loader.add_value('baths', house.get('baths'))
            loader.add_value('area_sqft', house.get('area'))
            loader.add_value('latitude', house.get('latLong').get('latitude'))
            loader.add_value('longitude', house.get('latLong').get('longitude'))
            loader.add_value('broker_name', house.get('brokerName'))
            loader.add_value('broker_phone', house.get('brokerPhone'))

            yield loader.load_item()

        total_pages = json_resp.get('searchList').get('totalPages')
        if current_page < total_pages:  # '<=' would request one page past the end
            current_page += 1
            yield scrapy.Request(
                url=parse_new_url(URL, page_number=current_page),
                callback=self.parse,
                cookies=cookie_parser(),
                meta={
                    'currentPage': current_page
                }
            )
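house.get('latLong').get('latitude') raises AttributeError for any listing without a latLong key; a hedged guard for the two coordinate lines:

            lat_long = house.get('latLong') or {}
            loader.add_value('latitude', lat_long.get('latitude'))
            loader.add_value('longitude', lat_long.get('longitude'))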
Example 11
    def parse_article(self, response):
        if 'pdf' in response.url:
            return

        item = ItemLoader(Article())
        item.default_output_processor = TakeFirst()

        title = response.xpath('//h1/text()').get()
        date = ''
        if title:
            title = title.strip()
            if title[:2].isnumeric():
                date = title.split()[0]

        content = response.xpath('//div[@class="text"]//text()').getall()
        content = [text for text in content if text.strip()]
        content = "\n".join(content).strip()

        item.add_value('title', title)
        item.add_value('date', date)
        item.add_value('link', response.url)
        item.add_value('content', content)

        return item.load_item()
Example 12
 def parse(self, response):
     """This function parses a property page
     @url https://nj.58.com/ershoufang/pn3/?PGTID=0d30000c-000a-c568-cd81-f02b4ffbea21&ClickID=1
     @returns items 1
     @scrapes title price description address image_urls
     @scrapes url project spider server date
     """
     # Create the loader using the response
     l = ItemLoader(item=PropertiesItem(), response=response)
     # Load fields using XPath expressions
     l.add_xpath('title', '//div[@class="list-info"][1]/h2[@class="title"]/a/text()')
     l.add_xpath('price', '//p[@class="sum"][1]/b/text()')
     l.add_xpath('description', '//div[@class="list-info"][1]/p[@class="baseinfo"][1]//text()',
                 MapCompose(str.strip), Join())
     l.add_xpath('address', '//div[@class="list-info"][1]/p[@class="baseinfo"][2]/span//text()',
                 MapCompose(str.strip), Join())
     l.add_xpath('image_urls', '//div[@class = "pic"][1]/a/img/@src')
     # Housekeeping fields
     l.add_value('url', response.url)
     l.add_value('project', self.settings.get('BOT_NAME'))
     l.add_value('spider', self.name)
     l.add_value('server', socket.gethostname())
     l.add_value('date', datetime.datetime.now())
     return l.load_item()
Example 13
    def parse_hotel(self, response):
        sel = Selector(response)

        item = ItemLoader(Hotel(), sel)
        item.add_xpath('nombre', '//h1[@id="HEADING"]/text()')

        item.add_xpath(
            'precio',
            '//div[@class="ui_columns is-mobile is-multiline is-vcentered is-gapless-vertical _2mWM5u8t" or @class= "ui_columns is-gapless is-mobile"]//div[contains(text(),"$")]'
        )

        item.add_xpath(
            'descripcion',
            '//div[contains(@data-ssrev-handlers,"load") and contains(@data-ssrev-handlers,"Description")]/div[1]/div[contains(text(),"")]'
        )

        item.add_xpath(
            'amenities',
            '//div[contains(@data-ssrev-handlers,"amenities")]//text()')
        yield item.load_item()


# RUN WITH:
# scrapy runspider 1_tripadvisor.py -o tripadvisor.csv -t csv
Example 14
    def parse(self, response):

        # Create the loader using the response
        l = ItemLoader(item=ScrapyLeoItem(), response=response)

        # Load fields using XPath expressions

        # page_url
        l.add_value('page_url', response.url)

        # rental_or_monthly
        if response.url[-20] == "r":
            l.add_value('rental_or_monthly', 'rental')
        else:
            l.add_value('rental_or_monthly', 'monthly')

        # leo_or_par
        if response.url[-18] == "0":  # compare to the character "0"; == 0 was always False
            l.add_value('leo_or_par', 'leo')
        else:
            l.add_value('leo_or_par', 'par')

        # address
        if len(response.xpath('.//td[@colspan="2"][2]/text()')) == 1:
            l.add_xpath('address', './/td[@colspan="2"][2]/text()')
        else:
            l.add_xpath(
                'address',
                '//*[@id="inquiry-form"]/div[1]/div/table/tbody/tr/td[2]/text()'
            )

        # mail_box
        l.add_xpath('mail_box', './/ul[2]/li[8]/span/@class',
                    MapCompose(lambda i: i.replace('sprite ico ', '')))

        return l.load_item()
Example 15
 def parse_review_container(self, response, **kwargs):
     film_id = kwargs['film_id']
     rv_containers = response.xpath('/html/body/div[1]/div[1]/div')
     xpath = '/html/body/div[1]/div[1]/div[{0}]/{1}'
     for idx in range(1, len(rv_containers) + 1):
         l = ItemLoader(item=ReviewItem(), response=response)
         l.add_value('film_id', film_id)
         l.add_xpath(
             'user_id',
             xpath.format(idx, 'div[1]/div[1]/div[2]/span[1]/a/@href'))
         l.add_xpath('comment_id', xpath.format(idx,
                                                'div[1]/div[1]/a/@href'))
         l.add_xpath(
             'date', xpath.format(idx,
                                  'div[1]/div[1]/div[2]/span[2]/text()'))
         l.add_xpath(
             'star_rating',
             xpath.format(idx, 'div[1]/div[1]/div[1]/span/span[1]/text()'))
         l.add_xpath('title', xpath.format(idx, 'div[1]/div[1]/a/text()'))
         l.add_xpath(
             'content',
             xpath.format(
                 idx, 'div[1]/div[1]/div[@class="content"]/div[1]/text()'))
         yield l.load_item()
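Indexing an absolute XPath template by position works, but ItemLoader's selector argument lets each container scope its own relative queries; a hedged rewrite of the same loop:

    def parse_review_container(self, response, **kwargs):
        film_id = kwargs['film_id']
        for container in response.xpath('/html/body/div[1]/div[1]/div'):
            # each loader is rooted at its own review container
            l = ItemLoader(item=ReviewItem(), selector=container)
            l.add_value('film_id', film_id)
            l.add_xpath('user_id', './div[1]/div[1]/div[2]/span[1]/a/@href')
            l.add_xpath('comment_id', './div[1]/div[1]/a/@href')
            l.add_xpath('date', './div[1]/div[1]/div[2]/span[2]/text()')
            l.add_xpath('star_rating', './div[1]/div[1]/div[1]/span/span[1]/text()')
            l.add_xpath('title', './div[1]/div[1]/a/text()')
            l.add_xpath('content', './div[1]/div[1]/div[@class="content"]/div[1]/text()')
            yield l.load_item()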
Example 16
    def parse_content(self, response):
        def deal_publish_time(publish_time_list):
            # build 'YYYY-MM-DD 00:00:00' from the regex groups, zero-padding month and day
            year, month, day = publish_time_list[:3]
            return '{}-{}-{} 00:00:00'.format(year, month.zfill(2), day.zfill(2))

        loaders1 = ItemLoader(response=response, item=YfspiderspeakItem())
        loaders1.add_value('url', response.url)
        loaders1.add_value('spider_time', time.time())
        loaders1.add_xpath('title', '//h1[@class="entry-title"]/text()')
        loaders1.add_value(
            'publish_time',
            response.xpath('//span[@class="entry-date"]').re(
                r'(\d{4}).*?(\d).*?(\d)'), deal_publish_time)
        loaders1.add_xpath(
            'content',
            '//div[contains(@id,"post")]/div[@class="entry-content"]//text()',
            Join())
        loaders1.add_value(
            'img_urls',
            response.xpath(
                '//div[contains(@id,"post")]/div[@class="entry-content"]').re(
                    r'href="([\S]*?\.jpg)"'))
        loaders1.add_xpath(
            'id', '//div[@id="content"]/div[contains(@id,"post")]/@id')

        item1 = loaders1.load_item()
        print(item1)
        return item1
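Note how add_value accepts trailing processors: deal_publish_time runs on the whole list of regex groups before input processing. A self-contained illustration with the standalone itemloaders package that Scrapy's loader wraps (the values are made up):

from itemloaders import ItemLoader

loader = ItemLoader()  # defaults to a plain dict item
loader.add_value(
    'publish_time', ['2020', '3', '7'],
    lambda p: '{}-{}-{} 00:00:00'.format(p[0], p[1].zfill(2), p[2].zfill(2)))
print(loader.load_item())  # {'publish_time': ['2020-03-07 00:00:00']}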
Example 17
    def parse_item(self, response):
        """Parse an ad page with an apartment.

        @url https://www.immobilienscout24.de/expose/93354819
        @returns items 1 1
        @scrapes url title address neighborhood cold_rent warm_rent rooms
        """
        self.shutdown_on_error()
        item = ItemLoader(ApartmentItem(), response=response)
        item.add_value('url', response.url)
        item.add_css('title', 'h1#expose-title::text')

        for field, css_class in self.DIV_PRE_MAPPING.items():
            item.add_xpath(
                field,
                "//div/pre[contains(@class, '{}')]/text()".format(css_class))

        full_address = ''.join(
            response.xpath("//span[@data-qa='is24-expose-address']/div//text()"
                           ).extract()).strip()
        parts = full_address.split(self.CITY)
        if len(parts) == 1:
            item.add_value('address', full_address)
        else:
            street_zip = (parts[0] + self.CITY).strip(' ,').replace(
                ' (zur Karte) ', '')
            item.add_value('address', street_zip)
            item.add_value('neighborhood', ''.join(parts[1:]).strip(' ,'))

        item.add_css('cold_rent', 'div.is24qa-kaltmiete::text')
        item.add_css('warm_rent', 'dd.is24qa-gesamtmiete::text')
        item.add_css('rooms', 'div.is24qa-zi::text')
        item.add_xpath(
            'active', '//div[contains(@class, "status-message")]'
            '/h3[starts-with(normalize-space(.), "Angebot")]/text()')
        yield item.load_item()
Example 18
    def read_news(self, response):
        cuerpoPaths = [
            '//article//section[@class="article-content"]//p/text()',
            '//article//section[@class="article-content"]/text()',
        ]

        titulo = response.xpath(self.tituloPath).get()
        fecha_publicacion = response.xpath(self.fechaPath).get()

        for path in cuerpoPaths:
            cuerpo = response.xpath(path).getall()
            if cuerpo:
                break  # stop at the first XPath that yields content

        # The date should have the format YYYY-MM-DDTHH:MM:SS
        fecha_publicacion = self.format_fecha(fecha_publicacion)

        news = ItemLoader(item=News())
        news.add_value('titulo', titulo)
        news.add_value('cuerpo', cuerpo)
        news.add_value('fecha_publicacion', fecha_publicacion)
        news.add_value('url', response.url)
        news.add_value('diario', self.name)
        news.add_value('page', self.current_page)
        return news.load_item()
Example 19
    def parse(self, response):
        productos = response.css('div.product-tile-inner')
        for producto in productos:
            detalles = producto.css('div.detail')
            tiene_detalle = len(detalles) > 0
            if (tiene_detalle):
                producto_loader = ItemLoader(item=ProductoFybeca(),
                                             selector=producto)
                # producto_loader.default_output_processor=TakeFirst()

                producto_loader.add_css('titulo', 'a.name::text')

                producto_loader.add_xpath(
                    'imagen',
                    'div[contains(@class,"detail")]/a[contains(@class,"image")]/img[contains(@id,"gImg")]/@src'
                )

                producto_loader.add_css(
                    'precio_normal', 'div.side > div.price::attr(data-bind)')
                producto_loader.add_css(
                    'precio_descuento',
                    'div.price-member > div::attr(data-bind)')

                yield producto_loader.load_item()
Example 20
    def parse(self, response):
        self.logger.info("start parese url %s" %response.url)
        for div in response.xpath('//div[@class="house-listBox"]/div'):
            l = ItemLoader(item=PropertyItem(), selector=div)
            l.default_output_processor = TakeFirst()
            l.add_xpath("title", '(.//a)[2]/text()', MapCompose(lambda x: self.spc_reg.sub("",x)))
            l.add_xpath("url", "(.//a)[2]/@href",
                        MapCompose(lambda x: urljoin(response.url,urlparse(x).path)))
            l.add_xpath("price", './/p[@class="price-nub cRed"]/text()',Join())
            l.add_xpath("address",'.//a[@class="f000 mr_10"]//text()',
                        MapCompose(lambda x: self.spc_reg.sub("",x)),Join())

            l.add_xpath("dist_name", './/p[@class="f7b mb_15"]/text()',Join(), MapCompose(lambda x: x.split("-")[0].strip()))

            l.add_xpath("subdist_name",'.//p[@class="f7b mb_15"]/text()',Join(), MapCompose(lambda x: x.split("-")[1].split()[0]))

            # housekeeping
            l.add_value("source", response.url)
            l.add_value("project", self.settings.get("BOT_NAME"))
            l.add_value("spider", self.name)
            l.add_value("server", socket.gethostname())
            l.add_value("date", datetime.datetime.now().strftime("%Y%m%d%H%M%S"))

            yield l.load_item()
Example 21
    def parse_item(self, response):
        item = ItemLoader(item=Jl2763Item(), response=response)
        url = response.url
        item_list = item_code(url, self.web_name, 'code=(.*?)$')
        item.add_value('web_name', self.web_name)
        item.add_value('web_code', self.name)
        item.add_value('url', url)
        item.add_value('item_code', item_list.get('item_code'))
        item.add_css('title', '.tit_left_invest::text')
        item.add_css('amount', '.dl_left_invest.width-250 span::text')
        item.add_css('rate', 'dd')
        item.add_css('period', 'dd:nth-child(2)')
        # item.add_xpath('loan_using', '//*[contains(text(),"")]/following-sibling::td[1]/text()')
        # item.add_xpath('loaner_info', '//*[contains(text(),"证件号码")]/parent::li[1]')
        item.add_css('pay_type', '.money_left_invest i::text')
        item.add_css('progress', "[src='/mdw/images/repayment_r.png']")

        # invest records
        i_v = []
        invest_records_temp = '{{username={lst[0]}|rate=-1|postmoney={lst[1]}|money={lst[1]}|postdate={lst[2]}|status=全部通过}}'
        invest_records_format = ""
        tr = response.css('div .table02_repay').css('tr')
        # print(tr)
        try:
            for i in tr:
                lst = i.css('td::text').extract()
                if lst:
                    i_v.append(lst)
            for n in i_v:
                invest_records_format += invest_records_temp.format(lst=n)
            item.add_value('invest_records', invest_records_format)
            item.add_value('start', i_v[-1][2])
            item.add_value('end', i_v[0][2])
        except Exception:
            self.logger.info('invest records is error %s' % url)
        yield item.load_item()
Example 22
    def parse_item(self, response):
        selector = Selector(response=response)

        item_loader = ItemLoader(item=HouseRentingDoubanItem(),
                                 selector=selector,
                                 response=response)
        item_loader.add_css(field_name='title', css='table.infobox *::text')
        item_loader.add_css(field_name='title',
                            css='div#content > h1:first-child::text')
        item_loader.add_value(field_name='source', value=self.name)
        item_loader.add_css(field_name='author', css='h3 span.from a::text')
        # item_loader.add_css(field_name='image_urls', css='div.topic-content div#link-report img::attr(src)')
        item_loader.add_css(field_name='author_link',
                            css='h3 span.from a::attr(href)')
        item_loader.add_css(field_name='content',
                            css='div.topic-content div#link-report *::text',
                            re=r'\s*(.*)\s*')
        item_loader.add_value(field_name='source_url', value=response.url)
        item_loader.add_css(field_name='publish_time',
                            css='h3 span:last-child::text',
                            re=r'\s*(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\s*')

        yield item_loader.load_item()
Example 23
    def parse_itemsEb(self, response):
        """
        Funcion que se encarga de extraer la información de cada producto en Mercadolibre.
        """
        item = ItemLoader(Articulo(), response)

        if self.item_countE < 8:
            link = response.url
            item.add_value('store', 'Ebay')
            item.add_value('link', link)
            item.add_xpath('imageURL',
                           '//div[@id="mainImgHldr"]/img[@id="icImg"]/@src')
            item.add_xpath('name', '//h1[@id="itemTitle"]/text()[1]')
            item.add_xpath('Price', '//span[@id="prcIsum"]/text()',
                           MapCompose(self.priceCleaningE))
            item.add_xpath('description',
                           '//span[@id="vi-cond-addl-info"]/text()')
            yield item.load_item()
            self.item_countE = self.item_countE + 1
        else:
            print('Limit reached for Ebay')

            if 'ebay.com' in self.allowed_domains:
                self.allowed_domains.remove('ebay.com')
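Pruning allowed_domains at runtime is unreliable: Scrapy's offsite middleware compiles its domain filter once when the spider opens. If halting the whole crawl is acceptable once the limit is reached, a hedged alternative is raising CloseSpider:

from scrapy.exceptions import CloseSpider

# inside parse_itemsEb, in place of the allowed_domains edit:
if self.item_countE >= 8:
    raise CloseSpider('Ebay item limit reached')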
Example 24
    def parse_book_info(self, response: Response, short_name):
        # Get book's full name and author
        loader = ItemLoader(item=BookInfo(), response=response)
        # Find elements
        loader.add_css(FULL_NAME, BOOK_FULL_NAME_PATH)
        loader.add_css(AUTHOR, BOOK_AUTHOR_PATH)
        loader.add_css(LAST_CHAPTER, BOOK_LAST_CHAPTER_PATH)

        # Extracting data
        page = loader.load_item()
        last_chapter = int(page.get(LAST_CHAPTER))

        yield {
            SHORT_NAME: short_name,
            FULL_NAME: page.get(FULL_NAME),
            AUTHOR: page.get(AUTHOR),
            LAST_CHAPTER: last_chapter
        }

        for i in range(1, last_chapter + 1):
            yield Request(url=CHAPTER_URL.format(short_name, i),
                          callback=self.parse_chapter,
                          cb_kwargs=dict(short_name=short_name,
                                         chapter_index=i))
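int(page.get(LAST_CHAPTER)) only works if BookInfo declares scalar output processors; with the loader default, every field loads as a list. A sketch of the assumed item (the real field names behind the FULL_NAME/AUTHOR/LAST_CHAPTER constants may differ):

import scrapy
from itemloaders.processors import TakeFirst

class BookInfo(scrapy.Item):
    # assumed: TakeFirst() so page.get(...) returns a scalar, not a list
    full_name = scrapy.Field(output_processor=TakeFirst())
    author = scrapy.Field(output_processor=TakeFirst())
    last_chapter = scrapy.Field(output_processor=TakeFirst())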
Example 25
    def parse_item(self, response):
        """ This function parses a property page

        @url http://localhost:9312/properties/property_000000.html
        @returns items 1
        @scrapes title price description address image_urls
        @scrapes url project spider server date
        """
        loader = ItemLoader(item=PropertiesItem(), response=response)
        loader.add_xpath('title', '//*[@itemprop="name"][1]/text()', MapCompose(str.strip, str.title))
        loader.add_xpath('price', '//*[@itemprop="price"][1]/text()',
                         MapCompose(lambda i: i.replace(',', ''), float), re='[,.0-9]+')
        loader.add_xpath('description', '//*[@itemprop="description"][1]/text()',
                         MapCompose(str.strip, lambda i: i.replace('\r\n', ' ')))
        loader.add_xpath('address', '//*[@itemtype="http://schema.org/Place"][1]/text()', MapCompose(str.strip))
        loader.add_xpath('image_urls', '//*[@itemprop="image"][1]/@src',
                         MapCompose(lambda i: parse.urljoin(response.url, i)))

        loader.add_value('url', response.url)
        loader.add_value('project', self.settings.get('BOT_NAME'))
        loader.add_value('spider', self.name)
        loader.add_value('server', socket.gethostname())
        loader.add_value('date', datetime.datetime.now())
        yield loader.load_item()
Example 26
    def parse_trainer(self, response):
        """ Parse trainer page.

        @url https://www.oddspark.com/keiba/TrainerDetail.do?trainerNb=018052
        @returns items 1 1
        @returns requests 0 0
        @trainer
        """

        logger.info(f"#parse_trainer: start: url={response.url}")

        # Parse trainer
        logger.debug("#parse_trainer: parse trainer")

        loader = ItemLoader(item=TrainerItem(), response=response)
        loader.add_value("trainer_url", response.url)
        loader.add_xpath("trainer_name", "normalize-space(//div[contains(@class,'section')]/div/span[1]/text())")
        loader.add_xpath("birthday", "normalize-space(//table[contains(@class,'tb72')]/tr[1]/td/text())")
        loader.add_xpath("gender", "normalize-space(//table[contains(@class,'tb72')]/tr[2]/td/text())")
        loader.add_xpath("belong_to", "normalize-space(//table[contains(@class,'tb72')]/tr[3]/td/text())")
        i = loader.load_item()

        logger.info(f"#parse_trainer: trainer={i}")
        yield i
Example 27
    def parse_question(self, response):
        """处理问题函数"""
        item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
        item_loader.add_value('zhihu_id', response.meta.get('question_id', ''))
        item_loader.add_css('title', 'h1.QuestionHeader-title::text')
        item_loader.add_css('content', '.QuestionRichText')
        item_loader.add_value('url', response.url)
        item_loader.add_css('answer_num', '.List-headerText span::text')
        item_loader.add_css('comments_num',
                            '.QuestionHeader-Comment > button::text')
        item_loader.add_css('watch_user_num',
                            '.NumberBoard-item .NumberBoard-value::text')
        item_loader.add_css('click_num',
                            '.NumberBoard-item .NumberBoard-value::text')
        item_loader.add_css('topics', '.TopicLink .Popover div::text')

        question_item = item_loader.load_item()

        yield scrapy.Request(self.start_answer_url.format(
            response.meta.get('question_id', ''), 20, 0),
                             headers=self.headers,
                             callback=self.parse_answer)

        yield question_item
Example 28
    def parse_odds_trifecta(self, response):
        """ Parse odds(trifecta) page.

        @url https://www.oddspark.com/keiba/Odds.do?sponsorCd=06&raceDy=20201018&opTrackCd=11&raceNb=7&betType=8&horseNb=1
        @returns items 1
        @returns requests 0 0
        @odds_trifecta
        """

        logger.info(f"#parse_odds_trifecta: start: url={response.url}")

        # Parse odds trifecta
        for tr in response.xpath("//table[@summary='odds']/tr"):
            if len(tr.xpath("th")) == 2:
                logger.debug("#parse_odds_trifecta: skip header")
            else:
                loader = ItemLoader(item=OddsTrifectaItem(), selector=tr)
                loader.add_value("odds_url", response.url)
                loader.add_xpath("horse_number", "th/text()")
                loader.add_xpath("odds", "td/span/text()")
                i = loader.load_item()

                logger.debug(f"#parse_odds_trifecta: odds trifecta={i}")
                yield i
Example 29
 def parse_auto_page(self, response):
     item = ItemLoader(AvitoParserItem(), response)
     item.add_xpath(
         'title',
         '//h1[@class="title-info-title"]/span[@class="title-info-title-text"]/text()'
     )
     item.add_xpath(
         'price',
         '//div[@class="item-price-value-wrapper"]//span[@class="js-item-price"]/@content'
     )
     item.add_xpath(
         'params',
         '//div[@class="item-params"]/ul[@class="item-params-list"]/li')
     item.add_xpath(
         'photos',
         '//div[contains(@class, "gallery-img-wrapper")]/div[contains(@class, "gallery-img-frame")]/@data-url'
     )
     autoteka_link_id = response.xpath(
         '//div[@class="js-autoteka-teaser"]/@data-item-id').extract_first(
         )
     autoteka_link = 'https://www.avito.ru/web/1/swaha/v1/autoteka/teaser/'
     yield response.follow(autoteka_link + autoteka_link_id,
                           callback=self.get_VIN_official,
                           meta={'item': item})
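extract_first() returns None when the teaser div is absent, which would crash the URL concatenation; a hedged guard (note that meta carries the loader itself, which get_VIN_official presumably finishes and loads):

     if autoteka_link_id:  # skip listings without an autoteka teaser
         yield response.follow(autoteka_link + autoteka_link_id,
                               callback=self.get_VIN_official,
                               meta={'item': item})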
Example 30
    def parse(self, response):
        
        json_resp = json.loads(response.body)
        houses = json_resp.get('cat1').get('searchResults').get('listResults')

        for house in houses:
            loader = ItemLoader(item=ZillowItem())
            loader.add_value('id', house.get('id'))
            loader.add_value('image_urls', house.get('imgSrc'))
            loader.add_value('detail_url', house.get('detailUrl'))
            loader.add_value('status_type', house.get('statusType'))
            loader.add_value('status_text', house.get('statusText'))
            loader.add_value('price', house.get('price'))
            loader.add_value('address', house.get('address'))
            loader.add_value('beds', house.get('beds'))
            loader.add_value('baths', house.get('baths'))
            loader.add_value('area_sqft', house.get('area'))
            loader.add_value('latitude', house.get('latLong').get('latitude'))
            loader.add_value('longitude', house.get('latLong').get('longitude'))
            loader.add_value('broker_name', house.get('brokerName'))
            yield loader.load_item()

        current_page = response.meta['currentPage']
        total_pages = json_resp.get('cat1').get('searchList').get('totalPages')

        if current_page < total_pages:  # '<=' would request one page past the end
            nxt_pg = current_page + 1

            yield scrapy.Request(
                url=parse_new_url(URL, pg_num=nxt_pg),
                callback=self.parse,
                cookies=cookie_parser(),
                meta={
                    'currentPage': nxt_pg
                }
            )