Example #1
    def parse_item(self, response):
        """ This function parses a property page.

        @url http://web:9312/properties/property_000000.html
        @returns items 1
        @scrapes title price description address image_urls
        @scrapes url project spider server date
        """

        # Create the loader using the response
        l = ItemLoader(item=PropertiesItem(), response=response)

        # Load fields using XPath expressions
        l.add_xpath("title", '//*[@itemprop="name"][1]/text()', MapCompose(unicode.strip, unicode.title))
        l.add_xpath(
            "price", './/*[@itemprop="price"][1]/text()', MapCompose(lambda i: i.replace(",", ""), float), re="[,.0-9]+"
        )
        l.add_xpath("description", '//*[@itemprop="description"][1]/text()', MapCompose(unicode.strip), Join())
        l.add_xpath("address", '//*[@itemtype="http://schema.org/Place"][1]/text()', MapCompose(unicode.strip))
        l.add_xpath(
            "image_urls", '//*[@itemprop="image"][1]/@src', MapCompose(lambda i: urlparse.urljoin(response.url, i))
        )

        # Housekeeping fields
        l.add_value("url", response.url)
        l.add_value("project", self.settings.get("BOT_NAME"))
        l.add_value("spider", self.name)
        l.add_value("server", socket.gethostname())
        l.add_value("date", datetime.datetime.now())

        return l.load_item()
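The PropertiesItem class that this loader fills is not shown; a minimal sketch, assuming one Field per name listed in the @scrapes contracts above, could look like this:

import scrapy

class PropertiesItem(scrapy.Item):
    # Primary fields
    title = scrapy.Field()
    price = scrapy.Field()
    description = scrapy.Field()
    address = scrapy.Field()
    image_urls = scrapy.Field()
    # Housekeeping fields
    url = scrapy.Field()
    project = scrapy.Field()
    spider = scrapy.Field()
    server = scrapy.Field()
    date = scrapy.Field()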
def parse_link_page(response):
    for post in response.xpath('//div[@data-type="link"]'):
        l = ItemLoader(RedditPostItem(), selector=post)
        post_root_xpath = './div[contains(@class, "entry")]'
        title = post_root_xpath + '/p[@class="title"]'
        tagline = post_root_xpath + '/p[@class="tagline"]'
        buttons = post_root_xpath + '/ul'
        l.add_xpath('title', title + '/a/text()')
        l.add_xpath('link', title + '/a/@href')
        l.add_xpath('poster', tagline + '/a[contains(@class, "author")]/text()')
        l.add_xpath('score', './div[contains(@class, "midcol")]/div[@class="score unvoted"]/text()')
        l.add_xpath('number_of_comments', buttons + '//a[contains(@class, "comments")]/text()')
        l.add_xpath('comments_link', buttons + '//a[contains(@class, "comments")]/@href')
        l.add_xpath('subreddit', './@data-subreddit')
        l.add_xpath('post_timestamp', tagline + '/time/@datetime')
        l.add_value('scrape_timestamp', datetime.datetime.now())

        item = l.load_item()
        # if there are any comments for the post, go scrape them
        item["comments"] = []
        if item["number_of_comments"] > 0:
            yield scrapy.Request(item["comments_link"] + "?limit=500",
                                 callback=parse_comments,
                                 meta={'item': item})
        else:
            yield item
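The parse_comments callback referenced above is not part of this snippet; a minimal sketch, assuming the comment texts can be collected with a hypothetical CSS selector, could look like this:

def parse_comments(response):
    # The partially built post item travels along in the request meta
    item = response.meta['item']
    # Hypothetical selector for the comment bodies on the comments page
    for comment in response.css('div.comment div.md p::text').extract():
        item['comments'].append(comment.strip())
    yield item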
Example #3
    def parse(self, response):
        l=ItemLoader(item=RentalItem(),response=response)
        l.add_xpath('price','//*[(@id = "main-info")]//*[contains(concat( " ", @class, " " ), concat( " ", "txt-big", " " )) and contains(concat( " ", @class, " " ), concat( " ", "txt-bold", " " ))]/text()')
        l.add_xpath('adress','//*[(@id = "addressPromo")]//*[contains(concat( " ", @class, " " ), concat( " ", "txt-bold", " " ))]/text()')
        l.add_value('url', response.url)

        return l.load_item()
    def parse_item(self, response):
        # FIXME: fix array issue
        i = ItemLoader(item=SalefinderItem(), response=response)
        title = r'//div[@id="product-details-container"]//h1/text()'
        price = r'//div[@id="product-details-container"]//span[@class="price"]/text()'
        per = r'//div[@id="product-details-container"]//span[@class="price"]/text()'
        image_url = r'//a[@id="product-image-container"]//img/@src'

        i.add_xpath('title', title, MapCompose(unicode.lower))
        i.add_xpath('price', price, re=r'[,.0-9]+')
        i.add_xpath('per', per, re=r'pk|each|kg')
        i.add_xpath('image_url', image_url)

        i.add_value('url', response.url)
        i.add_value('date', date.today().isoformat())

        product_buy = response.xpath("//div[@class='product-container']//div[@id='product-buy']")
        product_buy_text = product_buy.extract_first().lower()

        # Detect the vendor from a product-buy div
        if 'coles' in product_buy_text:
            i.add_value('vendor', 'coles')
        elif 'woolworths' in product_buy_text:
            i.add_value('vendor', 'woolworths')
        else:
            i.add_value('vendor', 'unknown')
        return i.load_item()
Example #5
    def parse_content_page(self, response):

        # Detect if this is a redirection page
        m = redirect_re.search(response.body)
        if m:
            import requests
            new_url = m.group(1)
            new_content = requests.get(new_url).content
            response = scrapy.http.HtmlResponse(new_url, body=new_content)

        # Start scraping
        il = ItemLoader(item = LuliItem(), response=response)
        
        il.add_css('content', 'div#articleNew > p::text')
        il.add_css('content', 'div[itemprop="articleBody"] > p')
        
        il.add_css('date', 'div#articleDate::text')
        il.add_css('date', 'header > time[datetime]::attr(datetime)')
        
        il.add_css('title', 'div#articleNew > h1::text')
        il.add_css('title', 'h1[itemprop="headline"]::text')
        
        il.add_value('url', response.url)

        item = il.load_item() 
        yield item
Example #6
    def parse(self, response):
        """ This function parses the categories and its subcategories on a gscholar web page.

        @url https://scholar.google.com/citations?view_op=top_venues&hl=de&vq=bus
        @returns items 1 1
        @returns requests 0 0
        @scrapes name subs
        """

        # We need the div that is 'selected' i.e. contains gs_sel as a css class
        title_xp = '//*[@id="gs_m_broad"]/div[contains(@class,\'gs_sel\')]/a/span/text()'

        item = ItemLoader(item=CategoryItem(), response=response)
        title = response.xpath(title_xp).extract_first()

        item.add_value('name', title)
        subs = []
        for sub in response.xpath('//*[@id="gs_m_rbs"]/ul/li/a'):
            s = {'name' : sub.xpath('text()').extract_first()}
            rel_url = sub.xpath('@href').extract_first()
            s['vq'] = parse_qs(urlparse(rel_url).query)[u'vq'][0]
            subs.append(s)
            req = Request(urljoin(response.url,rel_url), callback=self.parse_item)
            req.meta['parent'] = title
            yield req
        item.add_value('subs', subs)
        yield item.load_item()
Example #7
    def parse(self, response):
        sites = response.xpath('//table/tbody/tr')
        for site in sites:

            url = urljoin(response.url, site.xpath("td[2]/a/@href").extract_first())
            urlLast = urljoin(response.url, site.xpath("td[3]/a/@href").extract_first())
            item = DeathItem()
            loader = ItemLoader(item,selector=site)
            loader.add_xpath('Mid','td[1]/text()')
            loader.add_xpath('firstName','td[5]/text()')
            loader.add_xpath('lastName','td[4]/text()')
            loader.add_xpath('Date','td[8]/text()')
            loader.add_xpath('Race','td[9]/text()')
            loader.add_xpath('County','td[10]/text()')
            loader.add_xpath('Age','td[7]/text()')
            loader.add_value('OILink',url)
            loader.add_value('OLastStatement',urlLast)

 
            if url.endswith(("jpg","no_info_available.html")):
                loader.add_value('Description',u'')
                loader.add_value('Education',u'')
                if urlLast.endswith("no_last_statement.html"):
                    loader.add_value('Message',u'')
                    yield loader.load_item()
                else:
                    request = scrapy.Request(urlLast, meta={"item" : loader.load_item()}, callback =self.parse_details2)
                    yield request
            else:        
                request = scrapy.Request(url, meta={"item": loader.load_item(),"urlLast" : urlLast}, callback=self.parse_details)
                yield request
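Neither parse_details nor parse_details2 is included in this snippet; a minimal sketch of the follow-up callbacks, with hypothetical XPath expressions for the detail pages, could look like this:

    def parse_details(self, response):
        # Fill the item passed along in meta, then fetch the last statement if there is one
        item = response.meta["item"]
        item['Description'] = response.xpath('//div[@id="body"]//p/text()').extract_first(default=u'')  # hypothetical selector
        item['Education'] = u''
        urlLast = response.meta["urlLast"]
        if urlLast.endswith("no_last_statement.html"):
            item['Message'] = u''
            yield item
        else:
            yield scrapy.Request(urlLast, meta={"item": item}, callback=self.parse_details2)

    def parse_details2(self, response):
        # Hypothetical selector for the last statement text
        item = response.meta["item"]
        item['Message'] = response.xpath('//div[@id="body"]//p/text()').extract_first(default=u'')
        yield item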
Example #8
    def parse_image(self, response):
        logger.info("正在收集页面数据: %s ..." % response.url)
        loader = ItemLoader(item=MeiTuItem(), response=response)

        loader.add_xpath('publisher', "//div[@class='width']/div[@class='c_l']/p[1]/text()")
        loader.add_xpath('publisher', "//div[@class='width']/div[@class='c_l']/p[1]/a[@class='tags']/text()")
        loader.add_xpath('model_name', "//div[@class='width']/div[@class='c_l']/p[5]/text()")
        loader.add_xpath('model_name', "//div[@class='width']/div[@class='c_l']/p[5]/a[@class='tags']/text()")
        loader.add_xpath('publishtime', "//div[@class='width']/div[@class='c_l']/p[6]/text()")
        loader.add_xpath('magazine_no', "//div[@class='width']/div[@class='c_l']/p[2]/text()")
        loader.add_xpath('pic_qty', "//div[@class='width']/div[@class='c_l']/p[3]/text()")
        loader.add_xpath('pixel', "//div[@class='width']/div[@class='c_l']/p[4]/text()")

        try:
            loader.add_xpath('desc', "//p[@class='buchongshuoming']/text()")
        except ValueError:
            pass

        loader.add_xpath('tag', "//div[@class='fenxiang_l']/a[@class='tags']/text()")
        loader.add_xpath('sort', "//div[@class='weizhi']/span/a[2]/text()")
        loader.add_xpath('image_url', "//div[@class='content']/center/img[@class='content_img']/@src")
        loader.add_value("page_url", response.url)


        yield loader.load_item()
Example #9
    def parse_item(self, response):
        sel = response.css("div.path")

        loader = ItemLoader(item=SeriesItem(), selector=sel)
        loader.add_css("series_id", "a:last-child::attr(href)")
        loader.add_css("series_name", "a:last-child::text")

        series = loader.load_item()
        print(series)

        # Upcoming & currently on sale
        for sel in response.css("div.interval01-list-cars-infor"):
            loader = ItemLoader(item=ModelItem(), selector=sel)
            loader.add_css("model_id", "a::attr(href)")
            loader.add_css("model_name", "a::text")
            loader.add_value("series_id", series['series_id'])
            loader.add_value("series_name", series['series_name'])

            yield loader.load_item()

        # Discontinued
        url = "http://www.autohome.com.cn/ashx/series_allspec.ashx"

        years = response.css(".dropdown-content a::attr(data)")

        for year in years.extract():
            qs = {
                "y": year,
                "s": series["series_id"]
            }

            yield Request(url + "?" + urlencode(qs), self.stop_sale)
Example #10
    def parse_colleagues(self, response, author_id):
        self.logger.info('Parsing colleagues for author %s.' % author_id)

        # get all authors listed
        num_authors = 0
        for div in response.xpath('//*[@class="gsc_1usr gs_scl"]'):
            num_authors += 1
            name_xp = './*[@class="gsc_1usr_name"]/text()'
            id_val = urlparse.parse_qs(urlparse.urlparse(div.xpath('//*[@id="gsc_ccl"]/div[1]/div[2]/h3/a/@href').extract_first()).query)['user']
            cited_by_xp = './*[@class="gsc_1_usr_cby"]/text()'
            fos_xp = './/a[@class="gsc_co_int"]/@href' # --> ["foo", "bar",...]

            # load general author item for colleague
            co_auth = ItemLoader(item=AuthorItem(), response=response, selector=div)
            co_auth.add_value('id', id_val)
            co_auth.add_xpath('name', name_xp)
            co_auth.add_xpath('cited', cited_by_xp)
            co_auth.add_xpath('fos', fos_xp)
            yield co_auth.load_item()

            # load co-authorship
            relation = [author_id, id_val]
            relation.sort()
            co_rel = ItemLoader(item=CoAuthorItem(), response=response)
            co_rel.add_value('author1', relation[0])
            co_rel.add_value('author2', relation[1])
            yield co_rel.load_item()

        self.logger.info('Found %d colleagues for author %s.' % (num_authors, author_id))

        next_url = self.choose_next()

        if next_url:
            yield Request(url=next_url)
Example #11
 def _parse(self, response):
     l = ItemLoader(item=BookmarksItem(), response=response)
     l.add_xpath(u"name", u"/html/head/title")
     l.add_xpath(u"anchors", u"//a/@href'")
     l.add_xpath(u"description", u"/html/body/text()")
     l.add_value(u"last_updated", datetime.datetime)  # you can also use literal values
     return l.load_item()
		def parse_item(self,response):
			l = ItemLoader(item =MeizituItem(),response = response)
			l.add_xpath('name','//h2/a/text()')
			l.add_xpath('tags', "//div[@id='maincontent']/div[@class='postmeta  clearfix']/div[@class='metaRight']/p")
			l.add_xpath('image_urls', "//div[@id='picture']/p/img/@src",Identity())
			l.add_value('url', response.url)
			return l.load_item()
Example #13
	def parse(self, response):
		match = re.search('/displaySeminarList/',response.url)

		if match:
			urls = response.xpath('//div[@class="internList splitEntry"]//@href').extract()
			for url in urls:
				url = response.urljoin(url)
				yield scrapy.Request(url, self.parse)
		else:
			table = response.xpath(self.seminar_list_xpath)
			corpId = parse_qs(urlparse(response.url).query)['corpId']
			for index,semi in enumerate(table):
				loader = ItemLoader(SeminarItem(),semi)
				loader.default_input_processor = MapCompose(unicode.strip)
				loader.default_output_processor = Join()
				loader.add_value('companyid',corpId)
				loader.add_xpath('name','//div[@id="headerWrap"]//h3/text()')
				loader.add_xpath('date','.//td[@class="date"]/text()',re='\d+\/\d+\/\d+')
				loader.add_xpath('time','.//td[@class="time"]/text()')
				loader.add_xpath('area','.//td[@class="area"]/text()')
				loader.add_xpath('place','.//td[@class="place"]/text()')
				loader.add_xpath('loc_n','.//td[@class="place"]//a', re='mycom_loc\|(\d+\/\d+\/\d+\.\d+)\,\d+\/\d+\/\d+\.\d+')
				loader.add_xpath('loc_e','.//td[@class="place"]//a', re='mycom_loc\|\d+\/\d+\/\d+\.\d+\,(\d+\/\d+\/\d+\.\d+)')
				loader.add_xpath('target','.//td[@class="target"]/text()')
				yield loader.load_item()
    def parse_info(self, response):

        loaderJob = ItemLoader(item=JobInfoItem(), response=response)
        loaderCom = ItemLoader(item=ComInfoItem(), response=response)
        loaderJob.add_value('url', value=response.url)
        loaderJob.add_xpath('job_name', '//div[@class="inner-left fl"][1]/h1/text()', TakeFirstL())
        loaderJob.add_xpath('job_company', '//div[@class="inner-left fl"][1]/h2/a/text()', TakeFirstL())
        loaderJob.add_xpath('job_benefits', '//div[@class="inner-left fl"][1]/div/span/text()', JoinL('|'))
        divs = '//ul[@class="terminal-ul clearfix"]/li'
        loaderJob.add_xpath('job_salary', divs, TakeFirstL(), re=u'(?<=职位月薪:</span><strong>).*(?=</strong></li>)')
        loaderJob.add_xpath('job_location', divs, RemoveTagsL(), TakeFirstL(), re=u'(?<=工作地点:</span><strong>).*(?=</strong></li>)')
        loaderJob.add_xpath('job_update', divs, RemoveTagsL(), TakeFirstL(), re=u'(?<=发布日期:</span><strong>).*(?=</strong></li>)')
        loaderJob.add_xpath('job_nature', divs, TakeFirstL(), re=u'(?<=工作性质:</span><strong>).*(?=</strong></li>)')
        loaderJob.add_xpath('job_experience', divs, TakeFirstL(), re=u'(?<=工作经验:</span><strong>).*(?=</strong></li>)')
        loaderJob.add_xpath('job_miniEdu', divs, TakeFirstL(), re=u'(?<=最低学历:</span><strong>).*(?=</strong></li>)')
        loaderJob.add_xpath('job_recruNums', divs, TakeFirstL(), re=u'(?<=招聘人数:</span><strong>).*(?=</strong></li>)')
        loaderJob.add_xpath('job_category', divs, RemoveTagsL(), TakeFirstL(), re=u'(?<=职位类别:</span><strong>).*(?=</strong></li>)')
        loaderJob.add_xpath('job_desc', '//div[@class="tab-inner-cont"][1]', ExtractTextL(), StripBlankL(), JoinL('|'))
        loaderJob.add_xpath('job_desc_resp', '//div[@class="tab-inner-cont"][1]', ExtractTextL(), TakeFirstL(), re=u'(?<=岗位职责|工作职责).*?(?=任职资格|岗位要求)')
        loaderJob.add_xpath('job_desc_req', '//div[@class="tab-inner-cont"][1]', ExtractTextL(), TakeFirstL(), re=u'(?<=任职资格|岗位要求).*?(?=。)')
        loaderJob.add_xpath('job_desc_loc', '//div[@class="tab-inner-cont"][1]/h2/text()', TakeFirstL())

        loaderCom.add_xpath('url', '//div[@class="company-box"]/p[@class="company-name-t"]/a/@href', TakeFirstL())
        loaderCom.add_xpath('com_name', '//div[@class="company-box"]/p[@class="company-name-t"]/a/text()', TakeFirstL())
        divs = '//div[@class="company-box"]/ul/li'
        loaderCom.add_xpath('com_size', divs, ExtractTextL(), TakeFirstL(),  re=u'(?<=公司规模[:,:]).*')
        loaderCom.add_xpath('com_nature', divs, ExtractTextL(), TakeFirstL(),  re=u'(?<=公司性质[:,:]).*')
        loaderCom.add_xpath('com_industry', divs, ExtractTextL(), TakeFirstL(),  re=u'(?<=公司行业[:,:]).*')
        loaderCom.add_xpath('com_intro', '//div[@class="tab-inner-cont"][2]', ExtractTextL(), StripBlankL(), JoinL('|'))
        loaderCom.add_xpath('com_link', divs, ExtractTextL(), TakeFirstL(),  re=u'(?<=公司主页[:,:]).*')
        loaderCom.add_xpath('com_address', divs, RemoveTagsL(), TakeFirstL(),  re=u'(?<=公司地址[:,:])[\s\S]*(?=</strong>)')

        return loaderJob.load_item(), loaderCom.load_item()
    def get_player_info(self, response):
        loader = ItemLoader(item=NFL_Player_2015(), response=response)
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()

        number_and_position = response.xpath('//*[@id="content"]/div[3]/div[2]/div[3]/ul[1]/li[1]/text()').extract()

        if type(number_and_position) is list:
            number_and_position = number_and_position[0]
            number = number_and_position.split()[0]
            position = number_and_position.split()[1]
        else:
            number = ''
            position = ''

        loader.add_value('number', number)
        loader.add_value('position', position)
        loader.add_xpath('name', '//*[@id="content"]/div[3]/div[2]/h1/text()')
        loader.add_xpath('team', '//*[@id="content"]/div[3]/div[2]/div[3]/ul[1]/li[3]/a/text()')

        # loader.add_xpath('name', '//*[@id="content"]/div[3]/div[2]/h1/text()')
        # loader.add_xpath('team', '//*[@id="content"]/div[3]/div[2]/div[3]/ul[1]/li[3]/a/text()')

        yield loader.load_item()
Example #16
	def parse_item(self, response):
		"""
		This function parses a property page.

		@url http://localhost:9312/properties/property_000000.html
		@returns items 1
		@scrapes title price description address image_urls
		@scrapes url project spider server date
		"""
		l = ItemLoader(item=PropertiesItem(), response=response)
		l.add_xpath('title', '//*[@itemprop="name"][1]/text()', MapCompose(unicode.strip, unicode.title))
		l.add_xpath('price', '//*[@itemprop="price"][1]/text()', MapCompose(lambda i: i.replace(',', ''), float),
					re='[,.0-9]+')
		l.add_xpath('description', '//*[@itemprop="description"][1]/text()', MapCompose(unicode.strip), Join())
		l.add_xpath('address', '//*[@itemtype="http://schema.org/Place"][1]/text()', MapCompose(unicode.strip))
		l.add_xpath('image_urls', '//*[@itemprop="image"][1]/@src',
					MapCompose(lambda i: urlparse.urljoin(response.url, i)))

		# Housekeeping fields
		l.add_value('url', response.url)
		l.add_value('project', self.settings.get('BOT_NAME'))
		l.add_value('spider', self.name)
		l.add_value('server', socket.gethostname())
		l.add_value('date', datetime.datetime.now())
		return l.load_item()
 def parse_titles(self, response):
     loader = ItemLoader(item=BlogCategory(), response=response)
     loader.add_value('hub', response.meta['hname'])
     loader.add_css('title', 'div.company_post h1 span::text')
     loader.add_css('date', 'div.published::text')
     loader.add_css('article', 'div.content::text')
     yield loader.load_item()
Example #18
def parse(self, response):
    l = ItemLoader(item=Product(), response=response)
    l.add_xpath('name', '//div[@class="product_name"]')
    l.add_xpath('name', '//div[@class="product_title"]')
    l.add_xpath('price', '//p[@id="price"]')
    l.add_css('stock', 'p#stock')
    l.add_value('last_updated', 'today')
    return l.load_item()
Example #19
 def parse_group_home_page(self, response):
     i = ItemLoader(item=DoubanGroupItem(), response=response)
     i.add_xpath('group_name', self._x_query['name'])
     i.add_value('group_url', response.url)
     i.add_xpath('group_members', self._x_query['members'], re='\((\d+)\)')
     i.add_xpath('relative_groups', self._x_query['relative_groups'])
     i.add_xpath('friend_groups', self._x_query['friend_groups'])
     return i.load_item()
Example #20
 def parse_item(self,response):
 	l = ItemLoader(item=AskspiderItem(), response=response)
 	l.add_xpath('q_title',"//h1[@class='ask_title']/text()",MapCompose(unicode.strip),Join())
 	l.add_xpath('q_time',"//span[@class='ask_time']/text()",MapCompose(unicode.strip))
 	l.add_xpath('q_province',"//div[@class='abouttdd']/ul/li[1]/h3/span/text()",MapCompose(unicode.strip))
 	l.add_value('q_link',response.url)
 	l.add_xpath('q_user',"//a[@class='ask_username']/text()")
 	return l.load_item()
Example #21
    def parse(self, response):
        for item in self.find_items(response):
            loader = ItemLoader(item=self.item_class())
            for target in self.get_targets():
                loader.add_value(target.name, target.get_value(item, response))

            val = self.Meta.detail_path.get_value(item, response)
            yield gen_request(val, self.parse_details, loader.load_item())
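The gen_request helper is not defined in this snippet; a minimal sketch, assuming it only wraps the URL, callback, and partially loaded item into a request that carries the item in its meta, could look like this:

import scrapy

def gen_request(url, callback, item):
    # Hypothetical helper: pass the half-filled item to the detail callback via meta
    return scrapy.Request(url, callback=callback, meta={'item': item})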
Example #22
 def test_load_item_using_default_loader(self):
     i = TestItem()
     i['summary'] = u'lala'
     il = ItemLoader(item=i)
     il.add_value('name', u'marta')
     item = il.load_item()
     assert item is i
     self.assertEqual(item['summary'], u'lala')
     self.assertEqual(item['name'], [u'marta'])
Example #23
    def parse_content(self, response):
        logger.info('Dealing with images: %s', response.url)
        item_load = ItemLoader(item=ScrapyMeizituItem(), response=response)
        item_load.add_value('url', response.url)
        item_load.add_xpath('name', self._x_query['name'])
        item_load.add_xpath('tags', self._x_query['tags'])
        item_load.add_xpath('image_urls', self._x_query['image_urls'])

        return item_load.load_item()
Example #24
    def parse(self, response):
        item = ItemLoader(item=OrgItem(), response=response)
        item.add_value('id', self.curr)
        item.add_xpath('name', '//h2[@class="gsc_authors_header"]/text()')
        yield item.load_item()
        next_url = self.next_label_from_db()

        if next_url:
            yield Request(url=next_url,dont_filter=True)
Example #25
    def parse_content(self,response):
        bbsItem_loader = ItemLoader(item=BbsDmozItem(),response = response)
        url = str(response.url)
        bbsItem_loader.add_value('url',url)
        bbsItem_loader.add_xpath('forum',self._x_query['forum'])
        bbsItem_loader.add_xpath('poster',self._x_query['poster'])
        bbsItem_loader.add_xpath('content',self._x_query['page_content'])

        return bbsItem_loader.load_item()
Example #26
    def parse_detail(self, response):
        il = ItemLoader(NewsItem(), response=response)

        il.add_css("title", "%s::text" % self.title)
        il.add_css("date", "%s::text" % self.date)
        il.add_css("auth", "%s::text" % self.auth)
        il.add_css("content", "%s > p::text" % self.content)
        il.add_value("cate", response.meta["cate"])
        return il.load_item()
Example #27
 def parse_member(self, response):
     loader = ItemLoader(item=MemberItem(), response=response)
     matchs = re.search(r'idDiputado%3D(\d+)', response.url)
     loader.add_value('id', matchs.groups()[0])
     loader.add_xpath('name', '//div[@class="nombre_dip"]/text()')
     loader.add_xpath('term', '//div[@id="curriculum"]/div[@class="principal"]/text()')
     loader.add_xpath('province', '//div[@class="texto_dip"]/ul/li/div[@class="dip_rojo"]/text()')
     loader.add_xpath('party', '//div[@class="texto_dip"]/ul/li/div[@class="dip_rojo"]/a/text()')
     yield loader.load_item()
Example #28
    def parse_item(self, response):

        loader = ItemLoader(EolZhiyeItem(), response)
        loader.add_value('url', response.url)
        loader.add_value('code', response.url, re=r'/(\w+)\.shtml')
        loader.add_css('name', 'h1#pagetitle::text')
        loader.add_xpath('category', u'//div[@id="precontent"]/p[contains(., "行业")]/a/text()')
        loader.add_xpath('category2', u'//div[@id="precontent"]/p[contains(., "职业")]/a/text()')
        loader.add_xpath('detail', u'//div[@id="precontent"]/following-sibling::node()[not(self::table)]', Join('\n'))
        yield loader.load_item()
Example #29
 def parse_item(self,response):
     sel = Selector(response)
     l = ItemLoader(item =CarsItem(),response=response)
     name = sel.xpath('//div[@class="cartab-title"]/h2/a/text()').extract()[0].encode('utf-8')
     l.add_value('name',name)
     for link in sel.xpath('//a[@target="_blank"]/img/@src').extract():
         link = link.replace('t_','u_')
         l.add_value('image_urls',link)
        # print link
     return l.load_item()
Example #30
	def parse_item_yj(self,response):
		l=ItemLoader(item=YjspiderItem(),response=response)
		l.add_xpath('yj_title',"//div[@class='ctd_head_left']/h2/text()",MapCompose(unicode.strip),Join())
		l.add_xpath('yj_time',"//div[@class='w_journey']/dl/dt/span[2]/text()",MapCompose(unicode.strip))
		l.add_value('yj_link',response.url)
		l.add_xpath('yj_looknum',"//a[@class='link_browse']/span/text()")
		l.add_xpath('yj_pl',"//a[@class='link_comment']/span/text()")
		l.add_xpath('yj_author',"//a[@id='authorDisplayName']/text()",MapCompose(unicode.strip))
		l.add_xpath('yj_province',"//div[@class='breadbar_v1 cf']/ul/li[4]/a/text()")
		return l.load_item()
Example #31
    def parse_detail(self, response):
        # article_content = response.css('.article_content #content').extract()
        # article_content = response.xpath('//div[@class="article_content"]/div[@id="content"]').extract()
        # original_url = response.css('.article_detail a::attr(href)').extract_first()
        # # original_url = response.xpath('//div[@class="article_detail"]/a/@href').extract_first()
        tags = response.css('.article_more a::text').extract()
        if not tags:
            tags = '无'  # '无' means "none"
        # # tags = response.xpath('//*[@class="article_more"]/a/text()').extract()

        item_loader = ItemLoader(item=ZakerItem(),
                                 response=response,
                                 dont_filter=True)

        item_loader.add_value('url_id', get_md5(response.url))
        item_loader.add_value('article_url', response.url)
        item_loader.add_value('title', response.meta.get('title'))
        item_loader.add_value('media', response.meta.get('media'))
        item_loader.add_value('comments_num',
                              response.meta.get('comments_num'))
        item_loader.add_value('img_url', response.meta.get('img_url'))
        item_loader.add_css('article_content', '.article_content #content')
        item_loader.add_css('original_url', '.article_detail a::attr(href)')
        item_loader.add_value('tags', tags)
        # item_loader.add_value('parse_time', datetime.datetime.now())
        article_item = item_loader.load_item()
        yield article_item
Example #32
 def parse_item(self, response):
     loader = ItemLoader(item=SpiderItem(), response=response)
     content = ''
     try:
         title = response.xpath(
             r'//*[@class="dianzititle"]//text()').extract()
         date = response.xpath(
             r'//*[@id="InfoPickFromFieldControl"]//text()').extract_first(
             )
         match = re.search(r'([0-9-]+)', date)
         if match:
             date = match.group(1)
         else:
             date = '1970-01-01'
         content = response.xpath(
             r'//*[@id="FreePlaceHoldersControl1"]//text()').extract()
         loader.add_value('date', date)
         loader.add_value('title', title)
         loader.add_value('content', content)
     except Exception as e:
         self.logger.error('error url: %s error msg: %s' %
                           (response.url, e))
         loader.add_value('date', '1970-01-01')
         loader.add_value('title', 'unknown')
         loader.add_value('content', '')
     finally:
         self.logger.info("crawled url: %s" % response.url)
         loader.add_value('url', response.url)
         loader.add_value('collection_name', self.name)
         loader.add_value("website", self.website)
         if content == '':
             self.logger.warning(' url: %s msg: %s' %
                                 (response.url, ' content is None'))
         yield loader.load_item()
Example #33
    def get_news(self, response):
        l = ItemLoader(item=SpiderItem(), response=response)
        l.add_value('title',
                    response.xpath('//h1[@class="title"]/text()').extract())
        l.add_value(
            'title',
            response.xpath(
                '//span[@class="articletitle_p22"]/text()').extract())
        l.add_value('title',
                    response.xpath('//h1[@class="tit_h2"]/text()').extract())
        l.add_value(
            'title',
            response.xpath('//span[@class="gog_title"]/text()').extract())
        l.add_value(
            'title',
            response.xpath('//td[@class="gog_title"]/text()').extract())

        l.add_value('date',
                    response.xpath('//div[@class="info"]/text()').extract())
        l.add_value(
            'date',
            response.xpath('//span[@class="p12 LightGray2"]/text()').extract())
        l.add_value(
            'date',
            response.xpath('//div[@class="articletime"]/text()').extract())
        l.add_value(
            'date',
            response.xpath('//body/table[5]/tr[5]/td[2]/div/text()').extract())
        l.add_value(
            'date',
            response.xpath(
                '//body/table[6]/tr/td/table/tr/td/table[3]/tr/td/text()').
            extract())
        r1 = r"\d{4}.\d{1,2}.\d{1,2}"
        date0 = re.compile(r1)
        date = ''.join(l.get_collected_values('date'))
        date1 = date0.findall(date)
        l.replace_value('date', date1[0])
        l.add_value(
            'content',
            response.xpath('//div[@class="content"]/p/text()').extract())
        l.add_value('content',
                    response.xpath('//td[@class="p16"]/p/text()').extract())
        l.add_value(
            'content',
            response.xpath('//div[@class="content01 p16"]/p/text()').extract())
        l.add_value(
            'content',
            response.xpath('//div[@class="content"]/div/p/text()').extract())
        l.add_value(
            'content',
            response.xpath('//span[@class="gog_content"]/p/text()').extract())
        l.add_value(
            'content',
            response.xpath('//div[@class="content"]/p/a/text()').extract())
        l.add_value(
            'content',
            response.xpath('//td[@class="gog_content"]/p/text()').extract())
        l.add_value(
            'content',
            response.xpath(
                '//td[@class="gog_content"]/font/p/text()').extract())
        l.add_value(
            'content',
            response.xpath('//td[@class="p16"]/div/p/text()').extract())

        l.add_value('url', response.url)
        l.add_value('collection_name', self.name)
        l.add_value('website', self.website)
        return l.load_item()
Example #34
    def parse_news(self, response):

        loader = ItemLoader(item=News(), response=response)
        loader.add_value('url', response.url)

        loader.add_value('media_id', self.media_id)
        loader.add_value('election_id', self.election_id)

        #parse title
        title_selectors = response.css('h1.read__title::text')
        if not title_selectors:
            return loader.load_item()
        title = title_selectors.extract_first()
        loader.add_value('title', title)

        #parse date
        date_selectors = response.css('div.read__time::text')
        if not date_selectors:
            return loader.load_item()
        date_str = date_selectors.extract_first()
        # eg: Kompas.com - 10/10/2017, 13:37 WIB
        time_arr = filter(None, re.split('[\s,-]', date_str))[1:3]
        info_time = ' '.join([_(s) for s in time_arr if s])

        #parse date information
        try:
            published_at_wib = datetime.strptime(info_time, '%d/%m/%Y %H:%M')
        except ValueError as e:
            raise CloseSpider('cannot_parse_date: %s' % e)

        #convert to utc+0
        published_at = wib_to_utc(published_at_wib)
        loader.add_value('published_at', published_at)

        #parse author name
        author_name_selectors = response.css(
            'div.read__author::text').extract_first()
        if not author_name_selectors:
            loader.add_value('author_name', 'N/A')
        else:
            author_name = author_name_selectors
            loader.add_value('author_name', author_name)

        #parse raw content
        raw_content_selectors = response.css('div.read__content')
        if not raw_content_selectors:
            return loader.load_item()
        raw_content = raw_content_selectors.extract_first()
        loader.add_value('raw_content', raw_content)

        return loader.load_item()
Example #35
    def parse_activity(self, response):
        activity = response.xpath(
            'normalize-space(.//h4[@class="ct-u-marginBottom20"])')
        company_names = response.xpath(
            './/*[contains(@class, "ct-product--tilte")]')
        company_addrs = response.xpath(
            './/*[contains(@class, "ct-product--description")]')
        company_contacts = response.xpath(
            './/div[contains(@id, "coordonnees")]')
        company_websites = response.xpath(
            './/*[contains(concat( " ", @class, " " ), concat( " ", "ct-product--description", " " ))]//a'
        )

        for (name, addr, contact, website) in zip(company_names, company_addrs,
                                                  company_contacts,
                                                  company_websites):
            items = ItemLoader(item=YellowPageCrawlerItem())

            # Activity denomination
            activity_name = activity.extract_first()
            items.add_value('activity', activity_name)

            # Name of the entity
            company_name = name.xpath(
                'normalize-space(./text())').extract_first()
            items.add_value('name', company_name)
            # Address
            address = addr.xpath('./text()').getall()
            items.add_value('address', address)
            # Contact = Mail + Phone
            contact = contact.css('::text').getall()
            # Phone
            items.add_value('phone', contact)
            # Mail
            items.add_value('mail', contact)
            # Website
            company_website = website.css('::text').getall()
            items.add_value('website', company_website)

            yield items.load_item()

        next_page = response.css('a[aria-label=Next]::attr(href)').get()

        if next_page:
            yield SplashRequest(
                url=urljoin(self.base_url, next_page),
                callback=self.parse_activity,
                endpoint='execute',
                args={
                    'lua_source': script,
                    'timeout': 10,
                    'wait': 10
                },
            )
Example #36
    def parse_item(self, response):
        print('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>')
        l = ItemLoader(item=PlainItem(), response=response)
        l.add_value('url', response.url)
        try:
            l.add_xpath('name', '/html/body/div[4]/div/div[1]/h1/text()')
        except:
            l.add_value('name', '')

        try:
            l.add_xpath('address', '/html/body/div[4]/div/div[1]/div/text()')
        except:
            l.add_value('address', '')

        try:
            l.add_xpath('build_year', '/html/body/div[6]/div[2]/div[2]/div[1]/span[2]/text()')
        except:
            l.add_value('build_year', '')

        try:
            l.add_xpath('buildings', '/html/body/div[6]/div[2]/div[2]/div[6]/span[2]/text()')
        except:
            l.add_value('buildings', '')

        try:
            l.add_xpath('familys', '/html/body/div[6]/div[2]/div[2]/div[7]/span[2]/text()')
        except:
            l.add_value('familys', '')

        try:
            l.add_xpath('area', '/html/body/div[5]/div[1]/a[3]/text()')
        except:
            l.add_value('area', '')
            
        try:
            l.add_value('subway', response.meta['subway'])
        except:
            l.add_value('subway', '')

        try:
            l.add_xpath('price', '/html/body/div[6]/div[2]/div[1]/div/span[1]/text()')
        except:
            l.add_value('price', '')

        try:
            l.add_xpath('estate', '/html/body/div[6]/div[2]/div[2]/div[4]/span[2]/text()')
        except:
            l.add_value('estate', '')

        # try:
        #     details = response.xpath('//div[@class="p-parameter"]/ul[2]/*/text()').extract()
        #     for i in range(len(details)):
        #         l.add_value('item{}'.format(i), details[i])
        # except:
        #     for i in range(9):
        #         l.add_value('item{}'.format(i), '')
        yield l.load_item()
    def parse_item(self, response):
        loader = ItemLoader(item=HistoryItem(), response=response)
        loader.add_value('type', response.meta["type"])
        loader.add_value('yiji', response.meta["yiji"])
        loader.add_value('erji', response.meta["erji"])

        parser = etree.HTML(response.text)
        root = parser.xpath("//div[@class='answer_detail']/dl/dt")[0]

        if response.meta["type"] == "单选题" or response.meta["type"] == "多选题":

            # Organize the answer choices
            choose = []
            choose_raw = root.xpath(".//td")
            for item in choose_raw:
                choose.append(
                    util.deal_jam(
                        util.from_choose_item_get_content(
                            etree.tostring(
                                item, encoding='utf-8',
                                pretty_print=True).decode('utf-8'))))
                item.getparent().remove(item)

            # Remove the choice table nodes from root
            rm_tb_list = root.xpath(".//table")
            for rm_item in rm_tb_list:
                rm_item.getparent().remove(rm_item)
                pass

            # load
            answer_raw = response.xpath(
                "//div[@class='answer_detail']/dl/dd/p[1]/i/text()"
            ).extract_first()
            answer_list = [s_item for s_item in answer_raw]
            answer_index_list = list()

            for answer in answer_list:
                if answer == "A" or answer == "a":
                    answer_index_list.append(0)
                elif answer == "B" or answer == "b":
                    answer_index_list.append(1)
                elif answer == "C" or answer == "c":
                    answer_index_list.append(2)
                elif answer == "D" or answer == "d":
                    answer_index_list.append(3)

            loader.add_value('answer_index', answer_index_list)
            loader.add_value(
                'content',
                util.from_content_get_real_content(
                    etree.tostring(root,
                                   encoding='utf-8',
                                   pretty_print=True,
                                   method='html').decode('utf-8')))
            loader.add_value('choose', choose)
            pass
        else:
            loader.add_value(
                'content',
                util.from_content_get_real_content(
                    etree.tostring(root,
                                   encoding='utf-8',
                                   pretty_print=True,
                                   method='html').decode('utf-8')))
            loader.add_value('choose', None)
            pass

        loader.add_value(
            'answer',
            util.replace_i(
                response.xpath("//div[@class='answer_detail']/dl/dd/p[1]/i").
                extract_first()))

        loader.add_value(
            'analysis',
            util.get_full_analysis(
                util.replace_i(
                    response.xpath("//div[@class='answer_detail']/dl/dd/p[2]/i"
                                   ).extract_first())))
        yield loader.load_item()
        pass
    def parse(self, response):
        agent = user_agent_rotator.get_random_user_agent()
        options.add_argument(f"user-agent={agent}")
        self.driver = webdriver.Chrome(str(Path(Path.cwd(),
                                                "chromedriver.exe")),
                                       chrome_options=options)
        # self.driver = webdriver.Firefox(executable_path=str(Path(Path.cwd(), "geckodriver.exe")))
        self.driver.set_window_size(randrange(1100, 1200), randrange(800, 900))
        self.driver.get(
            "https://www.kyero.com/en/tenerife-apartments-for-sale-0l55570g1?max_price=150000&min_beds=2&min_property_size=40&sort=popularity_desc/"
        )
        sleep(2)
        body = self.driver.find_element_by_css_selector('body')
        body.send_keys(Keys.PAGE_DOWN)
        sleep(1)
        body.send_keys(Keys.PAGE_UP)
        sleep(1)
        body.send_keys(Keys.PAGE_DOWN)
        body.send_keys(Keys.HOME)

        sel = Selector(text=self.driver.page_source)

        pages = sel.xpath(
            './/span[@class="search-results__count"]/text()').extract()[0]
        pages = pages.split(" ")[0]
        pages = pages.replace(",", "")
        pages = int(pages) / 20
        pages_count = int(pages) + 1
        sleep(1)
        self.driver.quit()

        for page in range(1):
            agent = user_agent_rotator.get_random_user_agent()
            options.add_argument(f"user-agent={agent}")
            self.driver = webdriver.Chrome(str(
                Path(Path.cwd(), "chromedriver.exe")),
                                           chrome_options=options)
            self.driver.set_window_size(randrange(1100, 1200),
                                        randrange(800, 900))
            self.driver.get(
                f"https://www.kyero.com/en/tenerife-apartments-for-sale-0l55570g1?max_price=150000&min_beds=2&min_property_size=40&page={page}&sort=popularity_desc"
            )
            sleep(1)
            body = self.driver.find_element_by_css_selector('body')
            sleep(1)
            body.send_keys(Keys.END)
            sleep(1)
            body.send_keys(Keys.HOME)

            try:
                picture = self.driver.find_elements_by_css_selector('figure')[
                    randrange(1, 5)]
                hov = ActionChains(self.driver).move_to_element(picture)
                hov.perform()
            except:
                pass

            sel = Selector(text=self.driver.page_source)
            adverts = sel.xpath('//article[contains(@class, "bg-white")]')

            for advert in adverts:
                try:
                    l = ItemLoader(item=IslandScraperItem(), selector=advert)
                    title = advert.xpath(
                        './/a[contains(@class, "inline-block hover-underline")]/text()'
                    ).extract_first()
                    link_string = advert.xpath(
                        './/a[contains(@class, "inline-block hover-underline")]/@href'
                    ).extract_first()
                    link = "https://www.kyero.com" + link_string
                    locality = title.split(" in ")[1]
                    details = advert.xpath(
                        './/ul[contains(@class, "flex")]/li/span/text()')
                    price_string = advert.xpath(
                        './/span[contains(@class, "p-5")]/text()'
                    ).extract_first()[1:]
                    if price_string:
                        price = price_string.split(" ")[1]
                        price = price[1:]
                        price = price.replace(",", "")
                    beds = advert.xpath(
                        './/ul[contains(@class, "flex")]/li/span/text()'
                    ).extract_first()
                    size_string = advert.xpath(
                        './/ul[@class="flex"]/li/span/text()')[-1].extract()
                    size = size_string.split(" ")[0]
                    date = datetime.today().strftime('%Y-%m-%d')

                except:
                    pass

                l.add_value('title', title)
                l.add_value('island', "Tenerife")
                l.add_value('locality', locality)
                l.add_value('price', price)
                l.add_value('beds', beds)
                l.add_value('size', size)
                l.add_value('link', link)
                l.add_value('date', date)
                l.add_value('ad_type', "sale")
                yield l.load_item()

            sleep(5)
            self.driver.quit()
    def parse(self, response):

        self.driver = webdriver.Chrome(
            'D:/PYTHON/WebScraping/chromedriver',
            chrome_options=options)  ## path to chromedriver on disk
        self.driver.get('https://linkedin.com/')

        ## Login handling

        username = self.driver.find_element_by_class_name('login-email')
        username.send_keys('XXXX')
        sleep(0.5)

        password = self.driver.find_element_by_id('login-password')
        password.send_keys('XXXXX')
        sleep(0.5)

        sign_in_button = self.driver.find_element_by_xpath(
            '//*[@type="submit"]')
        sign_in_button.click()
        sleep(2)

        for element in link_urls:
            l = ItemLoader(item=FaangItem(), selector=element)
            self.driver.get(element)

            ## Window scroller to discover button

            self.driver.execute_script("window.scrollTo(0, 1600);")
            try:
                show_more_button = self.driver.find_element_by_xpath(
                    '//*[@class="pv-profile-section__card-action-bar pv-skills-section__additional-skills artdeco-container-card-action-bar"]'
                )
            except:
                sleep(1)
                try:
                    self.driver.execute_script("window.scrollTo(0, 2100);")
                    show_more_button = self.driver.find_element_by_xpath(
                        '//*[@class="pv-profile-section__card-action-bar pv-skills-section__additional-skills artdeco-container-card-action-bar"]'
                    )
                except:
                    sleep(1)
                    try:
                        self.driver.execute_script("window.scrollTo(0, 2600);")
                        show_more_button = self.driver.find_element_by_xpath(
                            '//*[@class="pv-profile-section__card-action-bar pv-skills-section__additional-skills artdeco-container-card-action-bar"]'
                        )
                    except:
                        sleep(1)
                        try:
                            self.driver.execute_script(
                                "window.scrollTo(0, 3600);")
                            show_more_button = self.driver.find_element_by_xpath(
                                '//*[@class="pv-profile-section__card-action-bar pv-skills-section__additional-skills artdeco-container-card-action-bar"]'
                            )
                        except:
                            pass

            sleep(2)

            ## Skill & Country Extractor:

            try:
                actions = ActionChains(self.driver)
                actions.move_to_element(show_more_button).perform()
                show_more_button.click()
                sleep(3)
                sel = Selector(text=self.driver.page_source)
                country = sel.xpath(
                    "normalize-space(.//h3/text())").extract_first()
                top_skills = sel.xpath(
                    './/*[@class="Sans-17px-black-100%-semibold"]/text()'
                ).extract()[0:3]

                div = sel.xpath(
                    './/div[@class="pv-skill-category-list pv-profile-section__section-info mb6 ember-view"]'
                )
                skill_sets = []
                for group in div:
                    skill_group = group.xpath('./h3/text()').extract_first()
                    skills = group.xpath(
                        './/*[@class="pv-skill-category-entity__name "]/a/span/text()'
                    ).extract()
                    skill_set = {skill_group: skills}
                    skill_sets.append(skill_set)

                l.add_value('country', country)
                l.add_value('top_skills', top_skills)
                l.add_value('skill_sets', skill_sets)

            except:
                pass

            yield l.load_item()

        self.driver.quit()
Example #40
    def parse_item(self, response):
        """
        @url https://www.vinabook.com/lam-quen-thong-ke-hoc-qua-biem-hoa-p71348.html
        @returns items 1
        @scrapes name name_unidecode price description
        @scrapes url project spider server date
        """
        l = ItemLoader(item=BooksItem(), response=response)

        l.add_value('name', l.get_xpath('//*[@itemprop="title"]/text()')[-1])
        l.add_value(
            'name_unidecode',
            unidecode(l.get_xpath('//*[@itemprop="title"]/text()')[-1]))
        l.add_xpath('price',
                    '//*[contains(@id, "discounted_price")]/span/text()',
                    TakeFirst())
        l.add_xpath('author', '//*[@itemprop="author"]/text()')
        l.add_value(
            'description',
            filter(None, [
                re.sub('<[^<]+?>', '', i)
                for i in l.get_xpath('//*[@class="full-description"]/p')
            ]), Join('\n'))
        l.add_xpath('image_uri', '//*[@itemprop="image"]/@src')

        # Information fields
        l.add_value('url', response.url)
        l.add_value('project', self.settings.get('BOT_NAME'))
        l.add_value('spider', self.name)
        l.add_value('server', socket.gethostname())
        l.add_value('date', datetime.datetime.now())

        return l.load_item()
Example #41
    def parse_item(self, response):
        # print(response.data['html'])
        item = ItemLoader(item=Hb5217Item(), response=response)
        url = response.url
        item_list = item_code(url, self.web_name, 'id=(.*?)$')
        item.add_value('web_name', self.web_name)
        item.add_value('web_code', self.name)
        item.add_value('url', url)
        item.add_value('item_code', item_list.get('item_code'))

        item.add_css('title', 'h3 a::text')
        item.add_css('amount', '.listcon em::text')
        item.add_css('rate', '.listcon em::text')
        item.add_css('period', '.listcon em::text')
        # item.add_css('loan_using', '::text')
        item.add_css('loaner_info', 'dl')
        item.add_css('pay_type', 'li em::text')
        item.add_css('progress', '#progressPrecent::text')

        # invest records
        i_v = []
        invest_records_temp = '{{username={lst[3]}|rate=-1|postmoney={lst[6]}|money={lst[6]}|postdate={lst[8]}|status=全部通过}}'
        invest_records_format = ""
        tr = response.css('.box_view_4 tr').css('tr')
        try:
            for i in tr:
                lst = i.css('td *::text').extract()
                if lst:
                    i_v.append(lst)
            # print(i_v)
            for n in i_v:
                invest_records_format += invest_records_temp.format(lst=n)
            item.add_value('invest_records', invest_records_format)
            item.add_value('start', i_v[0][8])
            item.add_value('end', i_v[-1][8])
        except Exception:
            self.logger.info('invest records is error %s' % url)

        yield item.load_item()
Example #42
    def parse_interview(self, response):
        # # date
        broadcast_date = response.xpath(
            '//div[@class="date left"]//strong/text()').get()
        locale.setlocale(locale.LC_TIME, 'ru_RU.UTF-8')
        broadcast_date = datetime.datetime.strptime(broadcast_date,
                                                    u'%d %B %Y').date()

        # guest name
        guest_name = response.xpath(
            '//div[contains(@class, "author")]//*[@class="name"]/text()').get(
            )
        # # guest title
        guest_title = response.xpath(
            '//div[contains(@class, "author")]//*[@class="post"]/text()').get(
            )
        # # host name
        host_name = response.xpath(
            '//div[contains(@class, "lead")]//a//text()').get()

        interview_exists = session_test.query(exists().where(
            and_(InterviewParagraph.date == broadcast_date,
                 InterviewParagraph.guest_name == guest_name))).scalar()
        if not interview_exists:
            text = response.xpath('//div[@class="mmplayer"]//p').getall()
            whole_interview = []
            current_text = ""
            current_speaker = ""
            for index, paragraph in enumerate(text):
                # chunk_name = paragraph.xpath('name()')

                chunk = clean_chunk(paragraph)
                if len(chunk) > 1:
                    current_speaker = chunk[0]
                    current_text = chunk[-1]
                elif len(chunk) == 1:
                    current_text += " "
                    current_text += chunk[0]

                if (index + 1) < len(text):
                    next_chunk = clean_chunk(text[index + 1])
                    if len(next_chunk) != 1 and len(current_text) > 0:
                        if len(current_speaker) > 0:
                            whole_interview.append(
                                [index, current_speaker, current_text])
                        else:
                            whole_interview[-1][2] = whole_interview[-1][
                                2] + " " + current_text
                        current_text = ""
                        current_speaker = ""
                    else:
                        pass
                else:
                    if len(current_text) > 0:
                        whole_interview.append(
                            [index, current_speaker, current_text])
                    # current_text = ""
                    # current_speaker = ""
            # print(whole_interview)
            # for i in whole_interview:
            #     # print(i)
            #     pass

            entry = {
                "date": broadcast_date,
                "guest_name": guest_name,
                "guest_title": guest_title,
                "host_name": host_name,
                "interview": whole_interview,
            }

            if len(entry['interview']) > 10:

                print("=======================================")
                print(entry['date'])
                print(entry['guest_name'])
                print(len(entry['interview']))

                for i in entry['interview']:
                    interview_item = ItemLoader(item=InterviewItem(),
                                                response=response)

                    interview_item.add_value('date', entry['date'])

                    interview_item.add_value('guest_name', entry['guest_name'])
                    interview_item.add_value('guest_title',
                                             entry['guest_title'])
                    interview_item.add_value('host_name', entry['host_name'])
                    interview_item.add_value('index', int(i[0]))
                    interview_item.add_value('speaker', str(i[1]))
                    interview_item.add_value(
                        'text',
                        str(i[2]).encode('utf-8').decode('utf-8'))
                    # interview_item.add_value('text', str(i[2]).decode('utf-8', 'ignore'))
                    interview_item.add_value('url', response.url)

                    item = interview_item.load_item()
                    yield (item)
Example #43
    def parse(self, response):

        #initialize collector item which stores the website's content and meta data
        loader = ItemLoader(item=Collector())
        loader.add_value("dl_slot", response.request.meta.get('download_slot'))
        loader.add_value("redirect", self.checkRedirectDomain(response))
        loader.add_value("start_page", response.url)
        loader.add_value("start_domain", self.subdomainGetter(response))
        loader.add_value("scraped_urls", [response.urljoin(response.url)])
        loader.add_value("scrape_counter", 1)
        loader.add_value("scraped_text", [self.extractText(response)])
        loader.add_value("error", "None")
        loader.add_value("ID", response.request.meta["ID"])

        #initialize the fingerprints set which stores all fingerprints of visited websites
        fingerprints = set()
        #add the fingerprints of the start_page
        fingerprints.add(request_fingerprint(response.request))

        #if there was an initial redirect, the new domain is added to the allowed domains
        domain = self.subdomainGetter(response)
        if domain not in self.allowed_domains:
            self.allowed_domains.append(domain)
            self.refreshAllowedDomains()

        #extract all urls from the page...
        urls = response.xpath("//a/@href").extract() + response.xpath(
            "//frame/@src").extract() + response.xpath(
                "//frameset/@src").extract()
        #...and safe them to a urlstack
        urlstack = [response.urljoin(url) for url in urls]

        #attach the urlstack, the loader, and the fingerprints to the response...
        response.meta["urlstack"] = urlstack
        response.meta["loader"] = loader
        response.meta["fingerprints"] = fingerprints
        #...and send it over to the processURLstack function
        return self.processURLstack(response)
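This fragment assumes imports roughly like the following (a sketch; the Collector item and helpers such as extractText and processURLstack are defined elsewhere in the project). Note that request_fingerprint lives in scrapy.utils.request, although newer Scrapy releases deprecate it in favour of request fingerprinter classes:

from scrapy.loader import ItemLoader
from scrapy.utils.request import request_fingerprint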
Beispiel #44
0
 def errorback(self, failure):
     loader = ItemLoader(item=Collector())
     if failure.check(HttpError):
         response = failure.value.response
         loader.add_value("dl_slot",
                          response.request.meta.get('download_slot'))
         loader.add_value("start_page", "")
         loader.add_value("scraped_urls", "")
         loader.add_value("redirect", [None])
         loader.add_value("scraped_text", "")
         loader.add_value("error", response.status)
         loader.add_value("ID", response.request.meta["ID"])
         yield loader.load_item()
     elif failure.check(DNSLookupError):
         request = failure.request
         loader.add_value("dl_slot", request.meta.get('download_slot'))
         loader.add_value("start_page", "")
         loader.add_value("scraped_urls", "")
         loader.add_value("redirect", [None])
         loader.add_value("scraped_text", "")
         loader.add_value("error", "DNS")
         loader.add_value("ID", request.meta["ID"])
         yield loader.load_item()
     elif failure.check(TimeoutError, TCPTimedOutError):
         request = failure.request
         loader.add_value("dl_slot", request.meta.get('download_slot'))
         loader.add_value("start_page", "")
         loader.add_value("scraped_urls", "")
         loader.add_value("redirect", [None])
         loader.add_value("scraped_text", "")
         loader.add_value("error", "Timeout")
         loader.add_value("ID", request.meta["ID"])
         yield loader.load_item()
     else:
         request = failure.request
         loader.add_value("dl_slot", request.meta.get('download_slot'))
         loader.add_value("start_page", "")
         loader.add_value("scraped_urls", "")
         loader.add_value("redirect", [None])
         loader.add_value("scraped_text", "")
         loader.add_value("error", "other")
         loader.add_value("ID", request.meta["ID"])
         yield loader.load_item()
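An errback such as the one above only runs if it is attached to the outgoing requests, and the failure checks rely on Scrapy's and Twisted's exception classes. A minimal sketch of the wiring (the URLs and the ID value are placeholders, not taken from the original spider):

from scrapy import Request
from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import DNSLookupError, TCPTimedOutError, TimeoutError

def start_requests(self):
    # Method of the spider class above; attaches the errback to every request.
    for some_id, url in enumerate(self.start_urls):
        yield Request(url,
                      callback=self.parse,
                      errback=self.errorback,
                      meta={"ID": some_id})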
Beispiel #45
0
    def parse_race_denma(self, response):
        """ Parse denma page.

        @url https://keiba.yahoo.co.jp/race/denma/1906050201/
        @returns items 1
        @returns requests 1
        @race_denma
        """
        logger.info(f"#parse_race_denma: start: url={response.url}")

        # Parse race info
        logger.debug("#parse_race_denma: parse race info")

        loader = ItemLoader(item=RaceInfoItem(), response=response)
        race_id = response.url.split("/")[-2]
        loader.add_value("race_id", race_id)
        loader.add_xpath("race_round", "//td[@id='raceNo']/text()")
        loader.add_xpath("start_date", "//p[@id='raceTitDay']/text()[1]")
        loader.add_xpath("start_time", "//p[@id='raceTitDay']/text()[3]")
        loader.add_xpath("place_name", "//p[@id='raceTitDay']/text()[2]")
        loader.add_xpath("race_name", "//div[@id='raceTitName']/h1/text()")
        loader.add_xpath("course_type_length",
                         "//p[@id='raceTitMeta']/text()[1]")
        loader.add_xpath("weather", "//p[@id='raceTitMeta']/img[1]/@alt")
        loader.add_xpath("course_condition",
                         "//p[@id='raceTitMeta']/img[2]/@alt")
        loader.add_xpath("race_condition_1",
                         "//p[@id='raceTitMeta']/text()[6]")
        loader.add_xpath("race_condition_2",
                         "//p[@id='raceTitMeta']/text()[7]")
        loader.add_xpath("added_money", "//p[@id='raceTitMeta']/text()[8]")
        i = loader.load_item()

        logger.debug(f"#parse_race_denma: race info={i}")
        yield i

        # Parse race denma
        logger.debug("#parse_race_denma: parse race denma")

        for tr in response.xpath(
                "//table[contains(@class, 'denmaLs')]/tr[position()>1]"):
            loader = ItemLoader(item=RaceDenmaItem(), selector=tr)
            loader.add_value("race_id", race_id)
            loader.add_xpath("bracket_number", "td[1]/span/text()")
            loader.add_xpath("horse_number", "td[2]/strong/text()")
            loader.add_xpath("horse_id", "td[3]/a/@href")
            loader.add_xpath("trainer_id", "td[3]/span/a/@href")
            loader.add_xpath("horse_weight_and_diff", "string(td[4])")
            loader.add_xpath("jockey_id", "td[5]/a/@href")
            loader.add_xpath("jockey_weight", "td[5]/text()")
            loader.add_xpath("prize_total_money", "td[7]/text()[3]")
            i = loader.load_item()

            logger.debug(f"#parse_race_denma: race denma={i}")
            yield i

        # Parse link
        logger.debug("#parse_race_denma: parse link")

        for a in response.xpath("//a"):
            href = a.xpath("@href").get()

            if href.startswith("/directory/horse/") \
                    or href.startswith("/directory/trainer/") \
                    or href.startswith("/directory/jocky/"):
                yield self._follow_delegate(response, href)

        yield self._follow_delegate(response, f"/odds/tfw/{race_id}/")
        yield self._follow_delegate(response, f"/race/result/{race_id}/")
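_follow_delegate is not shown in this fragment; judging from how it is called, it presumably resolves the relative href and returns a Request bound to the callback that can parse that page. A rough sketch under that assumption (not the author's implementation):

from scrapy import Request

def _follow_delegate(self, response, path):
    # Method of the spider class above; maps a site path to a parse callback.
    url = response.urljoin(path)
    if path.startswith("/odds/tfw/"):
        return Request(url, callback=self.parse_odds_win_place)
    if path.startswith("/directory/horse/"):
        return Request(url, callback=self.parse_horse)
    # Other sections (trainer, jockey, race results) would dispatch to their
    # own callbacks in the same way.
    return None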
Beispiel #46
0
    def parse_item(self, response):
        if response.status == 200:
            soup = bs(response.text, "lxml")
            print('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>')
            # Extract the data from the product detail page
            l = ItemLoader(item=JingdongItem(), response=response)
            # Use add_xpath to pass each Item field name together with its XPath expression
            l.add_value('cate_key', response.meta['cate_key'])
            l.add_value('goods_url', response.url)
            try:
                l.add_xpath('platform', '//div[@id="logo-2014"]/a/text()')
            except:
                l.add_value('platform', '')

            try:
                l.add_xpath('shop_name',
                            '//div[@id="popbox"]/div/div[1]/h3/a/text()')
            except:
                l.add_value('shop_name', '')

            try:
                l.add_xpath('goods_name', '//div[@class="sku-name"]/text()')
            except:
                l.add_value('goods_name', '')

            try:
                l.add_xpath(
                    'now_price',
                    '//div[@class="summary-price-wrap"]/div[1]/div[2]/span/span[2]/text()'
                )
            except:
                l.add_value('now_price', '')

            try:
                l.add_xpath('origin_price',
                            '//*[@id="page_origin_price"]/text()')
            except:
                l.add_value('origin_price', '')

            # try:
            #     l.add_xpath('mon_sales', '//p[@id="price"]')
            # except:
            #     l.add_value('mon_sales', '')

            try:
                l.add_xpath('total_views',
                            '//div[@id="comment-count"]/a/text()')
            except:
                l.add_value('total_views', '')

            # try:
            #     l.add_xpath('stock', '//p[@id="price"]')
            # except:
            #     l.add_value('stock', '')

            try:
                l.add_xpath('brand', '//ul[@id="parameter-brand"]/li/a/text()')
            except:
                l.add_value('brand', '')

            try:
                details = soup.select('.p-parameter > ul[2] > li')
                for i in range(len(details)):
                    l.add_value('item{}'.format(i), details[i].string)
            except:
                for i in range(9):
                    l.add_value('item{}'.format(i), '')
            yield l.load_item()
        elif response.status == 202:
            cate_key = response.meta['cate_key']
            yield Request(response.url,
                          callback=self.parse_item,
                          dont_filter=True,
                          meta={
                              'cate_key': cate_key,
                          })
Beispiel #47
0
    def parse_item(self, response):
        """
        @url http://splash:8050/render.html?&url=https://vlogtruyen.net/bokutachi-wa-hanshoku-wo-yameta.html&wait=1
        @scrapes name unicode_name source image_src total_chap description chapters web_source full
        """

        manga = ItemLoader(item=MangaCrawlerItem(), response=response)
        manga.add_xpath("unicode_name",
                        '//h1[@class="title-commic-detail"]/text()')
        manga.add_value("name",
                        unidecode(manga.get_output_value("unicode_name")[0]))
        manga.add_value("source", response.url)
        manga.add_xpath("image_src", '//meta[@property="og:image"]/@content')
        manga.add_xpath("description",
                        '//*[@class="desc-commic-detail"]/text()', Join("\n"))
        chapter_xpath = '//*[@class="ul-list-chaper-detail-commic"]/li/a'
        chapter_source = manga.get_xpath(chapter_xpath + "/@href")
        chapter_name = manga.get_xpath(chapter_xpath + "/h3/text()")
        chapters = zip(chapter_name, chapter_source)

        if "Đã hoàn thành" in manga.get_xpath(
                '//*[@class="manga-status"]/p/text()'):
            manga.add_value("full", True)
        else:
            manga.add_value("full", False)

        manga.add_value(
            "total_chap",
            manga.get_xpath(
                '//*[@class="ul-list-chaper-detail-commic"]/li[1]/a/h3/text()',
                MapCompose(lambda x: re.findall(r"(\d+(?:\.\d+)?)", x)),
                TakeFirst(),
            ),
        )
        manga.add_value("chapters", chapters)
        manga.add_value("web_source", "vlogtruyen")

        return manga.load_item()
    def parse(self, response):
        l = ItemLoader(item=YoutubeTrendingItem(), response=response)
        self.driver.get(response.url)
        self.driver.execute_script('window.scrollTo(1, 500);')
        sleep(5)
        self.driver.execute_script('window.scrollTo(1, 3000);')

        sel = Selector(text=self.driver.page_source)

        title = self.get_title(sel)
        url = self.get_url(response)
        views = self.get_views(sel)
        duration = self.get_duration(sel)
        likes = self.get_likes(sel)
        dislikes = self.get_dislikes(sel)
        channelName = self.get_channel_name(sel)
        subscribers = self.get_subscribers(sel)
        description = self.get_description(sel)
        keywords = self.get_keywords(sel)
        date_published = self.get_date_published(sel)
        date_scraped = self.get_date_scraped()
        tags = self.get_tags(sel)
        #n_comments = self.get_number_of_comments(sel)
        image_urls = self.get_image_url(response)
        comments = self.get_comments()

        l.add_value('title', title)
        l.add_value('url', url)
        l.add_value('views', views)
        l.add_value('duration', duration)
        l.add_value('likes', likes)
        l.add_value('dislikes', dislikes)
        l.add_value('channelName', channelName)
        l.add_value('subscribers', subscribers)
        l.add_value('description', description)
        l.add_value('keywords', keywords)
        l.add_value('date_published', date_published)
        l.add_value('date_scraped', date_scraped)
        l.add_value('tags', tags)
        #l.add_value('n_comments', n_comments)
        l.add_value('comments', comments)
        l.add_value('image_urls', image_urls)

        yield l.load_item()
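The parse method above mixes Scrapy with Selenium through self.driver, which must have been created elsewhere, typically in the spider's __init__. A sketch of that setup (an assumption; the original driver configuration is not part of this fragment):

from selenium import webdriver

def __init__(self, *args, **kwargs):
    # Method of the spider class above; a headless Chrome driver is one common choice.
    super().__init__(*args, **kwargs)
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    self.driver = webdriver.Chrome(options=options)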
Beispiel #49
0
 def parse_item(self, response):
     with open('item1.html', 'w', encoding='utf-8-sig') as f:
         f.write(str(response.data['html']))
     print('startparseitem---------------------------------')
     print(response.real_url)
     print(response.css('.p-parameter-list *::text').extract())
     l = ItemLoader(item=JiadianItem(), response=response)
     l.add_css('name', '.sku-name::text', TakeFirst())
     l.add_css('id', '.follow::attr(data-id)', TakeFirst())
     l.add_css('price', '.price::text', TakeFirst())
     l.add_css('brand', '#parameter-brand a::text', TakeFirst())
     l.add_css('parameter', '.p-parameter-list *::text')
     l.add_value('parameter', '无')
     l.add_css('summary_service', '#summary-service span::text',
               TakeFirst())
     l.add_css('summary_service', '#summary-service a::text', TakeFirst())
     l.add_css('add_service', '#summary-support span::text')
     l.add_value('add_service', '无')
     l.add_css('sales_promotion', '.p-promotions em.hl_red::text')
     l.add_css('sales_promotion', '.prom-item em.hl_red::text')
     l.add_value('sales_promotion', '无')
     l.add_css('store', '.J-hove-wrap a::text', TakeFirst())
     l.add_css('store_link', '.J-hove-wrap a::attr(href)', TakeFirst())
     l.add_css('store', 'strong a[clstag*="allpingjia"]::text', TakeFirst())
     l.add_css('store_link', 'strong a[clstag*="allpingjia"]::attr(href)',
               TakeFirst())
     l.add_css('commentsnum',
               '.J-comments-list li[clstag*="allpingjia"]::attr(data-num)',
               TakeFirst())
     l.add_css('goodcomments', '.percent-con::text', TakeFirst())
     l.add_css('goodcommentnum',
               '.J-comments-list li[clstag*="haoping"]::attr(data-num)',
               TakeFirst())
     l.add_css('comment_tags', '.tag-list span::text')
     l.add_css('mediumcommentnum',
               '.J-comments-list li[clstag*="zhongping"]::attr(data-num)',
               TakeFirst())
     l.add_css('badcommentnum',
               '.J-comments-list li[clstag*="chaping"]::attr(data-num)',
               TakeFirst())
     l.add_value('store', '无')
     l.add_value('store_link', '无')
     l.add_value('commentsnum', '0')
     l.add_value('goodcomments', '0')
     l.add_value('goodcommentnum', '0')
     l.add_value('comment_tags', '无')
     l.add_value('mediumcommentnum', '0')
     l.add_value('badcommentnum', '0')
     l.add_value('summary_service', '无')
     l.add_value('url', response.url)
     l.add_value('price', response.meta['price'])
     l.add_value('brand', '范思哲(VERSACE)')
     return l.load_item()
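The repeated add_css/add_value pairs above ('store' followed by the literal '无', the zero defaults for the comment counters, and so on) only behave as fallbacks if each field keeps the first non-empty value it collected. A minimal item declaration that makes this pattern work is sketched below (an assumption: the real JiadianItem is not shown here):

import scrapy
from itemloaders.processors import TakeFirst

class JiadianItem(scrapy.Item):
    # With TakeFirst as output processor, the value extracted by add_css wins;
    # the later add_value literal is only used when the selector matched nothing.
    store = scrapy.Field(output_processor=TakeFirst())
    store_link = scrapy.Field(output_processor=TakeFirst())
    sales_promotion = scrapy.Field(output_processor=TakeFirst())
    commentsnum = scrapy.Field(output_processor=TakeFirst())
    # ... the remaining fields would be declared the same way ...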
Beispiel #50
0
 def save_to_csv(self, response, **meta):
     # self.state['items_count'] = self.state.get('items_count', 0) + 1
     il = ItemLoader(item=NmSosSpiderItem(), response=response)
     il.default_input_processor = MapCompose(lambda v: v.strip(),
                                             remove_tags,
                                             replace_escape_chars)
     #il.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
     il.add_value('company_name', meta['company_name'])
     il.add_value('entity_id', meta['business_id'])
     il.add_value('dba_name', meta['dba_name'])
     il.add_value('company_subtype', meta['company_subtype'])
     il.add_value('non_profit_indicator', meta['non_profit_indicator'])
     il.add_value('location_address_string',
                  meta['location_address_string'])
     il.add_value('status', meta['status'])
     il.add_value('creation_date', meta['creation_date'])
     il.add_value('domestic_state', meta['domestic_state'])
     il.add_value('period_of_duration', meta['peroid_of_duration'])
     il.add_value('business_purpose', meta['business_purpose'])
     il.add_value('mixed_subtype', meta['officer_title'])
     il.add_value('mixed_name', meta['officer_name'])
     il.add_value('person_address_string', meta['officer_address'])
     il.add_value('permit_type', 'business_license')
     il.add_value('sourceName', 'NM_SOS')
     il.add_value(
         'url',
         'https://portal.sos.state.nm.us/BFS/online/CorporationBusinessSearch'
     )
     return il
    def parse_product(self, response):

        # Stops if the COUNTER reaches the maximum set value
        if self.COUNTER >= self.COUNT_MAX:
            raise scrapy.exceptions.CloseSpider(
                reason='COUNT_MAX value reached - {} items'.format(
                    self.COUNT_MAX))

        # Check if the product is available
        no_available_message = response.xpath('//h2[contains(text(), "Darn")]')
        if no_available_message:
            return []

        # Create the ItemLoader object that stores each product information
        l = ItemLoader(item=ProductItem(), response=response)

        # Get the product ID (ex: 666125766)
        product_id = response.url.split('/')[4]
        l.add_value('product_id', product_id)

        # Get the product title
        #l.add_xpath('title', '//meta[@property="og:title"]/@content')
        l.add_xpath(
            'title',
            '//div[@data-component="listing-page-title-component"]/h1/text()')
        #l.add_xpath('title', "//h1[@data-listing-id='{}']".format(response.url.split('/')[4]))

        # Get the product price
        l.add_xpath('price', '//*[contains(@data-buy-box-region, "price")]//p')

        # Get the product URL (ex: www.etsy.com/listing/666125766)
        l.add_value('url', '/'.join(response.url.split('/')[2:5]))

        # Get the product description
        l.add_xpath('description',
                    '//div[@data-id="description-text"]/div/p/text()')

        # Get each product option and save in a list
        product_options = []
        product_options_list = response.xpath(
            '//*[contains(@id, "inventory-variation-select")]')
        for options in product_options_list:
            # Get list of options
            temp_list = options.xpath('.//text()').extract()
            # Remove '\n' strings
            temp_list = list(map(lambda s: s.strip(), temp_list))
            # Remove empty strings ('')
            temp_list = list(filter(lambda s: s != '', temp_list))

            # Filter the 'Quantity' option
            if temp_list[0] != '1':
                # Create the final string:
                # example: "Select a color: White, Black, Red, Silver"
                product_options.append(temp_list[0] + ': ' +
                                       ', '.join(temp_list[1:]))

        # Separate each option with a | (pipe) symbol
        l.add_value('product_options', '|'.join(product_options))

        # Get the product rating (ex: 4.8 )
        l.add_xpath('rating',
                    '//a[@href="#reviews"]//input[@name="rating"]/@value')

        # Get the number of votes (number of reviews)
        l.add_xpath('number_of_reviews',
                    '//button[@id="same-listing-reviews-tab"]/span/text()')

        # Count the number of product images
        images_sel = response.xpath(
            '//ul[@data-carousel-pagination-list=""]/li/img/@data-src-delay'
        ).extract()
        l.add_value('count_of_images', len(images_sel))
        l.add_value('images_urls', images_sel)

        # Get the product overview
        #l.add_xpath('overview', '//*[@class="listing-page-overview-component"]//li')

        # Get the number of people that add the product in favorites
        l.add_xpath(
            'favorited_by',
            '//*[@id="item-overview"]//*[contains(@href, "/favoriters")]/text()',
            re=r'(\d+)')
        l.add_xpath('favorited_by',
                    '//*[@class="listing-page-favorites-link"]/text()',
                    re=r'(\d+)')
        l.add_xpath('favorited_by',
                    '//a[contains(text(), " favorites")]/text()',
                    re=r'(\d+)')

        # Get the name of the Store and location
        l.add_xpath('store_name',
                    '//div[@id="listing-page-cart"]//span/text()')
        #l.add_xpath('store_location', '//*[@id="shop-info"]/div')
        #l.add_xpath('return_location', "//*[@class='js-estimated-delivery']/following-sibling::div")

        # Use the chosen method to get the reviews
        self.logger.info('Reviews scraping option: ' + str(self.reviews_opt))

        # Option 3 - All reviews
        if self.reviews_opt == 3:
            # Getting all Reviews
            store_name = response.xpath(
                '//span[@itemprop="title"]//text()').extract_first()
            # Build the reviews URL
            rev_url = "https://www.etsy.com/shop/{}/reviews?ref=l2-see-more-feedback".format(
                store_name)
            data = {'itemLoader': l, 'product_id': product_id}

            # Go to the all reviews page
            yield Request(rev_url, meta=data, callback=self.parse_reviews)

        # Option 2 - Ajax request
        elif self.reviews_opt == 2:
            # Creating the Ajax request
            # Getting the session cookie
            get_cookie = response.request.headers['Cookie'].split(
                b';')[0].split(b'=')
            cookies = {
                get_cookie[0].decode("utf-8"): get_cookie[1].decode("utf-8")
            }

            # Getting the x-csrf-token
            headers = {
                'x-csrf-token':
                response.xpath("//*[@name='_nnc']/@value").extract_first()
            }

            # Shop Id
            shop_id = response.xpath("//*[@property='og:image']/@content"
                                     ).extract_first().split('/')[3]

            formdata = {
                'stats_sample_rate': '',
                'specs[reviews][]': 'Listzilla_ApiSpecs_Reviews',
                'specs[reviews][1][listing_id]': product_id,
                'specs[reviews][1][shop_id]': shop_id,
                'specs[reviews][1][render_complete]': 'true'
            }

            data = {'itemLoader': l, 'product_id': product_id}
            ajax_url = "https://www.etsy.com/api/v3/ajax/bespoke/member/neu/specs/reviews"

            yield scrapy.FormRequest(ajax_url,
                                     headers=headers,
                                     cookies=cookies,
                                     meta=data,
                                     formdata=formdata,
                                     callback=self.parse_ajax_response)
        # Option 1
        else:
            # Dict that saves all the reviews data
            reviews_data = []
            reviews_counter = 1

            # Get the data from each review
            all_reviews = response.xpath(
                '//*[@class="listing-page__review col-group pl-xs-0 pr-xs-0"]')
            # Process each review
            for r in all_reviews:

                # Get the profile URL of the reviewer
                reviewer_profile = r.xpath(
                    ".//*[@class='display-block']/parent::*//@href"
                ).extract_first()
                if reviewer_profile:
                    # Build the full profile url
                    reviewer_profile = 'www.etsy.com' + reviewer_profile
                else:
                    # If the profile is inactive there is no profile url
                    continue

                review_date = r.xpath(
                    ".//*[@class='text-link-underline display-inline-block mr-xs-1']/parent::*//text()"
                ).extract()[2].strip()
                reviewer_rating = r.xpath(
                    './/input[@name="rating"]/@value').extract_first()
                review_content = " ".join(
                    r.xpath('.//div[@class="overflow-hidden"]//text()').
                    extract()).strip()

                # Build the review string
                rev_data = "Review number: {} \nProfile: {} \nRating: {} \nDate: {} \nContent: {}".format(
                    reviews_counter, reviewer_profile, reviewer_rating,
                    review_date, review_content)

                # Save into the list
                reviews_data.append(rev_data)
                reviews_counter += 1

            # Saves all reviews data
            l.add_value('reviews', "\n\n".join(reviews_data))

            # Increment the items counter
            self.COUNTER += 1
            print('\nProducts scraped: {}\n'.format(self.COUNTER))

            yield l.load_item()
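In options 2 and 3 above the partially filled ItemLoader travels inside request meta and a later callback finishes the item. The general shape of such a callback is sketched here (a generic illustration; the real parse_reviews and parse_ajax_response are not part of this fragment):

def parse_reviews(self, response):
    # Method of the spider class above; recovers the loader attached in parse_product.
    l = response.meta['itemLoader']
    reviews = response.xpath('//div[contains(@class, "review")]//text()').getall()
    l.add_value('reviews', "\n\n".join(r.strip() for r in reviews if r.strip()))
    self.COUNTER += 1
    yield l.load_item()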
Beispiel #52
0
    def parse_odds_win_place(self, response):
        """ Parse odds win place page.

        @url https://keiba.yahoo.co.jp/odds/tfw/1906050201/?ninki=0
        @returns items 1
        @returns requests 1
        @odds_win_place
        """
        logger.info(f"#parse_odds_win_place: start: url={response.url}")

        race_id = response.url.split("/")[-2]

        # Parse odds win place
        for tr in response.xpath("//table[@class='dataLs oddTkwLs']/tbody/tr"):
            if len(tr.xpath("th")) > 0:
                continue

            loader = ItemLoader(item=OddsWinPlaceItem(), selector=tr)
            loader.add_value("race_id", race_id)
            loader.add_xpath("horse_number", "td[2]/text()")
            loader.add_xpath("horse_id", "td[3]/a/@href")
            loader.add_xpath("odds_win", "td[4]/text()")
            loader.add_xpath("odds_place_min", "td[5]/text()")
            loader.add_xpath("odds_place_max", "td[7]/text()")

            i = loader.load_item()

            logger.debug(f"#parse_odds_win_place: odds_win_place={i}")
            yield i

        # Parse odds bracket quinella
        for tr in response.xpath("//table[@class='oddsLs']/tbody/tr"):
            th = tr.xpath("th")
            if "class" in th.attrib:
                bracket_number_1 = th.xpath("div/text()").get()
            else:
                loader = ItemLoader(item=OddsBracketQuinellaItem(),
                                    selector=tr)
                loader.add_value("race_id", race_id)
                loader.add_value("bracket_number_1", bracket_number_1)
                loader.add_value("bracket_number_2", th.xpath("text()").get())
                loader.add_xpath("odds", "td/text()")

                i = loader.load_item()

                logger.debug(
                    f"#parse_odds_win_place: odds_bracket_quinella={i}")
                yield i

        # Parse link
        logger.debug("#parse_odds_win_place: parse link")

        for a in response.xpath("//a"):
            href = a.xpath("@href").get()

            if not href:
                # href can sometimes be None
                continue

            if href.startswith("/odds/ut/") \
                    or href.startswith("/odds/ur/") \
                    or href.startswith("/odds/wide/") \
                    or href.startswith("/odds/st/") \
                    or href.startswith("/odds/sf/"):
                yield self._follow_delegate(response, href)
Beispiel #53
0
    def parse_contents(self, response):
        item_loader = ItemLoader(item=ReCrawlerItem(), response=response)
        item_loader.add_xpath('title', '//font[@class="headtitle"]/text()')
        item_loader.add_value('url', response.url)

        return item_loader.load_item()
Beispiel #54
0
 def get_news(self, response):
     try:
         data = response.xpath('//div[@id="textBox"]')
         content = data.xpath('string(.)').extract_first()
         item = response.meta['item']
         item['content'] = content[0:content.find(u'分享到:')]
         item['collection_name'] = self.name
         item['website'] = self.website
         yield item
     except Exception as e:
         self.logger.error('error url: %s error msg: %s' % (response.url, e))
         l = ItemLoader(item=SpiderItem(), response=response)
         l.add_value('title', '')
         l.add_value('date', '1970-01-01 00:00:00')
         l.add_value('source', '')
         l.add_value('content', '')
         l.add_value('url', response.url)
         l.add_value('collection_name', self.name)
         l.add_value('website', self.website)
         yield l.load_item()
    def parse_item(self, response):

        l = ItemLoader(item=RentslamItem(), response=response)

        #All data must be extracted using XPATH queries
        image_url = response.xpath(
            '//*[@class="carousel-inner"]//@src').extract_first()
        url = response.url
        price = response.xpath(
            './/div[contains(@class,"aanbod-info-price")]/text()'
        ).extract_first()
        bedrooms = response.xpath(
            './/p[contains(@class,"aanbod-ifo-rooms")]/text()').extract_first(
            )
        size = response.xpath(
            './/p[contains(@class,"aanbod-ifo-squarefeet")]/text()'
        ).extract_first()
        address = response.xpath(
            './/h1[contains(@class,"aanbod-ifo-street")]/text()'
        ).extract_first()
        text_list = response.xpath(
            './/div[contains(@class,"wpb_wrapper")]/text()').extract()
        text = (''.join(text_list)).strip()
        # Furnishing in Dutch Oplevering
        furnishing = response.xpath(
            './/p[contains(@class,"aanbod-ifo-furniture")]/text()'
        ).extract_first()

        #Full url (mandatory)
        l.add_value('ImageUrl', image_url)

        #Full url (mandatory)
        l.add_value('Url', url)

        #Price must not include currency symbol, dot or comma. Decimals must be filtered out. Example: € 1.348,77 --> 1348 (mandatory)
        l.add_value('Price', price, Join(''), re=r'\d+')

        #Number (if present). Bedrooms is "Slaapkamers" in Dutch
        l.add_value('Bedrooms', bedrooms, TakeFirst(), re=r'\d+')

        #Size must include only the number. Things like "m2" must be filtered out. Example: 90 m2 --> 90 (if present)
        l.add_value('Size', size, TakeFirst(), re=r'\d+')

        #The address must contain the street name (mandatory) and the house number (if it is present). It must not contain the city name or the postcode
        l.add_value('Address', address)

        #This is the description of the listing (if present)
        l.add_value('Text', text)

        #You can copy the email address from the website here (if present)
        l.add_value('ContactEmailAddress', '*****@*****.**')

        #You can copy the phone number from the website here (if present)
        l.add_value('ContactPhoneNumber', '+31 20 672 33 31')

        l.add_value('Furnishing',
                    furnishing.replace('Oplevering:', '').strip())

        l.add_value('City', 'Amsterdam')

        yield l.load_item()
Beispiel #56
0
    def parse_horse(self, response):
        """ Parse horse page.

        @url https://keiba.yahoo.co.jp/directory/horse/2017101602/
        @returns items 1 1
        @returns requests 0 0
        @horse
        """
        logger.info(f"#parse_horse: start: url={response.url}")

        horse_id = response.url.split("/")[-2]

        # Parse horse
        loader = ItemLoader(item=HorseItem(), response=response)
        loader.add_value("horse_id", horse_id)
        loader.add_xpath("gender", "string(//div[@id='dirTitName']/p)")
        loader.add_xpath("name", "//div[@id='dirTitName']/h1/text()")
        loader.add_xpath("birthday", "//div[@id='dirTitName']/ul/li[1]/text()")
        loader.add_xpath("coat_color",
                         "//div[@id='dirTitName']/ul/li[2]/text()")
        loader.add_xpath("trainer_id",
                         "//div[@id='dirTitName']/ul/li[3]/a/@href")
        loader.add_xpath("owner", "//div[@id='dirTitName']/ul/li[4]/text()")
        loader.add_xpath("breeder", "//div[@id='dirTitName']/ul/li[5]/text()")
        loader.add_xpath("breeding_farm",
                         "//div[@id='dirTitName']/ul/li[6]/text()")

        tdBloodM = response.xpath(
            "//table[@id='dirUmaBlood']/tr/td[@class='bloodM']/text()")
        loader.add_value("parent_horse_name_male_1", tdBloodM[0].get())
        loader.add_value("parent_horse_name_male_21", tdBloodM[1].get())
        loader.add_value("parent_horse_name_male_31", tdBloodM[2].get())
        loader.add_value("parent_horse_name_male_32", tdBloodM[3].get())
        loader.add_value("parent_horse_name_male_22", tdBloodM[4].get())
        loader.add_value("parent_horse_name_male_33", tdBloodM[5].get())
        loader.add_value("parent_horse_name_male_34", tdBloodM[6].get())

        tdBloodF = response.xpath(
            "//table[@id='dirUmaBlood']/tr/td[@class='bloodF']/text()")
        loader.add_value("parent_horse_name_female_31", tdBloodF[0].get())
        loader.add_value("parent_horse_name_female_21", tdBloodF[1].get())
        loader.add_value("parent_horse_name_female_32", tdBloodF[2].get())
        loader.add_value("parent_horse_name_female_1", tdBloodF[3].get())
        loader.add_value("parent_horse_name_female_33", tdBloodF[4].get())
        loader.add_value("parent_horse_name_female_22", tdBloodF[5].get())
        loader.add_value("parent_horse_name_female_34", tdBloodF[6].get())
        i = loader.load_item()

        logger.debug(f"#parse_horse: horse={i}")
        yield i
Beispiel #57
0
 def parse_item(self, response):
     logging.info(u"start crawl  --->  " + response.url)
     item = ItemLoader(item=NewsItem(), response=response)
     sel = Selector(response)
     content = sel.xpath('//div[@id="Cnt-Main-Article-QQ"]/p')
     article_time = content.xpath('//span[@class="pubTime"]/text()').extract()
     date_time = compare_time(article_time, u"%Y年%m月%d日%H:%M")
     if not date_time:
         return
     item.add_xpath('keywords', "//head/meta[@name='keywords']/@content")
     item.add_value('date_time', date_time)
     item.add_xpath('title', '//div[@class="hd"]/h1/text()')
     item.add_xpath('reading_number', '//em[@id="top_count"]/text()')
     item.add_xpath('author', '//span[@class="auth"]/text()')
     item.add_value('original_link', response.url)
     elements = sel.xpath('//div[@id="Cnt-Main-Article-QQ"]/p').extract()
     images, content = translate_content(elements)
     if images:
         item.add_value('image_url', hashlib.sha1(images[0]).hexdigest() + ".jpg")
     item.add_value('content', content)
     item.add_value('image_urls', images)
     item.add_value('source', u'腾讯科技')
     item.add_value('category', CATEGORY.TECHNOLOGY)
     logging.info(u"finished crawl  --->  " + response.url)
     yield item.load_item()
Beispiel #58
0
 def parse_item(self, response):
     loader = ItemLoader(item=SpiderItem(), response=response)
     date = '1970-01-01'
     content = ''
     try:
         title = response.xpath(
             r'//*[@class="infor_border"]/h1//text()').extract()
         date_raw = response.xpath(
             r'//*[@class="right_sc"]//text()').extract()
         if date_raw:
             date_raw = removern(date_raw)
             date = date_raw.strip().split(" ")[0]
         content = response.xpath(
             r'//*[@class="news_content"]/p//text()').extract()
         loader.add_value('date', date)
         loader.add_value('title', title)
         loader.add_value('content', content)
     except Exception as e:
         self.logger.error('error url: %s error msg: %s' %
                           (response.url, e))
         loader.add_value('date', date)
         loader.add_value('title', 'unknown')
         loader.add_value('content', '')
     finally:
         self.logger.info("crawled url: %s" % response.url)
         loader.add_value('url', response.url)
         loader.add_value('collection_name', self.name)
         loader.add_value("website", self.website)
         if content == '':
             self.logger.warning(' url: %s msg: %s' %
                                 (response.url, ' content is None'))
         yield loader.load_item()
Beispiel #59
0
    def parse_item(self, response):
        """ This function parses a property page.

        @url http://web:9312/properties/property_000000.html
        @returns items 1
        @scrapes title price description address image_urls
        @scrapes url project spider server date
        """

        # Create the loader using the response
        l = ItemLoader(item=PropertiesItem(), response=response)

        # Load fields using XPath expressions
        l.add_value('title', response.meta['title'],
                    MapCompose(str.strip))
        l.add_xpath('price',
                    './/*[@itemprop="price"][1]/text()',
                    MapCompose(lambda i: i.replace(',', ''), float),
                    re='[,.0-9]+')
        l.add_xpath('description', '//*[@itemprop="description"][1]/text()',
                    MapCompose(str.strip), Join())
        l.add_xpath('address',
                    '//*[@itemtype="http://schema.org/Place"][1]/text()',
                    MapCompose(str.strip))
        l.add_xpath('image_urls', '//*[@itemprop="image"][1]/@src',
                    MapCompose(lambda i: urlparse.urljoin(response.url, i)))

        # Housekeeping fields
        l.add_value('url', response.url)
        l.add_value('project', self.settings.get('BOT_NAME'))
        l.add_value('spider', self.name)
        l.add_value('server', socket.gethostname())
        l.add_value('date', datetime.datetime.now())

        return l.load_item()
Beispiel #60
0
 def parse_item(self, response):
     loader = ItemLoader(item=SpiderItem(), response=response)
     content = ''
     try:
         title = response.xpath(r'//*[@class="summi"]/h1//text()').extract()
         date = response.xpath(
             r'//*[@class="summi"]/ul/li[1]//text()').extract_first()
         if date is not None:
             date = date.split(" ")[0].strip()
         else:
             date = '1970-01-01'
         content = response.xpath(r'//*[@class="summs"]//text()').extract()
         loader.add_value('date', date)
         loader.add_value('title', title)
         loader.add_value('content', content)
     except Exception as e:
         self.logger.error('error url: %s error msg: %s' %
                           (response.url, e))
         loader.add_value('date', '1970-01-01')
         loader.add_value('title', 'unknown')
         loader.add_value('content', '')
     finally:
         self.logger.info("crawled url: %s" % response.url)
         loader.add_value('url', response.url)
         loader.add_value('collection_name', self.name)
         loader.add_value("website", self.website)
         if content == '':
             self.logger.warning(' url: %s msg: %s' %
                                 (response.url, ' content is None'))
         yield loader.load_item()