Code example #1
    def parse(self, response):
        jsonresponse = json.loads(response.text)

        for c in jsonresponse['list']:
            loader = ItemLoader(item=StockCubesItem())
            loader.default_input_processor = MapCompose(str)
            loader.default_output_processor = Join(' ')

            for (field, path) in self.jmes_paths.items():
                loader.add_value(field, SelectJmes(path)(c))
            item = loader.load_item()

            ownerLoader = ItemLoader(item=OwnerItem())
            ownerLoader.default_input_processor = MapCompose(str)
            ownerLoader.default_output_processor = Join(' ')
            for (field, path) in self.owner_jmes_paths.items():
                ownerLoader.add_value(field, SelectJmes(path)(c['owner']))
            owner = ownerLoader.load_item()

            item['owner'] = owner
            yield item

            # Start extracting the owner's user info
            uid = owner['id']
            # https://stock.xueqiu.com/v5/stock/portfolio/stock/list.json?size=1000&category=3&uid=6626771620&pid=-24 (portfolios the user created)
            createdCubeUrl = f'https://stock.xueqiu.com/v5/stock/portfolio/stock/list.json?size=1000&category=3&uid={uid}&pid=-24'
            # Request the portfolios the user created.
            # Pass extra arguments to the parse callback via cb_kwargs.
            yield scrapy.Request(
                createdCubeUrl,
                self.parseCubeList,
                headers=self.send_headers,
                cb_kwargs=dict(uid=uid, screen_name=owner['screen_name']))

            # Request the portfolios the user follows. uid and screen_name are not
            # passed here; in that case they are recovered by parsing the page itself.
            # TODO: fetching pages is very slow; find a way to optimize, e.g. higher concurrency?
            followedCubeUrl = f'https://stock.xueqiu.com/v5/stock/portfolio/stock/list.json?size=1000&category=3&uid={uid}&pid=-120'
            yield scrapy.Request(followedCubeUrl,
                                 self.parseCubeList,
                                 headers=self.send_headers)

            # Portfolio quote endpoint:
            # https://xueqiu.com/cubes/quote.json?code=ZH976766,SP1034535,SP1012810,ZH1160206,ZH2003755,ZH1996976,ZH1079481,ZH1174824,ZH1079472,SP1040320

        page = jsonresponse['page']
        maxPage = jsonresponse['maxPage']
        if page < maxPage:
            url = f'{self.cube_discover_url}{page+1}'
            yield scrapy.Request(url, headers=self.send_headers)
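A minimal sketch of the supporting definitions code example #1 relies on; the item fields and JMESPath expressions beyond those visible above are hypothetical:

import scrapy
from itemloaders.processors import SelectJmes  # scrapy.loader.processors in older Scrapy; requires jmespath

class OwnerItem(scrapy.Item):
    id = scrapy.Field()
    screen_name = scrapy.Field()

class StockCubesItem(scrapy.Item):
    symbol = scrapy.Field()  # hypothetical field
    name = scrapy.Field()    # hypothetical field
    owner = scrapy.Field()

# Field name -> JMESPath expression, evaluated against each cube dict:
jmes_paths = {'symbol': 'symbol', 'name': 'name'}
owner_jmes_paths = {'id': 'id', 'screen_name': 'screen_name'}

# SelectJmes applies its query to whatever it is called with, e.g.
# SelectJmes('id')({'id': 42}) -> 42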
Code example #2
    def parse(self, response, **kwargs):
        """
        response.xpath("//span[contains(@class, 'Head')]//text()").getall()
        :param response:
        :param kwargs:
        :return:
        """
        item_loader = ItemLoader(item=self.item_loader_cls(),
                                 response=response,
                                 spider_name=self.name)
        item_loader.default_input_processor = default_input_processor
        item_loader.default_output_processor = default_output_processor

        for field_name, xpath in self.item_loader_xpath.items():
            item_loader.add_xpath(field_name=field_name, xpath=xpath)

        item_loader.add_value('word', response.request.url.split("/")[-1])

        yield item_loader.load_item()
Code example #3
 def parse_page(self, response):
     job_list = response.css('.srp_container > .row')
     for job in job_list:
         job_loader = ItemLoader(JobPost(), selector=job)
         job_loader.default_input_processor = MapCompose(
             lambda x: x.strip())
         job_loader.default_output_processor = TakeFirst()
         job_loader.add_css('job_title', '.content > ul > .desig::text')
         job_loader.add_css('experience_required', '.content > .exp::text')
         job_loader.add_css('location', '.content > .loc > span::text')
         job_loader.add_css('company_name',
                            '.content > .orgRating > .org::text')
         job_loader.add_css('job_description_url', 'div::attr(data-url)')
         job_loader.add_css(
             'key_skills',
             '.content > .more > div[class = "desc"] >span[class = "skill"]::text'
         )
         job_loader.add_css(
             'job_description',
             '.content > .more > span[class = "desc"]::text')
         job_loader.add_css('salary', '.other_details > .salary::text')
         job_loader.add_css(
             'posted_by', '.other_details > .rec_details > .rec_name::text')
         job_loader.add_css(
             'posted_on',
             '.other_details > .rec_details > span[class = "date"]::text')
         yield job_loader.load_item()
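For reference, the two default processors used above behave like this; MapCompose applies its functions to each extracted value, TakeFirst returns the first non-empty one (standalone sketch):

from itemloaders.processors import MapCompose, TakeFirst

MapCompose(str.strip)(['  Data Engineer ', ' 3-5 yrs '])  # -> ['Data Engineer', '3-5 yrs']
TakeFirst()(['', None, 'Bengaluru'])                      # -> 'Bengaluru'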
Code example #4
File: seminar.py  Project: whirlp00l/Seminavi
	def parse(self, response):
		match = re.search('/displaySeminarList/',response.url)

		if match:
			urls = response.xpath('//div[@class="internList splitEntry"]//@href').extract()
			for url in urls:
				url = response.urljoin(url)
				yield scrapy.Request(url, self.parse)
		else:
			table = response.xpath(self.seminar_list_xpath)
			corpId = parse_qs(urlparse(response.url).query)['corpId']
			for semi in table:
				loader = ItemLoader(SeminarItem(),semi)
				loader.default_input_processor = MapCompose(str.strip)
				loader.default_output_processor = Join()
				loader.add_value('companyid',corpId)
				loader.add_xpath('name','//div[@id="headerWrap"]//h3/text()')
				loader.add_xpath('date', './/td[@class="date"]/text()', re=r'\d+/\d+/\d+')
				loader.add_xpath('time','.//td[@class="time"]/text()')
				loader.add_xpath('area','.//td[@class="area"]/text()')
				loader.add_xpath('place','.//td[@class="place"]/text()')
				loader.add_xpath('loc_n', './/td[@class="place"]//a', re=r'mycom_loc\|(\d+/\d+/\d+\.\d+),\d+/\d+/\d+\.\d+')
				loader.add_xpath('loc_e', './/td[@class="place"]//a', re=r'mycom_loc\|\d+/\d+/\d+\.\d+,(\d+/\d+/\d+\.\d+)')
				loader.add_xpath('target','.//td[@class="target"]/text()')
				yield loader.load_item()
Code example #5
def parse_subjects(response):
    il = ItemLoader(item=TnaWebsiteItem(), response=response)
    il.default_input_processor = MapCompose(lambda v: v.split(),
                                            replace_escape_chars)
    il.default_output_processor = Join()
    il.add_xpath('SUBJECTS', '//meta[@name="DC.subject.keyword"]/@content')
    return il.load_item()
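Several snippets in this collection load the same TnaWebsiteItem; a plausible definition covering the fields they reference (a sketch, the real item may define more):

import scrapy

class TnaWebsiteItem(scrapy.Item):
    SUBJECTS = scrapy.Field()
    TITLE = scrapy.Field()
    DESCRIPTION = scrapy.Field()
    CONTENT = scrapy.Field()
    KEYWORDS = scrapy.Field()
    CORPORATE_BODIES = scrapy.Field()
    CATALOGUE_REFERENCE = scrapy.Field()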
Code example #6
File: nm_sos.py  Project: prasanthanirudh/scraping
 def save_to_csv(self, response, **meta):
     # self.state['items_count'] = self.state.get('items_count', 0) + 1
     il = ItemLoader(item=NmSosSpiderItem(), response=response)
     il.default_input_processor = MapCompose(lambda v: v.strip(),
                                             remove_tags,
                                             replace_escape_chars)
     #il.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
     il.add_value('company_name', meta['company_name'])
     il.add_value('entity_id', meta['business_id'])
     il.add_value('dba_name', meta['dba_name'])
     il.add_value('company_subtype', meta['company_subtype'])
     il.add_value('non_profit_indicator', meta['non_profit_indicator'])
     il.add_value('location_address_string',
                  meta['location_address_string'])
     il.add_value('status', meta['status'])
     il.add_value('creation_date', meta['creation_date'])
     il.add_value('domestic_state', meta['domestic_state'])
     il.add_value('period_of_duration', meta['peroid_of_duration'])  # meta key spelling follows the caller
     il.add_value('business_purpose', meta['business_purpose'])
     il.add_value('mixed_subtype', meta['officer_title'])
     il.add_value('mixed_name', meta['officer_name'])
     il.add_value('person_address_string', meta['officer_address'])
     il.add_value('permit_type', 'business_license')
     il.add_value('sourceName', 'NM_SOS')
     il.add_value(
         'url',
         'https://portal.sos.state.nm.us/BFS/online/CorporationBusinessSearch'
     )
     return il
Code example #7
def parse_corporate_bodies(response):
    il = ItemLoader(item=TnaWebsiteItem(), response=response)
    il.default_input_processor = MapCompose(lambda v: v.split(), replace_escape_chars)
    il.default_output_processor = Join()
    il.add_xpath('CORPORATE_BODIES', '//a[contains(@class, "bigblack")]//text()')
    il.add_xpath('CORPORATE_BODIES', '//p[contains(@class, "bodytext")]//a/text()')
    return il.load_item()
Code example #8
def parse_description(response):
    il = ItemLoader(item=TnaWebsiteItem(), response=response)
    il.default_input_processor = MapCompose(lambda v: v.split(), replace_escape_chars)
    il.default_output_processor = Join()
    il.add_xpath('DESCRIPTION', '//h1[contains(@class, "parchment")]//text()')
    il.add_xpath('DESCRIPTION', '//td[contains(@class, "tabbody")]//ul/li/text()')
    return il.load_item()
Code example #9
    def parse(self, response):
        """
        Default callback used by Scrapy to process downloaded responses

        Testing contracts:
        @url http://www.allocine.fr/films
        @returns items
        @scrapes title

        """
        selector = Selector(response)

        # iterate over movies
        for movie in selector.xpath(self.movies_list_xpath):
            loader = ItemLoader(AllocineMovies(), selector=movie)

            # define processors
            loader.default_input_processor = MapCompose(
                lambda x: x.strip(',').split(), replace_escape_chars)
            loader.default_output_processor = Join()

            # iterate over fields and add xpaths to the loader
            for field, xpath in self.item_fields.items():
                loader.add_xpath(field, xpath)
            yield loader.load_item()
Code example #10
 def parse_item(self, response):
     item_loader = ItemLoader(item=BwtTestItem(), response=response)
     item_loader.default_input_processor = MapCompose(str.strip)
     item_loader.default_output_processor = Join()
     for field_name in BwtTestItem.fields.keys():
         item_loader.add_xpath(field_name, self.__get_selector(field_name))
     yield item_loader.load_item()
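Driving the loader from BwtTestItem.fields keeps the field list in one place. __get_selector is not shown above; a hypothetical version would be a plain field-to-XPath lookup:

class BwtTestSpiderSketch:
    # Hypothetical mapping; the real spider's selectors are not shown.
    field_xpaths = {
        'title': '//h1/text()',
        'author': '//span[@class="author"]/text()',
    }

    def __get_selector(self, field_name):
        return self.field_xpaths[field_name]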
Code example #11
    def parse_book(self, response):
        book_loader = ItemLoader(item=BookItem(), response=response)
        book_loader.default_input_processor = MapCompose(remove_tags)

        book_loader.add_value(
            "image_urls",
            response.urljoin(
                response.css(".item.active > img::attr(src)").extract_first()))

        book_loader.add_css("title", ".col-sm-6.product_main > h1",
                            TakeFirst())
        book_loader.add_css("price", ".price_color", TakeFirst())
        book_loader.add_css("upc",
                            ".table.table-striped > tr:nth-child(1) > td",
                            TakeFirst())
        book_loader.add_css("product_type",
                            ".table.table-striped > tr:nth-child(2) > td",
                            TakeFirst())
        book_loader.add_css("tax",
                            ".table.table-striped > tr:nth-child(5) > td",
                            TakeFirst())
        book_loader.add_css("stock",
                            ".table.table-striped > tr:nth-child(6) > td",
                            TakeFirst())
        book_loader.add_css("reviews",
                            ".table.table-striped > tr:nth-child(7) > td",
                            TakeFirst())
        book_loader.add_css("rating", ".star-rating::attr(class)", TakeFirst())
        return book_loader.load_item()
Code example #12
 def save_to_csv(self, response, **meta):
     il = ItemLoader(item=IlDupageFoodInspectionsSpiderItem(),
                     response=response)
     il.default_input_processor = MapCompose(lambda v: v.strip(),
                                             remove_tags,
                                             replace_escape_chars)
     #il.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
     il.add_value('sourceName', 'IL_Dupage_Food_Inspections')
     il.add_value('url', 'https://eco.dupagehealth.org/#/pa1/search')
     il.add_value(
         'location_address_string', meta['location_address_string']
         if meta['location_address_string'] else 'IL')
     il.add_value('abate_date', meta['abate_date'])
     il.add_value('inspection_date', meta['inspection_date'])
     il.add_value(
         'company_name',
         self._getDBA(meta['company_name'])[0]
         if meta['company_name'] else '')
     il.add_value('violation_type', meta['violation_type'])
     il.add_value('violation_description', meta['violation_description'])
     il.add_value(
         'dba_name',
         self._getDBA(meta['company_name'])[1]
         if meta['company_name'] else '')
     il.add_value('inspection_type', meta['inspection_type'])
     il.add_value('violation_date', meta['violation_date'])
     il.add_value('abate_status', meta['abate_status'])
     il.add_value('inspection_subtype', meta['inspection_subtype'])
     il.add_value('violation_rule', meta['violation_rule'])
     return il.load_item()
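_getDBA is not shown; both call sites index [0] and [1], so it evidently returns a (company_name, dba_name) pair. A hypothetical implementation:

def _getDBA(self, raw_name):
    # Hypothetical: split 'ACME LLC DBA JOES DINER' into legal name and
    # trade name; always return a 2-tuple so callers can index [0]/[1].
    parts = raw_name.split(' DBA ', 1)
    return parts[0].strip(), parts[1].strip() if len(parts) > 1 else ''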
Code example #13
def parse_description(response):
    il = ItemLoader(item=TnaMediaWebcrawlerItem(), response=response)
    il.default_input_processor = MapCompose(lambda v: v.split(), replace_escape_chars)
    il.default_output_processor = Join()
#    il.add_xpath('DESCRIPTION', '//div[contains(@class, "entry-content clearfix")]/p//text()')
    il.add_xpath('DESCRIPTION', '//ul[contains(@id, "archive")]//p//text()')
    return il.load_item()
Code example #14
    def parse(self, response):
        selector = Selector(response)

        # iterate over threads
        for thread in selector.xpath(self.thread_list_xpath):
            loader = ItemLoader(item=Thread(), selector=thread)
            # define processors
            loader.default_input_processor = MapCompose(str.strip)
            loader.default_output_processor = Join()

            thread_url = 'http://www.clubsnap.com/forums/' + ''.join(
                thread.xpath('.//a[@class="title"]/@href').extract())
            yield scrapy.Request(thread_url, callback=self.parse_details)

            # iterate over fields and add xpaths to the loader
            for field, xpath in self.item_fields.items():
                loader.add_xpath(field, xpath)
            yield loader.load_item()

        # follow next page links
        if response.css("a[rel='next']::attr(href)").extract():
            next_page = 'http://www.clubsnap.com/forums/' + response.css(
                "a[rel='next']::attr(href)").extract()[0]

            yield scrapy.Request(next_page,
                                 callback=self.parse,
                                 dont_filter=True)
Code example #15
    def parse(self, response):

        selector = Selector(response)

        # iterate over offers
        for offer in selector.xpath(self.offer_list_xpath):
            loader = ItemLoader(IndeedOffer(), selector=offer)

            # define processors
            loader.default_input_processor = MapCompose(str.strip)
            loader.default_output_processor = Join()

            # iterate over fields and add xpaths to the loader
            for field, xpath in self.item_fields.items():
                loader.add_xpath(field, xpath)
            yield loader.load_item()
Code example #16
def parse_description(response):
    il = ItemLoader(item=TnaWebsiteItem(), response=response)
    il.default_input_processor = MapCompose(lambda v: v.split(),
                                            replace_escape_chars)
    il.default_output_processor = Join()
    il.add_xpath('DESCRIPTION', '//div[contains(@class, "breather")]/p/text()')
    il.add_xpath('DESCRIPTION',
                 '//div[contains(@class, "breather")]//ul/li/text()')
    il.add_xpath(
        'DESCRIPTION',
        '//table[contains(@class, "table table-striped")]//tbody/tr/td/text()')
    il.add_xpath('DESCRIPTION',
                 '//div[contains(@class, "accordion-content")]//p/text()')
    il.add_xpath(
        'DESCRIPTION',
        '//div[contains(@class, "two-thirds pad-horizontal-large margin-none margin-bottom-large")]//p/text()'
    )
    il.add_xpath('DESCRIPTION',
                 '//div[contains(@class, "video-box")]//p/text()')
    il.add_xpath('DESCRIPTION',
                 '//div[contains(@class, "entry-content")]//h3/text()')
    il.add_xpath(
        'DESCRIPTION',
        '//div[contains(@class, "entry-content clearfix")]//p/text()')
    return il.load_item()
Code example #17
    def get_player_info(self, response):
        loader = ItemLoader(item=NFL_Player_2015(), response=response)
        loader.default_input_processor = MapCompose(str.strip)
        loader.default_output_processor = Join()

        number_and_position = response.xpath('//*[@id="content"]/div[3]/div[2]/div[3]/ul[1]/li[1]/text()').extract()

        if number_and_position:
            parts = number_and_position[0].split()
            number = parts[0]
            position = parts[1]
        else:
            number = ''
            position = ''

        loader.add_value('number', number)
        loader.add_value('position', position)
        loader.add_xpath('name', '//*[@id="content"]/div[3]/div[2]/h1/text()')
        loader.add_xpath('team', '//*[@id="content"]/div[3]/div[2]/div[3]/ul[1]/li[3]/a/text()')

        yield loader.load_item()
Code example #18
def parse_content(response):
    il = ItemLoader(item=TnaWebsiteItem(), response=response)
    il.default_input_processor = MapCompose(lambda v: v.split(),
                                            replace_escape_chars)
    il.default_output_processor = Join()
    il.add_xpath('CONTENT',
                 '//div[contains(@class, "entry-content")]/p//text()')
    return il.load_item()
Code example #19
def parse_title(response):
    il = ItemLoader(item=TnaWebsiteItem(), response=response)
    il.default_input_processor = MapCompose(lambda v: v.split(), replace_escape_chars)
    il.default_output_processor = Join()
    il.add_value('TITLE', 'Design Registers ')
    il.add_xpath('TITLE', '//span[contains(@class, "btprojtoptitle")]/text()')
#    il.add_xpath('TITLE', '//span[contains(@class, "btprojtitle")]/text()')
    return il.load_item()
Code example #20
    def populate_item(self, response):
        item_loader = ItemLoader(item=MySpiderItem(), response=response)
        item_loader.default_input_processor = MapCompose(remove_tags)

        #item_loader.add_css("", "")
        #item_loader.add_css("", "")

        yield item_loader.load_item()
Code example #21
def parse_catalogue_reference(response):
    il = ItemLoader(item=TnaWebsiteItem(), response=response)
    il.default_input_processor = MapCompose(lambda v: v.split(),
                                            replace_escape_chars)
    il.default_output_processor = Join()
    il.add_xpath('CATALOGUE_REFERENCE',
                 '//span[contains(@class, "catRef")]/text()')
    return il.load_item()
Code example #22
    def scrape_product(self, response):
        item_loader = ItemLoader(item=MyItem(), response=response)
        item_loader.default_input_processor = MapCompose(remove_tags)
        item_loader.default_output_processor = TakeFirst()

        item_loader.add_css("my_field", "selector")

        return item_loader.load_item()
Code example #23
 def populate_item(self, selector):
     item_loader = ItemLoader(item=MySpiderItem(), selector=selector)
     item_loader.default_input_processor = MapCompose(remove_tags)
     item_loader.default_output_processor = TakeFirst()
     #
     #item_loader.add_css("my_field", "my_css")
     #item_loader.add_xpath("my_field", "my_xpath")
     #
     return item_loader.load_item()
Code example #24
    def parse_shop(self, response):
        shop_loader = ItemLoader(item=ExclusiveScraperItem(), response=response)
        shop_loader.default_input_processor = MapCompose(remove_tags, replace_escape_chars, strip_html5_whitespace)
        shop_loader.default_output_processor = TakeFirst()

        shop_loader.add_css("title", ".grid__item.large-up--one-third.product__selector-container > h1")
        shop_loader.add_css("price", "span.product__price.js-product-price")
        shop_loader.add_css("discount_price", "span.product__price.product__discount.js-product-price")
        yield shop_loader.load_item()
Code example #25
def parse_description(response):
    il = ItemLoader(item=TnaWebsiteItem(), response=response)
    il.default_input_processor = MapCompose(lambda v: v.split(), replace_escape_chars)
    il.default_output_processor = Join()
    il.add_xpath('DESCRIPTION', '//span[contains(@class, "btprojtxt")]/text()')
    il.add_xpath('DESCRIPTION', '//p[contains(@class, "btprojtxt")]/text()')
#    il.add_xpath('DESCRIPTION', '//a[contains(@class, "btprojlinks")]/text()')
#    il.add_xpath('DESCRIPTION', '//span[contains(@class, "btprojtoptitle")]/text()')
    return il.load_item()
Code example #26
def parse_keywords(response):
    il = ItemLoader(item=TnaWebsiteItem(), response=response)
    il.default_input_processor = MapCompose(lambda v: v.split(),
                                            replace_escape_chars)
    il.default_output_processor = Join()
    il.add_xpath('KEYWORDS', '//meta[@name="keywords"]/@content')
    il.add_xpath('KEYWORDS', '//meta[@name="Keywords"]/@content')
    il.add_xpath('KEYWORDS', '//META[@NAME="Keywords"]/@CONTENT')
    return il.load_item()
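A note on the three variants above: XPath matching is case-sensitive, and Scrapy parses HTML with lxml, which normalizes element names to lowercase, so the uppercase //META row cannot match an HTML response (it would only matter for an XML one). The two lowercase rows differ only in the case of the name attribute's value, which is preserved as-is, so both are genuinely needed.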
Code example #27
 def parse(self, response):
     item_loader = ItemLoader(item=MyItem(), response=response)
     item_loader.default_input_processor = MapCompose(remove_tags)
     item_loader.default_output_processor = TakeFirst()
     #
     #item_loader.add_css("my_field", "my_css")
     #item_loader.add_xpath("my_field", "my_xpath")
     #
     return item_loader.load_item()
Code example #28
    def parse_depth_chart(self, response):
        loader = ItemLoader(item=NFL_Team_2015(), response=response)
        loader.default_input_processor = MapCompose(str.strip)
        loader.default_output_processor = Join()

        loader.add_xpath("division", '//*[@id="sub-branding"]/div[2]/text()')
        loader.add_xpath("name", '//*[@id="sub-branding"]/h2/a/b/text()')

        yield loader.load_item()
Code example #29
File: xueqiu.py  Project: hexj/starsea
 def parse_cube_info(self, response, symbol_list):
     json_response = json.loads(response.text)
     for s in symbol_list:
         loader = ItemLoader(item=CubeItem())
         loader.default_input_processor = MapCompose(str)
         loader.default_output_processor = Join(' ')
         for (field, path) in self.jmes_paths.items():
             loader.add_value(field, SelectJmes(path)(json_response[s]))
         item = loader.load_item()
         yield item
Code example #30
 def save_csv(self, response, data_dic):
     il = ItemLoader(item=AlFoodInspectionsSpiderItem(), response=response)
     il.default_input_processor = MapCompose(
         lambda v: v.strip(),
         remove_tags,
         lambda data: re.sub(r'\s+', ' ', data) if data else '',
         replace_escape_chars)
     il.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
     il.add_value('sourceName', 'AL_Food_Inspections')
     il.add_value('url', 'http://www.alabamapublichealth.gov/foodscores/index.html')

     for k in data_dic:
         il.add_value(k, data_dic[k])
     return il
Code example #31
 def parse_raw(self, symbol, raw):
     terms = BranchSettlementItem.Meta.fields
     loader = ItemLoader(item=BranchSettlementItem())
     loader.default_input_processor = MapCompose(str, str.strip)
     loader.default_output_processor = TakeFirst()
     loader.add_value('date', self.date)
     loader.add_value('code', symbol)
     for idx, field in enumerate(terms):
         loader.add_value(field, raw[idx])
     return loader.load_item()
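This zips a positional raw row into named fields; it assumes an item whose Meta.fields tuple lists the row's columns in order, roughly like this sketch (column names hypothetical):

import scrapy

class BranchSettlementItem(scrapy.Item):
    class Meta:
        # Order must match the raw row.
        fields = ('broker', 'buy', 'sell', 'net')

    date = scrapy.Field()
    code = scrapy.Field()
    broker = scrapy.Field()
    buy = scrapy.Field()
    sell = scrapy.Field()
    net = scrapy.Field()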
Code example #32
 def parse(self, response):
     item_loader = ItemLoader(item=MyItem(), response=response)
     item_loader.default_input_processor = MapCompose(remove_tags)
     #item_loader.add_css("", "")
     #item_loader.add_css("", "")
     #item_loader.add_css("", "")
     yield FormRequest("POST_URL",
                       formdata={'parameter': 'p'},
                       meta={'item': item_loader.load_item()},
                       callback=self.populate_field)
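The half-loaded item travels through request.meta; a sketch of the assumed populate_field callback (the field name is hypothetical):

def populate_field(self, response):
    # Pull the partially populated item back out of request.meta,
    # fill the remaining field from the POST response, and emit it.
    item = response.meta['item']
    item['my_field'] = response.text.strip()  # hypothetical field
    yield item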
Code example #33
def getDescription(response):
    il = ItemLoader(item=TnaWebsiteItem(), response=response)
    il.default_input_processor = MapCompose(lambda v: v.split(), replace_escape_chars)
    il.default_output_processor = Join()
    il.add_xpath('DESCRIPTION', '//div[contains(@class, "breather")]/p//text()')
    il.add_xpath('DESCRIPTION', '//div[contains(@class, "breather")]/ul/li//text()')
    il.add_xpath('DESCRIPTION', '//div[contains(@id, "col starts-at-full ends-at-two-thirds clr")]/p//text()')
    il.add_xpath('DESCRIPTION', '//div[contains(@id, "col starts-at-full ends-at-half clr")]/p//text()')
    il.add_xpath('DESCRIPTION', '//div[contains(@class, "col starts-at-full ends-at-half clr")]/p//text()')
    return il.load_item()
Code example #34
    def parse_auction_item(self, response):
        loader = ItemLoader(AuctionItems(), response=response)

        loader.default_input_processor = MapCompose(lambda v: v.split(), replace_escape_chars)
        loader.default_output_processor = Join()

        for field, xpath in auction_item_fields.items():
            loader.add_xpath(field, xpath)

        yield loader.load_item()
Code example #35
    def parse_bids(self, response):
        selector = Selector(response)

        for bid in selector.xpath(self.bid_list_xpath):
            loader = ItemLoader(BidItems(), selector=bid)

            loader.default_input_processor = MapCompose(lambda v: v.split(), replace_escape_chars)
            loader.default_output_processor = Join()

            for field, xpath in auction_bid_fields.items():
                loader.add_xpath(field, xpath)

            yield loader.load_item()
Code example #36
    def parse(self, response):
        def strip_dollar(x):
            return x.strip('$')

        self.driver.get(response.url)
        try:
            WebDriverWait(self.driver, 15).until(
                EC.presence_of_element_located(
                    (By.XPATH,
                        '//*[@id="depart-container"]/div[2]/div[1]/div[@style="width: 0%;"]')))
        except TimeoutException:
            print('Page load timed out')

        while True:
            try:
                try:
                    WebDriverWait(self.driver, 15).until(
                        EC.presence_of_element_located(
                            (By.XPATH,
                                '//*[@id="depart-container"]/div/div/div/button')))
                except TimeoutException:
                    break

                next_button = self.driver.find_element(
                    By.XPATH, '//*[@id="depart-container"]/div/div/div/button')
                next_button.click()

            except ElementNotVisibleException:
                break
        for trips in Selector(
                text=self.driver.page_source).xpath(self.trips_list_xpath):
            loader = ItemLoader(BusTrip(), selector=trips)

            loader.default_input_processor = MapCompose(str.strip)
            loader.default_output_processor = Join()
            loader.price_in = MapCompose(strip_dollar)

            for field, xpath in self.item_fields.items():
                loader.add_xpath(field, xpath)
            dateoftrip = str(response.url).split("/")[-1]
            loader.add_value('dateoftrip', dateoftrip)
            yield loader.load_item()
Code example #37
File: zhihu_spider.py  Project: naity/zhihu_scraper
    def parse_answers(self, response):
        # use selector to extract answers
        selector = Selector(response)

        # iterate over answers
        for answer in selector.xpath(self.answers_list_xpath):
            loader = ItemLoader(item=ZhihuAnswer(), selector=answer)

            # define processors
            loader.default_input_processor = MapCompose(str.strip)
            loader.default_output_processor = Join()

            # iterate over fields and add xpaths to the loader
            for field, xpath in self.item_fields.items():
                loader.add_xpath(field, xpath)

            item = loader.load_item()

            # entity-escape non-ASCII characters so the full answer text is HTML-safe
            item["answer"] = item["answer"].encode('ascii', 'xmlcharrefreplace').decode('ascii')

            # if the summary has an image, entity-escape it the same way
            if "summary_img" in item:
                item["summary_img"] = item["summary_img"].encode('ascii', 'xmlcharrefreplace').decode('ascii')
            else:
                item['summary_img'] = ""

            # change vote to integer
            item["vote"] = int(item["vote"])

            # in case of anonymous authors
            if "author" not in item:
                item["author"] = u'匿名用户'

            # complete links
            item["question_link"] = u"http://www.zhihu.com" + item["question_link"]

            if "author_link" in item:
                item["author_link"] = u"http://www.zhihu.com" + item["author_link"]
            else:
                item["author_link"] = ""

            # add the date when scraped
            item["date"] = date.today()

            yield item