Python Selector.startswith Examples

Programming Language: Python

Namespace/Package Name: scrapy.selector

Class/Type: Selector

Method/Function: startswith

Examples at hotexamples.com: 3

Python Selector.startswith - 3 examples found. These are the top-rated real-world Python examples of scrapy.selector.Selector.startswith extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

Selector(30)

strip(30)

split(30)

css(30)

select(30)

replace(30)

extract(30)

re(30)

extract_first(19)

remove_namespaces(16)

index(9)

rstrip(9)

lstrip(9)

encode(8)

register_namespace(6)

find(5)

remove(4)

append(4)

startswith(3)

rindex(3)

extend(3)

get(3)

re_first(2)

getall(2)

lower(2)

pop(1)

partition(1)

extract_unquoted(1)

__getattribute__(1)

rfind(1)

items(1)

decode(1)

find_all(1)

group(1)

__len__(1)

title(1)

to_csv(1)

url(1)

Example #1

Show file

    def parse(self, response):
        """Yield one ``ArticlesItem`` per fragment matched on the page.

        ``self.site["robot_url_tag"]`` is applied to *response* as a CSS
        selector; each matched fragment is re-parsed to read the text and
        ``href`` of its ``<a>`` element.

        Args:
            response: The page response. ``response.url`` is passed to
                ``get_domain_root`` and the result is stored on ``self.host``.

        Yields:
            ArticlesItem: with ``title``, ``url`` and ``status`` (always 0) set.
        """
        self.host = get_domain_root(response.url)

        datas = response.css(self.site["robot_url_tag"]).extract()

        # extract() returns a list, never None, so a selector that matches
        # nothing shows up as an empty list.
        if not datas:
            self.log("articles中的robot_url_tag选择器写的不正确", level=logging.ERROR)
            # Return value kept as in the original; iteration simply ends here.
            return False

        for data in datas:
            fragment = Selector(text=data)
            title = fragment.css('a::text').extract_first()
            url = fragment.css('a::attr(href)').extract_first()

            # extract_first() gives None when the fragment has no <a href>;
            # there is no URL to store for it, so skip instead of crashing.
            if url is None:
                continue

            if not url.startswith('http'):
                # Strip leading slashes so a root-relative href does not
                # produce "http://host//path".
                url = "http://" + self.host + "/" + url.lstrip("/")

            # Store the current link.
            item = ArticlesItem()
            item['title'] = title
            item['url'] = url
            item['status'] = 0

            yield item

        # The loop never breaks, so this always runs once all fragments are done.
        self.log("articles的url:%s采集完毕" % response.url, level=logging.INFO)

Example #2

Show file

    def contents(self, response):
        """Yield a ``scrapy.Request`` to ``self.details`` per matched fragment.

        ``self.contents_conf["robot_url_tag"]`` is applied to *response* as a
        CSS selector. For each matched fragment the target URL is taken from
        the ``onclick`` argument of its ``<a>`` element when one is present,
        otherwise from its ``href``.

        Args:
            response: The page response; ``response.url`` is prepended to
                targets that do not start with ``http``.

        Yields:
            scrapy.Request: with ``meta`` holding ``article_name``,
            ``content_name`` and a zero-based ``index`` counting the requests
            yielded so far.
        """
        # Fetch the page content.
        datas = response.css(self.contents_conf["robot_url_tag"]).extract()

        # extract() returns a list, never None, so a selector that matches
        # nothing shows up as an empty list.
        if not datas:
            self.log("contents中的robot_url_tag选择器写的不正确", level=logging.ERROR)
            # Return value kept as in the original; iteration simply ends here.
            return False

        c_index = 0
        for data in datas:
            # Parse the fragment once and reuse it for all three lookups.
            fragment = Selector(text=data)
            title = fragment.css('a::text').extract_first()
            url = fragment.css('a::attr(href)').extract_first()
            onclick_url = fragment.css('a::attr(onclick)').re(
                r'\w\(\'(.*)\'\)')

            # For a JS redirect use the argument passed to the JS call,
            # otherwise fall back to the href attribute.
            if onclick_url:
                url = onclick_url[0]

            # Neither an onclick target nor an href: nothing to request.
            if url is None:
                continue

            # A full http://... path needs no processing; anything else gets
            # the current page URL prepended.
            # NOTE(review): plain concatenation only suits targets relative to
            # response.url; response.urljoin() may be the better fit - confirm
            # against the sites being crawled.
            if not url.startswith('http'):
                url = response.url + url

            content = {
                'article_name': self.article_name,
                'content_name': title,
                'index': c_index,
            }

            yield scrapy.Request(url, callback=self.details, meta=content)

            c_index += 1

        # The loop never breaks, so this always runs once all fragments are done.
        self.log("contents的url:%s采集完毕" % response.url, level=logging.INFO)

Example #3

Show file

File: Scraper.py Project: noystl/Needle-Ex1

 def __extract_days_to_go(self, html):
     """Return the time-left value read from *html*, rescaled when in hours.

     Two XPath lookups are run against *html*: ``self.TIME_TYPE_XPATH`` for
     the unit text and ``self.TIME_LEFT_XPATH`` for the amount. When the unit
     text starts with ``'hours'`` the amount is divided by
     ``self.HOURS_IN_DAY`` (presumably converting to days) and returned as a
     string; otherwise the amount is returned exactly as extracted.
     """
     unit = Selector(text=html).xpath(self.TIME_TYPE_XPATH).get()
     remaining = Selector(text=html).xpath(self.TIME_LEFT_XPATH).get()
     # NOTE(review): .get() yields None when an XPath matches nothing, in
     # which case startswith() below raises - confirm callers always pass
     # markup containing both nodes.
     if not unit.startswith('hours'):
         return remaining
     return str(float(remaining) / self.HOURS_IN_DAY)