Code example #1
class RedditSpider(CrawlSpider):
    name = "reddit"
    allowed_domains = ["openjur.de"]
    start_urls = ('http://openjur.de/u.html', )
    rules = [
        Rule(LinkExtractor(allow=["\/u\/[0-9]+\.html"]),
             callback="parse_item"),
        Rule(LinkExtractor(allow=["\/u(\-[0-9]+)?\.html"]))
    ]

    def parse_item(self, response):

        # print(response.css('div[id="info"]').extract())
        item = PicItem()
        item['url'] = response.url

        item['gericht'] = response.css(
            "#info > ul:nth-child(1) > li:nth-child(1) > p:nth-child(2)").css(
                "a[href*=http]::text").extract()
        item['datum'] = response.css(
            "#info > ul:nth-child(1) > li:nth-child(2) > p:nth-child(2)::text"
        ).extract()
        item["AZ"] = response.css(
            "#info > ul:nth-child(1) > li:nth-child(3) > p:nth-child(2)::text"
        ).extract()
        item['typ'] = response.css(
            "#info > ul:nth-child(1) > li:nth-child(4)  ::text").extract()
        item['text'] = response.css("#text").extract()
        item['verfahrensgang'] = [
            " ".join(
                response.css(
                    ".instanzen > p:nth-child(2) > a:nth-child(1)::text").
                extract() + (response.css(
                    ".instanzen > p:nth-child(2) > i:nth-child(2)::text").
                             extract()))
        ]
        item['rechtsgebiete'] = response.css(".rechtsgebiete").extract()
        print(item['url'])

        yield item
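
parse_item above populates a PicItem that is not defined in this snippet. A minimal definition consistent with the fields it sets (an assumption for illustration; the original project's items.py may differ) would be:

import scrapy


class PicItem(scrapy.Item):
    # fields inferred from parse_item above; hypothetical, not the original definition
    url = scrapy.Field()
    gericht = scrapy.Field()
    datum = scrapy.Field()
    AZ = scrapy.Field()
    typ = scrapy.Field()
    text = scrapy.Field()
    verfahrensgang = scrapy.Field()
    rechtsgebiete = scrapy.Field()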
Code example #2
class ThreadsSpider(CrawlSpider):
    name = "threads"
    allowed_domains = ["reddit.com"]
    start_urls = (
        "https://www.reddit.com/r/",  #Enter Reddit thread here
    )

    rules = (
        Rule(LinkExtractor(restrict_xpaths=(".//div[@class='nav-buttons']")), follow=True),
        Rule(LinkExtractor(restrict_xpaths=(".//div[@class='content']//p[@class='parent']/a[@class='title']")), callback="parse_item"),
    )


    def parse_item(self, response):
        hxs = Selector(response)

        thread = hxs.xpath(".//p[@class='title']/a/text()").extract()
        op = hxs.xpath(".//div[contains(@class, 'self')]//p[@class='tagline']/a[contains(@class, 'author')]/text()").extract()
        thread_date = hxs.xpath(".//div[contains(@class, 'self')]//p[@class='tagline']/time/@title").extract()
        textpost = hxs.xpath(".//div[contains(@class, 'self')]//div[@class='md']//text()").extract()
        comments = hxs.xpath(".//div[contains(@class, 'self')]//a[contains(@class, 'comments')]/text()").extract()
        vote_points = hxs.xpath(".//div[@class='linkinfo']/div[@class='score']/span[@class='number']/text()").extract()
        upvoted = hxs.xpath(".//div[@class='linkinfo']/div[@class='score']/text()").extract()

        rows = hxs.xpath(".//div[@class='commentarea']//div[contains(@class, 'comment')]/div[contains(@class, 'entry')]")
        for row in rows:
            l = CommentItemLoader(item = CommentItem(), response = response)
            l.add_value("url", response.url)
            l.add_value("thread", thread)
            l.add_value("op", op)
            l.add_value("thread_date", thread_date)
            l.add_value("textpost", textpost)
            l.add_value("comments", comments)
            l.add_value("vote_points", vote_points)
            l.add_value("upvoted", upvoted)
            l.add_value("comment", row.xpath(".//div[contains(@class, 'usertext-body')]//text()").extract())
            l.add_value("user", row.xpath(".//p[@class='tagline']/a[contains(@class, 'author')]/text()").extract())
            l.add_value("time", row.xpath(".//p[@class='tagline']/time/@title").extract())

            yield l.load_item()
Code example #3
class LeMondeSpider(CrawlSpider):
    name = "lemonde"
    allowed_domains = ["lemonde.fr"]
    start_urls = [
        "http://www.lemonde.fr/",
    ]

    article_item_fields = {
        'title': './/article/h1/text()',
        #'Author': './/article/p[@class="bloc_signature"]/span[@class="signature_article"]/span[@itemprop="author"]/a.text()',
        #'Publisher': './/article/p[@class="bloc_signature"]/span[@id="publisher"]/text()',
        'timestamp':
        './/article/p[@class="bloc_signature"]/time[@itemprop="datePublished"]/@datetime',
        'body': './/article/div[@id="articleBody"]/*',
    }

    rules = (
        # Rule(LinkExtractor(allow=[r'w+']), follow=True),
        # Extract links matching to the article link
        Rule(LinkExtractor(allow=(r"article/\d{4}/\d{2}/\d{2}/.+")),
             callback="parse_article",
             follow=True), )

    def parse_article(self, response):
        """
        The lines below is a spider contract. For more info see:
        http://doc.scrapy.org/en/latest/topics/contracts.html

        @url http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
        @scrapes name
        """

        selector = Selector(response)
        loader = XPathItemLoader(LeMondeArt(), selector=selector)

        self.log('\n\nA response from %s just arrived!' % response.url)

        # define processors
        text_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()

        # Populate the LeMonde Item with the item loader
        for field, xpath in self.article_item_fields.iteritems():
            try:
                loader.add_xpath(field, xpath, text_input_processor)
            except ValueError:
                self.log("XPath %s not found at url %s" %
                         (xpath, response.url))

        #loader.add_value("Url",response.url)

        yield loader.load_item()
Code example #4
class BitcoinAddrSpider(CrawlSpider):
    item = {}
    name = "bitcoinaddr"
    allowed_domains = ["bitcointalk.org"]
    start_urls = ["https://bitcointalk.org/index.php?board=1.0"]

    rules = (
        Rule(LinkExtractor(allow=('board'))),
        Rule(LinkExtractor(allow=('topic')), callback='parse_item'),
    )

    def parse_item(self, response):
        #print response.xpath('//tbody').extract() , "!!!"
        for line in response.css(".signature"):
            #print line.extract(), ' !!! ';
            #_item = BitcoinaddrspiderItem()
            if (line.xpath('text()').re(r"(1[1-9A-HJ-NP-Za-km-z]{26,33})") ==
                []):
                continue
            Addr = line.xpath('text()').re(
                r"(1[1-9A-HJ-NP-Za-km-z]{26,33})")[0]
            print Addr, "123!!!\n"
            Username = line.xpath(
                '../../../tr[1]/td[1]/b/a/text()').extract()[0]
            if Username == "" or Addr == "":
                continue
            try:
                self.item[Username].append(Addr)
                self.item[Username] = list(set(self.item[Username]))
            except KeyError:
                self.item[Username] = [Addr]
            print Username, ' : ', Addr, "!!!!\n"
            #yield item

    def closed(self, reason):
        print self.item, "!!!"
        fileHandle = open('output.json', 'w')
        fileHandle.write(json.dumps(self.item))
        fileHandle.close()
Code example #5
class WallpapersSpider(CrawlSpider):
    name = 'wallpapers'
    allowed_domains = ['simpledesktops.com']
    start_urls = ['http://simpledesktops.com/browse/']

    rules = (
        Rule(LinkExtractor(
            allow=
            r'browse/desktops/20[0-9]{2}/(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)/[0-9]{2}'
        ),
             callback='parse_item'),
        Rule(LinkExtractor(allow=r'browse/\d\/$'), follow=True),
    )

    def parse_item(self, response):
        i = SimpledesktopsItem()
        i['image_urls'] = [
            'http://{0}/{1}'.format(
                self.allowed_domains[0],
                response.xpath('//div[@class="desktop"]/a/@href').extract()[0])
        ]
        return i
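
This spider only returns an image_urls field, which suggests the project downloads the wallpapers with Scrapy's ImagesPipeline. A minimal settings sketch that would make the field do something (assumed; the original project's settings are not shown here):

# settings.py (sketch only)
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1,
}
IMAGES_STORE = '/tmp/simpledesktops'  # hypothetical download directory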
Code example #6
class ArticleSpider(CrawlSpider):
    name = 'articleItems'
    allowed_domains = ['wikipedia.org']
    start_urls = ['https://en.wikipedia.org/wiki/Benevolent_dictator_for_life']
    rules = [Rule(LinkExtractor(allow='(/wiki/)((?!:).)*$'),callback='parse_items', follow=True)]
    def parse_items(self, response):
        article = Article()
        article['url'] = response.url
        article['title'] = response.css('h1::text').extract_first()
        article['text'] = response.xpath('//div[@id="mw-content-text"]//text()').extract()
        lastUpdated = response.css('li#footer-info-lastmod::text').extract_first()
        article['lastUpdated'] = lastUpdated.replace('This page was last edited on ', '')
        return article
Code example #7
class CookingLightSpider(CrawlSpider):
    name = 'myrecipes'
    allowed_domains = ['myrecipes.com']
    start_urls = ["http://myrecipes.com/"]
    rules = (Rule(LinkExtractor(allow=".*/recipe/.*"), callback="parse_item"),
             Rule(
                 LinkExtractor(deny=[
                     ".*/how-to/video/.*", ".*/r/.*", ".*/about-us/.*",
                     ".*/contact-us/.*", ".*/frequently-asked-questions/.*",
                     ".*/press/.*", ".*/rss/.*", ".*/sitemap/.*"
                 ])))

    def __init__(self):
        super(CookingLightSpider, self).__init__()
        self.seen_recipes = set()

    def parse_item(self, response):
        item = RecipeItem()
        item['url'] = response.url
        if item['url'] not in self.seen_recipes:
            self.seen_recipes.add(item['url'])
            return item
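
This spider, like the cooks, spoonful, and steamykitchen spiders later in this collection, only sets a url field on RecipeItem. A minimal item definition that would satisfy all of them (assumed for illustration; the original project's items module is not shown) is:

import scrapy


class RecipeItem(scrapy.Item):
    # only 'url' is populated by the recipe spiders in this collection;
    # the real project presumably declares more fields
    url = scrapy.Field()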
Code example #8
File: cooks.py  Project: DickJ/W205-Final-Project
class CooksSpider(CrawlSpider):
    name = 'cooks'
    USER_AGENT = "Mozilla/5.0"
    allowed_domains = ['cooks.com']
    start_urls = ["http://www.cooks.com/rec/browse/"]
    rules = (
        Rule(LinkExtractor(allow=".*/recipe/[\w\d]{8}/[\w\d-]+\.html"),
              callback='parse_item'),
        Rule(LinkExtractor(allow=".*/rec/new_recipes_\w+\.html"))
    )

    def __init__(self):
        super(CooksSpider, self).__init__()
        self.seen_recipes = set()

    def parse_item(self, response):
        item = RecipeItem()
        item['url'] = response.url
        item['url'] = re.sub("\?.*", "", item['url'])
        if item['url'] not in self.seen_recipes:
            self.seen_recipes.add(item['url'])
            return item
Code example #9
class AIweeklySpider(CrawlSpider):
    name = "aiweekly"
    allowed_domains = ["aiweekly.co"]
    start_urls = [
        "http://aiweekly.co/issues/1#start",
    ]
    rules = [
        Rule(LinkExtractor(allow=r"/issues/[0-9]+#start"), "parse_item"),
        Rule(LinkExtractor(allow=r"/issues/[0-9]+"), "parse_item"),
    ]

    def parse_item(self, response):

        item = AiweeklyItem()

        item['title'] = response.xpath(
            r"//h3[@class='item__title']/a/text()").extract()

        item['link'] = response.xpath(
            r"//h3[@class='item__title']/a/@href").extract()

        return item
Code example #10
    def parse(self, response):
        le = LinkExtractor()
        for link in le.extract_links(response):
            url = urljoin(response.url, link.url)
            yield scrapy.Request(
                url,
                self.parse_link,
                meta={'splash': {
                    'args': {
                        'har': 1,
                        'html': 0
                    },
                }})
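
The snippet above drives Splash by putting the rendering arguments into the request's 'splash' meta key. With the scrapy-splash package installed and configured (SPLASH_URL plus its downloader middlewares), the same request is commonly written with SplashRequest; a sketch under that assumption:

# Sketch only: assumes scrapy-splash is installed and enabled in settings.py.
from urllib.parse import urljoin

from scrapy.linkextractors import LinkExtractor
from scrapy_splash import SplashRequest


def parse(self, response):
    # same spider method as above, rewritten with SplashRequest
    le = LinkExtractor()
    for link in le.extract_links(response):
        url = urljoin(response.url, link.url)
        # endpoint='render.json' with har=1/html=0 mirrors the meta arguments above
        yield SplashRequest(url, self.parse_link,
                            endpoint='render.json',
                            args={'har': 1, 'html': 0})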
Code example #11
class GFGSpider(CrawlSpider):
    name = 'geeksforgeeks'
    allowed_domains = ['geeksforgeeks.org']
    rules = (
        Rule(LinkExtractor(restrict_xpaths=('//a[@class="nextpostslink"]', )),
             follow=True),
        Rule(LinkExtractor(restrict_xpaths=('//h2[@class="post-title"]/a', )),
             callback='parse_item'),
    )

    def __init__(self,
                 category='tag',
                 name='dynamic-programming',
                 *args,
                 **kwargs):
        super(GFGSpider, self).__init__(*args, **kwargs)
        self.dest = "../geeksforgeeks-books/" + name + "/"
        self.start_urls = [
            'http://www.geeksforgeeks.org/' + category + '/' + name
        ]
        self.doc_name = name

    def parse_item(self, response):
        self.log('Hi, this is an item page! %s' % response.url)
        #item = GeeksforgeeksItem()
        #item['url'] = response.url
        if not os.path.exists(self.dest):
            os.makedirs(self.dest)
        with open(self.dest + 'metadata.xml', 'w') as metadata:
            metadata.write(
                '<dc:title>' + " ".join(self.doc_name.title().split('-')) +
                '</dc:title>\n<dc:language>en-US</dc:language>\n<dc:date opf:event="publication">2015-2-19</dc:date>\n<dc:rights>Creative Commons Attribution-NonCommercial-NoDerivs 2.5 India (CC BY-NC-ND 2.5 IN)</dc:rights>'
            )
        """
        call(['wget', '-O', self.dest + response.url.split('/')[-2] + ".html", response.url])
        """
        with codecs.open(self.dest + response.url.split('/')[-2] + ".html",
                         'w', 'utf-8') as file_handle:
            file_handle.write(response.body_as_unicode())
Code example #12
class PicSpider(CrawlSpider):
    name = "pic"
    allowed_domains = ["reddit.com"]  # domain only; a full URL here breaks offsite filtering
    start_urls = (
        'http://www.reddit.com/r/pics',
    )

    rules = [
        Rule(LinkExtractor(allow=['.*']))
    ]
    # '/r/pics/\?count=\d*&after=\w*

    # Note: defining parse() on a CrawlSpider overrides the built-in callback
    # that applies the rules, so the Rule above is never used.
    def parse(self, response):
        pass
Code example #13
class TechSpider(CrawlSpider):
    name = "news"
    allowed_domains = ["tech.163.com"]
    start_urls = ["http://tech.163.com/"]
    rules = [Rule(LinkExtractor(allow=("/17/\d+/\d+/*")), 'parse_item')]

    def parse_item(self, response):
        item = NewsItem()
        item['url'] = response.url
        ext = Extractor(rawPage=response.text, blockSize=5, image=False)
        print ext.getContext()
        item['content'] = ext.getContext()
        yield item
Code example #14
File: spoonful.py  Project: DickJ/W205-Final-Project
class SpoonfulSpider(CrawlSpider):
    name = 'spoonful'
    allowed_domains = ['spoonful.com', 'family.disney.com', 'disney.com']
    start_urls = ["http://family.disney.com/recipes"]
    rules = (
        Rule(LinkExtractor(allow=".*/recipes/page/.*")),
        Rule(LinkExtractor(allow=".*/recipes/.*",
                           deny=[".*/recipes/page/.*"]),
             callback='parse_item')

    )

    def __init__(self):
        super(SpoonfulSpider, self).__init__()
        self.seen_recipes = set()

    def parse_item(self, response):
        item = RecipeItem()
        item['url'] = response.url
        if item['url'] not in self.seen_recipes:
            self.seen_recipes.add(item['url'])
            return item
Code example #15
File: xpath_spider.py  Project: LunaBlack/the_spider
    def __init__(self):
        rs = ReadSetting()  # read the configured crawl settings
        self.start_urls = rs.readurl()
        self.linkmatrix = LinkMatrix(rs.projectname())
        self.linkmatrix.setroot(self.start_urls)

        self.allowed_domains = rs.readalloweddomain()
        self.xpath = rs.readxpath()
        self.rules = [Rule(LinkExtractor(), follow=True, callback="parse_start_url")]
        # Crawl rule: follow every URL; requests outside the allowed domains are filtered
        # out by the spider middlewares, and each resulting response is passed to parse_start_url.
        # Every Request goes through the spider middlewares.

        super(XpathSpider, self).__init__()
Code example #16
File: spider.py  Project: dekoder/LinkSpider
class UrlsSpider(CrawlSpider):
    name = "urls"
    allowed_domains = domains
    start_urls = [
        start_link,
    ]

    rules = (
        Rule(LinkExtractor(allow=()), callback='parse_item', follow = True),
    )

    def parse_item(self, response):
        pass
Code example #17
class ImgurScrappingSpider(CrawlSpider):
	name = 'ImgurScrapping'
	allowed_domains = ['imgur.com']
	start_urls = ['http://www.imgur.com']
	rules = [Rule(LinkExtractor(allow=['/gallery/.*']), 'parse_imgur')]

	def parse_imgur(self, response):
		image = ImgurscrappingItem()
		image['title'] = response.xpath(\
				"//h1/text()").extract()
		relative_address = response.xpath("//img/@src").extract()
		image['image_urls'] = ['http:' + relative_address[0]]
		return image
Code example #18
class SteamyKitchenSpider(CrawlSpider):
    name = 'steamykitchen'
    USER_AGENT = "Mozilla/4.0"
    allowed_domains = ['steamykitchen.com']
    start_urls = ["http://steamykitchen.com/category/recipes"]
    rules = (
        Rule(LinkExtractor(allow=".*\.com/\d+-.*\.html"),
              callback='parse_item'),
        Rule(LinkExtractor(allow=".*/category/recipes/page/\d+"))
    )

    def __init__(self):
        super(SteamyKitchenSpider, self).__init__()
        self.seen_recipes = set()

    def parse_item(self, response):
        item = RecipeItem()
        item['url'] = response.url
        item['url'] = re.sub("\?.*", "", item['url'])
        if item['url'] not in self.seen_recipes:
            self.seen_recipes.add(item['url'])
            return item
Code example #19
File: drinks_spider.py  Project: zaneu/cs373-idb
class DrinkSpider(CrawlSpider):
    """
    the DrinkSpider
    """

    name = "drinks"
    allowed_domains = ["drinksmixer.com"]

    start_urls = [
        "http://www.drinksmixer.com/cat/1/%d" % p for p in xrange(1, 125)
    ]

    rules = (Rule(LinkExtractor(allow='http://www\.drinksmixer\.com/.*\.html'),
                  callback='parse_drink'), )

    def parse_drink(self, response):
        """
        parsing the drinks
        note that this function has no notion of what the drink is (cocktail, shot)
        """

        drink = DrinkItem()
        soup = BeautifulSoup(response.body, "lxml")

        title = soup.find("title").contents[0].rstrip("recipe")
        recipe = soup.find("div", {"class": "RecipeDirections instructions"})
        description = soup.find("div", {"class": "summary RecipeDirections"})
        ingredients = soup.find("div", {"class": "ingredients"})

        drink['ingredients'] = {}

        if ingredients:
            for ingredient in ingredients.find_all("span",
                                                   {"class": "ingredient"}):
                if ingredient.contents:
                    value = ingredient.find("span", {
                        "class": "amount"
                    }).contents[0]
                    key = ingredient.find("span", {"class": "name"})
                    key = key.find("a")
                    key = key['href']
                    key = re.findall(r'\d+', key)[0]

                    drink['ingredients'][key] = value

        drink['name'] = title.rstrip()
        drink['recipe'] = recipe.contents[0].rstrip() if recipe else ""
        drink['description'] = description.contents[0].rstrip(
        ) if description else ""

        return drink
Code example #20
class ZZSpider(CrawlSpider):
    name = "zz_bagirov"
    allowed_domains = ["bagirov.livejournal.com"]
    start_urls = ["http://bagirov.livejournal.com/446032.html"]

    rules = (Rule(LinkExtractor(
        allow=('http://bagirov.livejournal.com/\d+\.html', ),
        deny=('tag', 'reply', 'thread', 'page'),
    ),
                  callback='parse_page',
                  follow=True), )

    def parse_start_url(self, response):
        # parse the start URL itself as a regular page
        return list(self.parse_page(response))

    def parse_page(self, response):

        # use scrapy shell to find xpath
        # from scrapy.shell import inspect_response
        # inspect_response(response)

        item = ScraperItem()
        item['url'] = response.url

        try:
            item['title'] = response.xpath(
                "//div[@class='asset-header-content-inner']/h2/a/text()"
            ).extract()[0]
        except IndexError:
            item['title'] = ""

        try:
            item['text'] = " ".join(
                response.xpath(
                    "//div[@class='asset-content']/child::node()").extract())
        except IndexError:
            item['text'] = ''

        try:
            item['date'] = response.xpath(
                "//abbr[@class='datetime']/text()").extract()[0]
        except IndexError:
            item['date'] = ''

        try:
            item['comment_count'] = response.xpath(
                "//div[@class='comments-nav']/text()").extract()[0]
        except IndexError:
            item["comment_count"] = "0"

        yield item
Code example #21
class ForumsSpider(CrawlSpider):
    name = "nosurrender"
    allowed_domains = ["nosurrenderbreastcancersupportforum.com"]
    start_urls = [
        "http://www.nosurrenderbreastcancersupportforum.com/",
    ]

    rules = (
        # Rule to go to the single product pages and run the parsing function
        # Excludes links that end in _W.html or _M.html, because they point to
        # configuration pages that aren't scrapeable (and are mostly redundant anyway)
        Rule(LinkExtractor(
            restrict_xpaths=
            '//td[contains(@valign,"top")]/table[contains(@class,"tables")]//a[contains(@class,"forum")]',
        ),
             callback='internallist'))

    # Rule to follow arrow to next product grid

    # https://github.com/scrapy/dirbot/blob/master/dirbot/spiders/dmoz.py
    # https://github.com/scrapy/dirbot/blob/master/dirbot/pipelines.py
    def internallist(self, response):
        links = response.xpath(
            'id("main_container")/div[2]/form[1]/table/tbody/tr[1]/td/table/tbody/tr/td[2]/a/@href'
        ).extract()
        for link in links:
            yield Request(link, callback=self.parsePostsList)

    def parsePostsList(self, response):
        sel = Selector(response)
        html = response.body
        soup = BeautifulSoup(html)
        users = soup.findAll('a', {'class': re.compile('usergroup\d.*')})
        items = []
        topic = response.xpath(
            '//tbody/tr[2]/td[2]/table/tbody/tr[1]/td/div/b').extract()
        url = response.url
        for x in range(len(users)):
            item = PostItemsList()
            item['author'] = users[x].text
            item['author_link'] = users[x]['href']
            item['create_date'] = soup.findAll(
                'span', {'id': re.compile('posted_date_.*')})[x].text
            item['post'] = soup.findAll(
                'span', {'id': re.compile('post_message.*')})[x].text
            item['tag'] = 'cancer'
            item['topic'] = topic
            item['url'] = url
            logging.info(str(item))
            items.append(item)
        return items
Code example #22
class ZZSpider(CrawlSpider):
    name = "zz_pesen-net"
    allowed_domains = ["pesen-net.livejournal.com"]
    start_urls = [
        "http://pesen-net.livejournal.com/82406.html"
        # adult: http://pesen-net.livejournal.com/71163.html
        #"http://pesen-net.livejournal.com/70709.html"
    ]

    rules = (Rule(LinkExtractor(
        deny=('tag', 'reply', 'thread', 'page'),
        restrict_xpaths=("//span[@class='entry-linkbar-inner']"),
    ),
                  callback='parse_page',
                  follow=True), )

    def parse_start_url(self, response):
        # parse the start URL itself as a regular page
        return list(self.parse_page(response))

    def parse_page(self, response):

        # use scrapy shell to find xpath
        #from scrapy.shell import inspect_response
        # inspect_response(response)

        item = ScraperItem()
        item['url'] = response.url

        try:
            item['title'] = response.xpath(
                "//dt[@class='entry-title']/text()").extract()[0]
        except IndexError:
            item['title'] = ""

        item['text'] = " ".join(
            response.xpath(
                "//div[@class='entry-content']/child::node()").extract())

        try:
            item['date'] = response.xpath(
                "//abbr[@class='updated']/text()").extract()[0]
        except IndexError:
            item['date'] = ''

        try:
            item['comment_count'] = response.xpath(
                "//span[@class='comments-count']/text()").extract()[0]
        except IndexError:
            item["comment_count"] = "0"

        yield item
Code example #23
class ExampleSpider(CrawlSpider):
    name = "souhunews"
    allowed_domains = ["business.sohu.com"]
    start_urls = ['http://business.sohu.com/']
    rules=(
        Rule(LinkExtractor(allow=r"/20161212+/*"),
        callback="parse_news",follow=True),
    )
    def printcn(self, uni):
        # debug helper: print each element of a unicode sequence encoded as UTF-8
        for i in uni:
            print i.encode('utf-8')
    def parse_news(self,response):
        item = FinancesouhuItem()
        item['news_thread']=response.url.strip().split('/')[-1][:-6]
        # self.get_thread(response,item)
        self.get_title(response,item)
        self.get_time(response,item)
        self.get_url(response,item)
        self.get_news_from(response,item)
        self.get_text(response,item)
        # remember to return the item after parsing
        return item
    def get_title(self,response,item):
        title=response.xpath("/html/head/title/text()").extract()
        if title:
            # print 'title:'+title[0][:-5].encode('utf-8')
            item['news_title']=title[0][:-5]

    def get_time(self,response,item):
        time=response.xpath("//div[@id='pubtime_baidu']/text()").extract()
        if time:
            # print 'time'+time[0][:-5].encode('utf-8')
            item['news_time']=time[0]

    def get_news_from(self,response,item):
        news_from=response.xpath("//span[@id='media_span']/span/text()").extract()
        if news_from:
            # print 'from'+news_from[0].encode('utf-8')     
            item['news_from']=news_from[0]

    def get_text(self,response,item):
        news_body=response.xpath("//div[@id='contentText']/div[1]/p/text()").extract()
        if news_body:
            # for  entry in news_body:
            #   print entry.encode('utf-8')
            item['news_body']=news_body 
    def get_url(self,response,item):
        news_url=response.url
        if news_url:
            #print news_url 
            item['news_url']=news_url
Code example #24
File: tiwag_spider.py  Project: deepgeorge/kalipo
class TiwagSpider(CrawlSpider):
    name = "tiwag"
    allowed_domains = ["dietiwag.org"]
    start_urls = [
        "http://www.dietiwag.org/phorum_2/list.php?f=2",
        "http://www.dietiwag.org/phorum_2/read.php?f=2&i=120470&t=120470"
    ]

    rules = (
        Rule(LinkExtractor(allow=('list.php'))),
        Rule(LinkExtractor(allow=('read\.php', )), callback='parse_item'),
    )

    def clean_str(self, val):
        return val.replace(u'\xa0', u'').replace('\n', '').strip()

    def parse_item(self, response):
        sel = Selector(response)
        comments = sel.xpath('//font[@class="PhorumMessage"]')
        items = []
        for one in comments:
            item = Comment()
            item['url'] = response.url
            texts = one.xpath('text()').extract()
            item['author'] = self.clean_str(texts[0]).replace('Autor:', '')
            item['text'] = self.clean_str(' '.join(texts[2:len(texts)]))
            try:
                item['date'] = calendar.timegm(
                    time.strptime(
                        self.clean_str(texts[1]).replace('Datum:', ''),
                        "%d-%m-%y %H:%M"))
            except ValueError:
                print "Error: " + ' '.join(texts)
                continue

            items.append(item)

        return items
Code example #25
class zqbSpider(CrawlSpider):
    name = site_name
    allowed_domains = [
        "cankaoxiaoxi.com",
    ]
    start_urls = url_list

    rules = (Rule(
        LinkExtractor(allow=('/\d{4}/\d{4}/\S*\.shtml')),
        callback='parse_data',
        follow=True,
    ),
             Rule(LinkExtractor(
                 allow=('/history/index/\d{4}-\d{2}/\d{2}-\d{2}\.shtml')),
                  follow=True))

    def parse_data(self, response):
        # get the publish time and store the files by year/month
        year = response.url.split('/')[3]
        month = response.url.split('/')[4][0:2]
        path = data_dir + '/' + year + '/' + month
        if not os.path.exists(path):
            os.makedirs(path)
        # Get the title
        if response.xpath('//h1/text()').extract():
            title = response.xpath('//h1/text()').extract()[0]
        else:
            title = response.xpath('//h2/text()').extract()[0]
        # get the content
        content_list = response.xpath(
            '//div[@id="ctrlfscont"]//p/text()').extract()
        content = "".join(content_list).strip().encode("utf-8")
        # If the title or the content is empty, it means we got the wrong page,
        # so do not create the file
        if title and content:
            filename = path + '/' + title + '.txt'
            with open(filename, 'wb') as f:
                f.write(content)
Code example #26
class AdidasSpider(CrawlSpider):
    name = "adidas"
    allowed_domains = ["adidas.com"]
    start_urls = [
        "http://www.adidas.com/us/shoes",
    ]

    rules = (
        # Rule to go to the single product pages and run the parsing function
        # Excludes links that end in _W.html or _M.html, because they point to
        # configuration pages that aren't scrapeable (and are mostly redundant anyway)
        Rule(LinkExtractor(
            restrict_xpaths='//a[contains(@class,"product-link")]',
            deny=('_[WM]\.html', )),
             callback='singleProductParse'),
        # Rule to follow arrow to next product grid
        Rule(LinkExtractor(
            restrict_xpaths='//li[@class="pagging-arrow right-arrow"]'),
             follow=True),
    )

    # Function to parse information from a single product page
    def singleProductParse(self, response):
        item = ProductItem()
        item['brand'] = 'Adidas'
        item['name'] = response.css('.title-32').xpath('text()').extract()[0]
        desc = response.css('.title-16').xpath('text()').extract()[0].strip()
        try:
            item['division'], item['category'] = desc.split(" ", 1)
        except ValueError:
            item['category'] = desc
            item['division'] = 'None'
        item['division'] = item['division'].replace("'s", "")
        item['price'] = response.css('span.sale-price').xpath(
            'text()').extract()[0].strip()
        item['image_link'] = response.css(
            'img.productimagezoomable::attr(src)').extract()[0]
        return item
Code example #27
class MemriseSpider(CrawlSpider):
    name = "memrise"
    allowed_domains = ["memrise.com"]
    start_urls = [
        "http://www.memrise.com/login/",
    ]
    rules = (
        Rule(LinkExtractor(
            allow='login',
        ), callback='do_login'),
    )

    def do_login(self, response):
        args, url, method = fill_login_form(response.url, response.body,
            USER, PASSWORD)
        return scrapy.FormRequest(url, method=method, formdata=args,
                callback=self.parse_dashboard)

    def parse_dashboard(self, response):
        # If the course has several level, scrape them
        pagination_selector = '//div[contains(@class, "title")]/a/@href'
        for url in response.xpath(pagination_selector).extract():
            yield scrapy.Request(urljoin('http://www.memrise.com', url), callback=self.parse_level)

    def parse_level(self, response):
        pagination_selector = '//div[contains(@class, "levels")]/a/@href'
        for url in response.xpath(pagination_selector).extract():
            yield scrapy.Request(urljoin('http://www.memrise.com', url), callback=self.parse_level)

        course = response.xpath('//h1[contains(@class, "course-name")]/text()').extract()[0]
        for sel in response.xpath('//div[contains(@class, "thing text-text")]'):
            item = MemriseItem()
            item['course'] = course
            item['item_id'] = sel.xpath('@data-thing-id').extract()[0]
            status = sel.xpath('div/div[contains(@class, "status")]/text()').extract()
            if not status:
                status = "not learnt"
            elif status == ["now"]:
                status = "now"
            elif status == ['in about a day']:
                status = [1, 'day']
            elif status == ['in about an hour']:
                status = [1, 'hour']
            elif status == ['in about a minute']:
                status = [1, 'minute']
            else:
                status = status[0].split()[1:]
                status[0] = int(status[0])
            item['status'] = status
            yield item
Code example #28
class IdealistaSpider(CrawlSpider):
    name = "idealista"
    allowed_domains = ["idealista.com"]
    ########################################################################
    ###       Add the url to crawl in the start_urls variable           ###
    ########################################################################
    #start_urls = ["https://www.idealista.com/venta-viviendas/leganes/el-carrascal/"]
    #start_urls = ['https://www.idealista.com/alquiler-viviendas/madrid/zona-norte/']

    start_urls = ['https://www.idealista.com/venta-viviendas/madrid/carabanchel/']

    rules = (
            # Filter all the flats paginated by the website following the pattern indicated
            Rule(LinkExtractor(restrict_xpaths=("//a[@class='icon-arrow-right-after']")),
                 callback='parse_flats',
                 follow=True),
        )

    def parse_flats(self, response):

        # Necessary in order to create the whole link towards the website
        default_url = 'http://idealista.com'

        info_flats_xpath = response.xpath("//*[@class='item-info-container']")
        prices_flats_xpath = response.xpath("//*[@class='row price-row clearfix']/span[@class='item-price']/text()")
        discounts_xpath = response.xpath("//*[@class='row price-row clearfix']")

        links = [str(''.join(default_url + link.xpath('a/@href').extract().pop()))
                 for link in info_flats_xpath]

        prices = [float(flat.extract().replace('.', '').strip())
                  for flat in prices_flats_xpath]

        discounts = [0 if len(discount.xpath("./*[@class='item-price-down icon-pricedown']/text()").extract()) < 1
                     else discount.xpath("./*[@class='item-price-down icon-pricedown']/text()").extract()
                     .pop().replace('.', '').strip().split(' ').pop(0)
                     for discount in discounts_xpath]

        addresses = [address.xpath('a/@title').extract().pop().encode('iso-8859-1')
                     for address in info_flats_xpath]

        rooms = [int(flat.xpath('span[@class="item-detail"]/small[contains(text(),"hab.")]/../text()').extract().pop().strip())
                 if len(flat.xpath('span[@class="item-detail"]/small[contains(text(),"hab.")]')) == 1 else None
                 for flat in info_flats_xpath]

        sqfts_m2 = [float(flat.xpath('span[@class="item-detail"]/small[starts-with(text(),"m")]/../text()').extract().pop().replace('.', '').strip())
                    if len(flat.xpath('span[@class="item-detail"]/small[starts-with(text(),"m")]')) == 1 else None
                    for flat in info_flats_xpath]

        for flat in zip(links, prices, addresses, discounts, sqfts_m2, rooms):
            item = IdealistaItem(date=datetime.now().strftime('%Y-%m-%d'),
                                 link=flat[0], price=flat[1], address=flat[2], discount=flat[3],
                                 sqft_m2=flat[4], rooms=flat[5])
            yield item

    #Overriding parse_start_url to get the first page
    parse_start_url = parse_flats
Code example #29
class PttSpider(CrawlSpider):
    name = "playspider"
    allowed_domains = ["play.google.com"]
    start_urls = [
       "https://play.google.com/store/apps/details?id=mobi.infolife.ezweather.widget.batteryandweather",
        "https://play.google.com/store/apps/details?id=mobi.infolife.ezweather.widget.localweatherapp",
       #  "https://play.google.com/store/apps/details?id=mobi.infolife.ezweather"

    ]
    rules = [
        Rule(LinkExtractor(allow=(r"id=mobi.infolife.ezweather.widget.(batteryandweather|localweatherapp)")), callback='parse_app',
             follow=False)
    ]  # CrawlSpider crawls pages that match the rules and calls the callback to process them: .widget.(batteryandweather|localweatherapp)
Code example #30
class GloboEsporteSpider(CrawlSpider):
    name = "globoesporte"
    allowed_domains = ["globoesporte.globo.com"]
    start_urls = ["http://globoesporte.globo.com/futebol/brasileirao-serie-a/noticia/plantao.html"]
    rules = [Rule(LinkExtractor(allow=('./noticia/.', )), follow=True, callback='parse_ge')]

    def parse_ge(self, response):
        for comentario in response.xpath('//div[@class="glbComentarios-conteudo-interno"]'):
            item = GloboEsporteItem()
            item['titulo'] = comentario.xpath('//title/text()').extract()
            item['autor'] = comentario.xpath('div/strong/text()').extract()
            item['texto'] = comentario.xpath('p[@class="glbComentarios-texto-comentario"]/text()').extract()
            item['link'] = response.url
            yield item