Example No. 1
    def get_items_and_pagination(self, response):
        request_again = self.error_handler(response)
        if request_again:
            yield request_again
            return
        items_extractor = LinkExtractor(deny=[r'\/image\/', r'\/map'],
                                        restrict_xpaths='.//div[@class="itemInfo"]/h2')
        items_links = items_extractor.extract_links(response)
        for link in items_links:
            yield Request(url=link.url, callback=self.parse_item)
        if response.xpath('.//a[@class="next"]').extract():
            total_quantity = response.xpath(
                '(.//div[@class="pageResults"]/span[@class="results"]'
                '/text()[normalize-space()])[2]').re(r'\d+')
            page_range = []
            if total_quantity:
                total_quantity = int(total_quantity[0])
                pages = total_quantity // 25
                page_range = range(1, pages + 2)

            category = cond_set_value(response.xpath(
                './/input[@id="FrmWho"]/@value').extract())
            quoted_category = quote_plus(category)
            for page in page_range:
                next_url = self.pagination_pattern.format(prase=quoted_category,
                                                          page=page)
                yield Request(url=next_url, headers=self.pagination_headers,
                              dont_filter=True, method='POST',
                              callback=self.parse_pagination)
 def get_companies_links(self, response):
     companies_link_extractor = LinkExtractor(allow=r'\/company_\d{5,7}')
     companies_links = companies_link_extractor.extract_links(response)
     for link in companies_links:
         yield Request(url=link.url,
                       callback=self.parse_item,
                       # cookies=None,
                       meta={'category': response.meta.get('category')})
    def _extract_links(self, response, params):
        """ parse links from response
            @return hrefs
        """

        params['allow_domains'] = tuple(self.allowed_domains)
        link_extractor = LinkExtractor(**params)
        return link_extractor.extract_links(response)
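Both get_items_and_pagination above and parse_categories in Example No. 8 below defer to an error_handler helper that is not part of these snippets; a minimal sketch of such a helper, with the retry condition entirely assumed, could be:

    def error_handler(self, response):
        # Hypothetical helper (not from the original spider): re-issue the
        # request when the server answered with an error status, otherwise
        # return None so normal parsing continues.
        if response.status >= 400:
            return response.request.replace(dont_filter=True)
        return None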
Example No. 4
 def parse(self, response):
     le = LinkExtractor()
     for link in le.extract_links(response):
         url = urljoin(response.url, link.url)
         yield scrapy.Request(url, self.parse_link, meta={
             'splash': {
                 'args': {'har': 1, 'html': 0},
             }
         })
 def get_categories(self, response):
     # http://www.construction.co.uk/double-glazing-repairs/category_33.htm
     link_extractor = LinkExtractor(allow=r'\/category_\d+')
     links = link_extractor.extract_links(response)
     for link in links:
         category = link.text
         yield Request(url=link.url,
                       callback=self.get_companies_links_by_letter,
                       meta={'category':category})
 def get_companies_links_by_letter(self, response):
     # http://www.construction.co.uk/heating-contractors-and-consultants/22_A.htm
     letter_link_extractor = LinkExtractor(allow=r'\/\d+_[A-Z].htm')
     links_by_letter = letter_link_extractor.extract_links(response)
     if links_by_letter:
         for link in links_by_letter:
             yield Request(url=link.url,
                           callback=self.get_companies_links,
                           meta={'category': response.meta.get('category')})
     else:
         # there is no letters pagination at the page
         for request in self.get_companies_links(response):
             yield request
    def parse_items_links(self, response):
        categories_links_extractor = LinkExtractor(restrict_xpaths='.//div[@class="categoryListContainer"]')
        cat_links = categories_links_extractor.extract_links(response)
        for link in cat_links:
            yield Request(url=link.url, callback=self.parse_items_links)

        items_links_extractor = LinkExtractor(restrict_xpaths='.//div[@class="directory-listing"]/h3')
        items_links = items_links_extractor.extract_links(response)
        for link in items_links:
            yield Request(url=link.url, callback=self.parse_item)

        pagination_link = cond_set_value(response.xpath('.//a[@class="more"]/@href').extract())
        if pagination_link:
            full_pagination_link = urljoin(self.start_urls[0], pagination_link)
            yield Request(url=full_pagination_link, callback=self.parse_items_links)
Example No. 8
 def parse_categories(self, response):
     request_again = self.error_handler(response)
     if request_again:
         yield request_again
         return
     categories_extractor = LinkExtractor(
         restrict_xpaths='.//ul[@class="popTermsList"]')
     categories_links = categories_extractor.extract_links(response)
     for link in categories_links:
         yield Request(url=link.url, callback=self.get_items_and_pagination)
     letters_extractor = LinkExtractor(
         restrict_xpaths='.//div[@class="popTermsNavBar"]')
     letters_links = letters_extractor.extract_links(response)
     for link in letters_links:
         yield Request(url=link.url, callback=self.parse_categories)
Example No. 9
class tripadvisorSpider(CrawlSpider):
    name = "istanbulrestorant"
    download_delay = 1
    allowed_domains = ["tripadvisor.com.tr"]
    start_urls = []

    def my_range(start, end, step):
        while start <= end:
            yield start
            start += step

    for i in my_range(30, 10950, 30):
        start_urls.append(
            'http://www.tripadvisor.com.tr/Restaurants-g293974-oa' + str(i) +
            '-Istanbul.html')
    rules = [Rule(LinkExtractor(allow=['/Restaurant_Review.*']), 'parse_page')]

    def parse_page(self, response):

        item = items.IstanbulrestorantscrapItem()
        item['isim'] = response.xpath(
            '//*[@id="BREADCRUMBS"]/li[last()]/text()')[0].extract()
        item['adres'] = response.xpath(
            '//*[@id="ABOVE_THE_FOLD"]/div[1]/div[2]/div/div[2]/div/div[1]/div/address/span/span/span[1]/text()'
        )[0].extract()
        item['tel'] = response.xpath(
            '//*[@id="ABOVE_THE_FOLD"]/div[1]/div[2]/div/div[2]/div/div[1]/div/div/div/div[1]/div/text()'
        )[0].extract()

        yield item
Example No. 10
class NewyorkCrawlerSpider(CrawlSpider):
    name = 'newyorkcrawler'
    idx = 0
    allowed_domains = ['www.nytimes.com']
    start_urls = ['https://www.nytimes.com/section/world/europe']
    rules = (Rule(LinkExtractor(allow=[r'\d{4}/\d{2}/\d{2}/[^/]+']),
                  callback="parse_item",
                  follow=True), )

    def parse_item(self, response):
        self.log("Scraping: " + response.url)

        item = NytItem()
        item['url'] = response.url
        a = Article(response.url)
        # According to the library's source, download() does not open a network connection when input_html is not None
        a.download(input_html=response.text)
        a.parse()
        item['title'] = a.title
        item['authors'] = a.authors
        item['body'] = a.text
        # TODO: add tags

        f = open('articles/%d-%s' % (self.idx, a.title), 'w+', encoding='utf8')
        f.writelines(a.authors)
        f.write("\n" + response.url + "\n")
        f.write(a.text)
        f.close()
        self.idx += 1
        return item
Example No. 11
class CsdnSpider(CrawlSpider):
    name='csdn'
    allowed_domains=['csdn.net']
    start_urls=['http://blog.csdn.net/qq_35037977/article/list/1']
    rules=[
            Rule(LinkExtractor(allow=r'/qq_35037977/article/list/\d+',restrict_xpaths="//a[contains(., %s)]"%nextpage),follow=True),
            Rule(LinkExtractor(allow=r'/qq_35037977/article/details/\d+',restrict_css='.link_title'),callback='parse_csdn')
    ]

    def parse_csdn(self,response):
        l=ItemLoader(item=CsdnItem(),response=response)
        l.add_css('title','#article_details .link_title a::text')
        l.add_value('link',response.url)
        l.add_css('posttime','.article_r .link_postdate::text')
        l.add_css('views','.article_r .link_view::text')
        yield l.load_item()
Example No. 12
class MovieSpider(CrawlSpider):
    name = "movie"
    allowed_domains = ["douban.com"]
    start_urls = ["http://movie.douban.com/tag/%s?type=S" % one_tag]
    index = 0
    rules = (
        # Extract pagination links matching 'http://movie.douban.com/tag/爱情'
        # Rule(LinkExtractor(allow=(('tag/%s.start=' % quote(one_tag.encode("utf-8"))), ))),
        # Extract links matching 'subject/\d+' and parse them with the spider's parse_item method
        Rule(LinkExtractor(allow=('subject/\d+', )), callback='parse_item'), )

    def parse_item(self, response):
        item = MovieItem()
        item['title'] = response.xpath(
            "//div[@id='content']/h1/span[1]/text()").extract()[0]
        item['url'] = response.url
        try:
            item['desc'] = response.xpath(
                "//div[@id='link-report']/span/text()").extract()[0].strip()
        except:
            item['desc'] = ''
        try:
            item['score'] = response.xpath(
                "//strong[@class='ll rating_num']/text()").extract()[0]
        except:
            item['score'] = 0
        item['image_urls'] = response.xpath(
            "//div[@id='mainpic']/a[@class='nbgnbg']/img/@src").extract()

        print item['title'], item['score'], item['url'], item['desc']
        yield item
Example No. 13
class ZZSpider(CrawlSpider):
    name = "zz_gunter-spb"
    allowed_domains = ["livejournal.com"]
    start_urls = [
        #"http://tanyant.livejournal.com/118267.html"
        #"http://gunter-spb.livejournal.com/14196.html"
        #"http://gunter-spb.livejournal.com/2387127.html"
        #"http://gunter-spb.livejournal.com/610032.html"
        "http://gunter-spb.livejournal.com/599654.html"
    ]

    rules = (
        Rule(
            LinkExtractor(
                #allow=('http://tanyant.livejournal.com/\d+\.html',),
                deny=('tag', 'reply', 'thread', 'page'),
                # xpath for snorapp, tanyant
                # restrict_xpaths=('//a[@title="Previous"]')
                restrict_xpaths=("//i[@class='b-controls-bg']/parent::a"),
            ),
            callback='parse_page',
            follow=True), )

    def parse_page(self, response):

        # use scrapy shell to find xpath
        #from scrapy.shell import inspect_response
        #inspect_response(response)

        item = ScraperItem()
        item['url'] = response.url

        try:
            item['title'] = response.xpath('//h1/text()').extract()[0]
        except IndexError:
            item['title'] = ""

        try:
            item['text'] = " ".join(
                response.xpath('//article[2]/child::node()').extract())

        except IndexError:

            item['text'] = ''

        try:
            time = response.xpath("//time[1]/text()[3]").extract()[0]
            date = response.xpath("//time[1]/a/text()").extract()
            date.append(time)
            item['date'] = date
        except IndexError:
            item['date'] = ''

        try:
            item['comment_count'] = response.xpath(
                '//span[@class="js-amount"]/text()').extract()[0]
        except IndexError:
            item['comment_count'] = '0'

        yield item
Example No. 14
class Mess(CrawlSpider):
    My_Tree.objects.all().delete()
    name = 'mess'
    allowed_domains = ["localhost"]
    start_urls = [
        start_url(),
    ]
    rules = [
        Rule(
            LinkExtractor(restrict_xpaths=('//a[@class="next"]')),
            callback='parse_item',
            follow=True,
        )
    ]

    def parse_item(self, response):
        hxs = response
        item = MyScrapyItem()
        item['name'] = hxs.xpath(
            '//*[@id="content"]/a/button/text()').extract()
        item['url'] = response.url
        item['link'] = hxs.xpath('//a/@href').extract()
        s = My_Tree(url=item['url'], link=item['link'], name=item['name'])
        s.save()
        print item
        return item
Example No. 15
class ExampleSpider(CrawlSpider):
    name = 'pic'
    allowed_domains = ['www.zhihu.com']
    start_urls = ['https://www.zhihu.com/topic/19643259/hot']
    #item = ZhihuItem()

    rules = [
        Rule(LinkExtractor(allow=['/question/.*','/people/.*']),
        callback = 'parse_item',
        follow = True)
    ]


    def parse_item(self, response):
        #print response.css('h1').extract()
        image = ZhihuItem()
        #head_url = re.sub(r'_l\.', '.', ''.join(response.css('.body .Avatar--l::attr(src)').extract()))


        image['title'] = response.xpath("//img/@alt").extract()
        #image['image_urls'] = arr
        rel = response.xpath("//img/@srcset").extract()
        #print 'adsfaksjdfawp9eifh', rel
        #print 'asdkfaiehfkajsbdfj', rel[0]

        for i in range(len(rel)):
            rel[i] = re.sub(' 2x','',rel[i])
        image['image_urls'] = rel

        #rel[0] = re.sub(' 2x','',rel[0])
        #image['image_urls'] = [rel[0]]
        #print image['image_urls']
        return image
Example No. 16
class tiebaSpider(CrawlSpider):
    name = "tieba"
    allowed_domains = ["baidu.com"]
    start_urls = ["http://tieba.baidu.com/p/5051125142"]
    rules = [
        Rule(LinkExtractor(allow=("/5051125142\?pn=(\d)")),
             follow=True,
             callback='parse_cont'),
    ]

    def parse11(self, response):
        pass

    def parse_cont(self, response):
        items = []
        select = Selector(response)
        text = select.css('div.l_post.j_l_post.l_post_bright')
        # authors = select.css('div.d_author')
        # contents = select.css('div.d_post_content_main')
        for post in text:
            item = BaiduttItem()
            item['name'] = post.css('.d_author ul li.d_name a').xpath(
                'text()').extract()
            item['content'] = post.css('.d_post_content_main cc div').xpath(
                'text()').extract()
            items.append(item)
        return items
Example No. 17
    def __init__(self, *args, **kwargs):
        # run using: scrapy crawl xss_spider -a url='http://example.com'
        super(XSSspider, self).__init__(*args, **kwargs)
        self.start_urls = [kwargs.get('url')]
        hostname = urlparse(self.start_urls[0]).hostname
        # With subdomains
        self.allowed_domains = [hostname]  # wrapping the hostname in a list lets the spider crawl its subdomains
        self.delim = '1zqj'
        # semi colon goes on end because sometimes it cuts stuff off like
        # gruyere or the second cookie delim
        self.test_str = '\'"(){}<x>:/'

        # Login details
        self.login_user = kwargs.get('user')
        if self.login_user == 'None':
            self.login_user = None
        else:
            # Don't hit links with 'logout' in them since self.login_user exists
            self.rules = (Rule(LinkExtractor(deny=('logout')), callback='parse_resp', follow=True), )
        if kwargs.get('pw') == 'None' and self.login_user is not None:
            self.login_pass = raw_input("Please enter the password: ")
        else:
            self.login_pass = kwargs.get('pw')

        # HTTP Basic Auth
        self.basic_auth = kwargs.get('basic')
        if self.basic_auth == 'true':
            self.http_user = self.login_user
            self.http_pass = self.login_pass
Example No. 18
class MovieSpider(CrawlSpider):
    name='vDown'
    allowed_domains=['91porn.it']
    searchText='我'
    start_urls=['http://www.91porn.it/search?search_query='+urllib.quote(searchText)+'&search_type=videos']
    rules=[
           #Rule(LinkExtractor(allow=(r'http://movie.douban.com/top250\?start=\d+.*'))),
           #Rule(LinkExtractor(allow=(r'class=\"well well-sm[\s\S]*?href=\"(\S+?)\"[\s\S]*?img[\s]src=\"(\S+?)\"')),callback='parse_item'),
           #Rule(LinkExtractor(allow=(r'http://www.91porn.it/video/\d+"')),callback='parse_item')
           Rule(LinkExtractor(allow=(r'/video/\d+')), callback='parse_item')
           
    ]
    count=2
    
    def parse_item(self,response):
        item=MyItem()
        item['url']=response.url
        
        print(response.url)
        
        videoPat = r'\<video[\s\S]*?poster=\"(\S+?)\"[\s\S]*\<source\s+src=\"(\S+?)\"'

        # use re.search: the <video> tag is not at the start of the response body
        m = re.search(videoPat, response.body)
        if m:
            print(m.groups())
        
        #item['file_urls']=response.xpath("//img/@src").extract()
        return item
Example No. 19
class Image1Spider(CrawlSpider):
    name = "image1"
    allowed_domains = [
        "developers.google.com",
        "developer.chrome.com",
        "developer.android.com",
        "cloud.google.com",
        "googledevelopers.blogspot.com",

        #"appurify.com",
        #"www.chromium.org",
        #"www.firebase.com",
        #"golang.org",
        #"www.html5rocks.com",
        #"www.stackdriver.com",
        "www.dartlang.org",
        "developer.nest.com",
        "www.polymer-project.org",
    ]
    start_urls = ('https://developers.google.com/', )

    rules = [
        Rule(LinkExtractor(allow=[r'.*']), callback='parse_item', follow=True)
    ]

    def parse_item(self, response):
        content = scrapy.Selector(response=response).xpath('//body')

        for node in content:
            item = GioItem()
            item['url'] = response.url
            item['image_urls'] = node.xpath('//img/@src').extract()

            yield item
Example No. 20
class tripadvisorSpider(CrawlSpider):
    name = "karadenizOtel"
    download_delay = 1
    allowed_domains = ["tripadvisor.com.tr"]
    start_urls = []

    def my_range(start, end, step):
        while start <= end:
            yield start
            start += step

    for i in my_range(0, 180, 30):
        start_urls.append('http://www.tripadvisor.com.tr/Hotels-g673665-oa' +
                          str(i) + '-Turkish_Black_Sea_Coast-Hotels.html')
    rules = [Rule(LinkExtractor(allow=['/Hotel_Review.*'], ), 'parse_page')]

    def parse_page(self, response):

        item = items.KaradenizotelscrapItem()
        item['isim'] = response.xpath('//*[@id="HEADING"]/text()')[1].extract()
        item['adres'] = response.xpath(
            '//*[@id="HEADING_GROUP"]/div/div[3]/address/div[1]/span/span[1]/text()'
        )[0].extract()
        item['sehir'] = response.xpath(
            '//*[@id="BREADCRUMBS"]/li[4]/a/span/text()')[0].extract()
        item['ilce'] = response.xpath(
            '//*[@id="BREADCRUMBS"]/li[5]/a/span/text()')[0].extract()

        yield item
Example No. 21
class PicSpider(CrawlSpider):
    name = "first"
    allowed_domains = ["www.reddit.com"]
    start_urls = ['http://www.reddit.com/r/paypal/']

    rules = [
        Rule(LinkExtractor(allow=['/r/paypal/\?count=\d*&after=\w*']),
             callback='parse_item',
             follow=True)
    ]

    # rules = [
    #     # Traverse the in the /r/pics subreddit. When you don't pass
    #     # callback then follow=True by default.
    #     # It's also important to NOT override the parse method
    #     # the parse method is used by the CrawlSpider continuously extract links
    #     Rule(LinkExtractor(
    #     	allow=['/r/pics/\?count=\d*&after=\w*']),
    #     	callback='parse_item',
    #     	follow=True),
    # ]

    def parse_item(self, response):

        selector_list = response.css('div.thing')

        for selector in selector_list:
            item = PicItem()

            item['title'] = selector.xpath('div/p/a/text()').extract()
            item['url'] = selector.xpath('a/@href').extract()

            yield item
Example No. 22
class DBMSpider(CrawlSpider):
    # Spider name, used when running the crawler
    name = "doubanMovie"
    # Allowed domains
    allowed_domains = ["movie.douban.com"]
    # Seed URL for the crawler, i.e. the first page to fetch
    start_urls = ["https://movie.douban.com"]
    # The spider starts crawling from the https://movie.douban.com homepage and, after collecting subject links, parses them with parse_subject.
    rules = (
        # Pages whose URLs match the regex r'/subject/\d+/' are parsed with parse_subject
        Rule(LinkExtractor(allow=(r'/subject/\d+/', )),
             callback='parse_subject',
             follow=True), )

    # Content can be extracted with XPath or CSS selectors; Chrome DevTools makes building them easy
    def parse_subject(self, response):
        item = DoubanmovieItem()
        # todo extract item content
        item['movie_name'] = response.xpath(
            '//*[@id="content"]/h1/span[1]').xpath(
                'normalize-space(string(.))').extract()[0]
        item['intro'] = response.xpath('//*[@id="link-report"]/span').xpath(
            'normalize-space(string(.))').extract()[0]
        item['actors'] = response.xpath(
            '//*[@id="info"]/span[3]/span[2]').xpath(
                'normalize-space(string(.))').extract()
        item['date'] = response.xpath('//*[@id="info"]/span[11]').xpath(
            'normalize-space(string(.))').extract()[0]
        item['director'] = response.xpath(
            '//*[@id="info"]/span[1]/span[2]/a').xpath(
                'normalize-space(string(.))').extract()[0]
        return item
Example No. 23
class MySpider(CrawlSpider):
    name = "Huntsman"
    allowed_domains = ["essex.ac.uk"]
    start_urls = ["https://www.essex.ac.uk/"]
    rules = [
        Rule(LinkExtractor(allow=('/www.essex.ac.uk/((?!:).)*$'), ),
             callback="parse_item",
             follow=True)
    ]

    def parse_item(self, response):
        item = BeastItem()

        url = response.url
        item["url"] = url

        title = response.xpath("//title/text()")[0].extract()
        item["title"] = title

        description = response.xpath(
            "//meta[@name='description']/@content").extract()
        item["description"] = description

        body = response.xpath('//body//text()').re(r'(\w[ ,\'\-\w]+\w)')
        item["body"] = body

        return item
Example No. 24
class StackCrawlerSpider(CrawlSpider):
    name = 'stack_crawler'
    allowed_domains = ['stackoverflow.com']
    # start_urls = ['http://www.stackoverflow.com/']
    start_urls = ['http://stackoverflow.com/questions?pagesize=50&sort=newest']
    # rules = (
    #     Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
    # )
    rules = [
        Rule(LinkExtractor(allow=r'questions\?page=[0-9]&sort=newest'),
             callback='parse_item',
             follow=True)
    ]

    def parse_item(self, response):
        # i = StackItem()
        # #i['domain_id'] = response.xpath('//input[@id="sid"]/@value').extract()
        # #i['name'] = response.xpath('//div[@id="name"]').extract()
        # #i['description'] = response.xpath('//div[@id="description"]').extract()
        # return i
        questions = response.xpath('//div[@class="summary"]/h3')

        for question in questions:
            item = StackItem()
            item['url'] = question.xpath(
                'a[@class="question-hyperlink"]/@href').extract()[0]
            item['title'] = question.xpath(
                'a[@class="question-hyperlink"]/text()').extract()[0]
            yield item
Example No. 25
class BaiduSpider(CrawlSpider):
    name = "baidu"
    allowed_domains = ["baidu.com"]
    start_urls = ('http://shouji.baidu.com/software/?from=as', )
    rules = [
        Rule(LinkExtractor(allow=("http://shouji.baidu.com/soft/item", )),
             callback='parse_app',
             follow=True),
    ]

    def parse_app(self, response):
        apk = AppItem()
        apk['url'] = response.url
        apk['name'] = response.css('.app-name>span').extract()[0]
        apk['rate'] = response.css(".star-percent").xpath(
            "@style").extract()[0]
        apk['size'] = response.css(".detail > span.size").xpath(
            "text()").extract()[0]
        apk['category'] = response.css(".nav").css("a")[1].xpath(
            "text()").extract()[0]
        apk['apk_url'] = response.css(".apk").xpath("@href").extract()[0]
        apk['screenshots'] = response.css(".imagefix").xpath("@src").extract()
        apk['download_num'] = response.css("span.download-num").xpath(
            "text()").extract()[0]
        yield apk
Example No. 26
class LogoSpider(CrawlSpider):
    name = 'logo'
    allowed_domains = ['pcauto.com.cn']
    start_urls = ['http://www.pcauto.com.cn/zt/chebiao/guochan/']

    rules = (
            Rule(LinkExtractor(allow=(r'http://www.pcauto.com.cn/zt/chebiao/.*?/$')), callback='parse_page'),
        )


    def parse_page(self, response):
        # print(response.text)
        sel = Selector(response)
        # print(sel)
        country = "".join(sel.xpath('//div[@class="th"]/span[@class="mark"]/a/text()').extract())
        # carname = sel.xpath('//div[@class="dPic"]/i[@class="iPic"]/a/img/@alt').extract()
        # imageurl = sel.xpath('//div[@class="dPic"]/i[@class="iPic"]/a/img/@src').extract()
        # item=LogoItem(country=country, carname=carname, imageurl=imageurl)
        # yield item

        carnames = sel.xpath('//div[@class="dPic"]/i[@class="iPic"]/a/img/@alt').extract()
        for carname in carnames:
            imageurl= sel.xpath('//div[@class="dPic"]/i[@class="iPic"]/a/img[@alt="'+carname+'"]/@src').extract()
            # print(country, carname, imageurl)
            item = LogoItem(country=country, carname=carname, imageurl=imageurl)
            yield item
Example No. 27
class GlassdoorSpider(scrapy.Spider):
    name = 'glassdoor'
    allowed_domains = [
        'glassdoor.com', 'www.glassdoor.com', 'www.glassdoor.com.au'
    ]
    start_urls = ['https://www.glassdoor.com']

    rules = [
        Rule(LinkExtractor(allow=r'\/Reviews\/.*'),
             callback='parse',
             follow=True)
    ]

    def start_requests(self):
        urls = [
            'https://www.glassdoor.com.au/Reviews/sydney-reviews-SRCH_IL.0,6_IM962.htm',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        for company in response.css('div.eiHdrModule'):
            yield {
                'name':
                company.css("a.tightAll::text").extract_first().strip(),
                'score': company.css("span.bigRating::text").extract_first(),
                'reviews': company.css("span.num::text")[0].extract().strip(),
                'salaries': company.css("span.num::text")[1].extract().strip(),
                'interviews':
                company.css("span.num::text")[2].extract().strip(),
            }

        for href in response.css('li.page a::attr(href)'):
            yield response.follow(href, callback=self.parse)
Example No. 28
class ImageSpider(CrawlSpider):
    name = "picSpider"

    start_urls = ["https://www.reddit.com/r/pics/"]

    allowed_domains = [
        #Domain allowed to scrape
        "www.reddit.com"
    ]

    rules = (
        # Extract links matching 'below Regex'
        # and follow links from them (No callback means follow=True by default).
        Rule(LinkExtractor(allow=(".*\/r\/pics\/\?count=\d*&after=(\w*)", )),
             callback='parse_next',
             follow=True), )

    def parse_next(self, response):
        #Selecting list of elements and parsing one by one
        selector_list = response.xpath("//div[contains(@class,'thing')]")
        for selector in selector_list:
            print(selector)
            item = ItemStack.PicItem()
            item['title'] = selector.xpath("div/p/a/text()").extract()
            #            item['link_url']=selector.xpath('p[contains(@class,\'title\')]/a/@href').extract()
            item['image_urls'] = selector.xpath(
                "a[contains(@class,'thumbnail')]/@href").extract()
            yield item
Example No. 29
class MoocSpider(CrawlSpider):

    name = 'mooc'
    allowed_domains = ['mooc.cn']
    start_urls = []

    for i in range(1, 37):
        start_urls.append('http://www.mooc.cn/course/page/%d' % i)

    rules = [Rule(LinkExtractor(allow=['/.*\.html']), 'parse_mooc')]

    def parse_mooc(self, response):
        mooc = MoocItem()
        moocs = []

        mooc['url'] = response.url
        ch_name = response.xpath("//h1/text()").extract()
        en_name = response.xpath(
            "//div[@class='course_enname']/text()").extract()
        university = response.xpath("//h2[1]/text()").extract()
        time = response.xpath("//div[@class='coursetime']/text()").extract()
        desc = response.xpath(
            "//div[@class='content-entry clearfix']/p[1]/text()").extract()

        mooc['ch_name'] = [m.encode('utf-8') for m in ch_name]
        mooc['en_name'] = [m.encode('utf-8') for m in en_name]
        mooc['university'] = [m.encode('utf-8') for m in university]
        mooc['time'] = [m.encode('utf-8') for m in time]
        mooc['desc'] = [m.encode('utf-8') for m in desc]

        moocs.append(mooc)

        return moocs
Example No. 30
class Jeeran(CrawlSpider):
    name = 'jeeran'
    allowed_domains = ['yellowpages.com.eg']
    start_urls = ['http://www.yellowpages.com.eg/en/category/pharmacies']
    #depth_limit= 0
    rules = (Rule(LinkExtractor(allow=('')), callback='parse_obj',
                  follow=True), )
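The rule above points at a parse_obj callback that is not included in this snippet; a minimal sketch of such a callback, where the CSS selectors and field names are pure assumptions, might look like:

    def parse_obj(self, response):
        # Hypothetical callback: emit one record per listing block.
        # 'div.item' and the field names are assumptions, not taken from
        # the original spider.
        for listing in response.css('div.item'):
            yield {
                'name': listing.css('h3 a::text').extract_first(),
                'url': response.urljoin(
                    listing.css('h3 a::attr(href)').extract_first() or ''),
            }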
Example No. 31
class PitchforkSpider(CrawlSpider):
    name = 'pitchfork'
    allowed_domains = ['pitchfork.com']
    start_urls = [
        'https://pitchfork.com/reviews/albums/',
        'https://pitchfork.com/reviews/albums/?page=2',
        'https://pitchfork.com/reviews/albums/?page=3',
        'https://pitchfork.com/reviews/albums/?page=4',
        'https://pitchfork.com/reviews/albums/?page=5'
    ]

    rules = [Rule(LinkExtractor(allow=''), callback='parse', follow=True)]

    def parse(self, response):
        artists = response.xpath('//ul/li[1]/text()').extract()
        album = response.xpath('//h2/text()').extract()
        urls = response.xpath('//div[@class = "review"]/a/@href').extract()
        url = [BASE_URL + link for link in urls]

        for link in url:
            request = scrapy.Request(link,
                                     callback=self.review_text,
                                     dont_filter=True)
            yield request

    def review_text(self, response):
        text = response.xpath('//p/text()').extract()
        title = response.xpath('//h2/ul/li/a/text()').extract()
        album = response.xpath('//h1/text()').extract()

        yield ReviewItem(artists=title, album=album, text=text)
Example No. 32
    def __init__(self, seed_urls=None, save_html=1, use_splash=1, screenshot_dir='/memex-pinterest/ui/static/images/screenshots', op_time=10, **kwargs):
        '''
        Constructs a spider instance from the command line or the scrapyd daemon.

        :param seed_urls: comma-separated list of URLs; if empty, the crawler follows not-yet-crawled URLs from storage
        :param save_html: boolean 0/1
        :param use_splash: boolean 0/1
        :param screenshot_dir: used only when use_splash=1
        :param op_time: operating time in minutes; a negative value disables this constraint
        :param kwargs:
        :return:
        '''
        super(TopicalFinder, self).__init__(screenshot_dir=screenshot_dir, **kwargs)
        self.screenshot_dir = screenshot_dir
        log.msg("SCREENSHOT DIR IS SET TO: %s" % str(screenshot_dir), _level=log.DEBUG)

        if seed_urls:
            self.start_urls = [add_scheme_if_missing(url) for url in seed_urls.split(',')]
        self.ranker = Ranker.load()
        self.linkextractor = LinkExtractor()
        self.save_html = bool(save_html)
        self.use_splash = bool(use_splash)
        self.operating_time = int(op_time) * 60

        self.start_time = datetime.utcnow()
        self.finishing = False
Example No. 33
class tripadvisorSpider(CrawlSpider):
    name = "ege"
    download_delay = 1
    allowed_domains = ["tripadvisor.com.tr"]
    start_urls = [
        "http://www.tripadvisor.com.tr/Restaurants-g657096-Turkish_Aegean_Coast.html"
    ]
    def my_range(start, end, step):
        while start <= end:
            yield start
            start += step

    #for i in my_range(30, 6900, 30):
        #start_urls.append('http://www.tripadvisor.com.tr/Restaurants-g657096-oa'+str(i)+'-Turkish_Aegean_Coast.html')
    rules = [Rule(LinkExtractor(allow=['/Restaurant_Review.*'],deny = 'http://www.tripadvisor.com.tr/Restaurants-g657096-Turkish_Aegean_Coast.html'), 'parse_page')]
    def parse_page(self, response):

        item = items.EgerestorantscrapItem()
        item['isim'] = response.xpath('//*[@id="BREADCRUMBS"]/li[last()]/text()')[0].extract()
        item['adres'] = response.xpath('//*[@id="ABOVE_THE_FOLD"]/div[1]/div[2]/div/div[2]/div/div[1]/div/address/span/span/span[1]/text()')[0].extract()
        item['sehir'] = response.xpath('//*[@id="BREADCRUMBS"]/li[4]/a/span/text()')[0].extract()
        item['tel'] = response.xpath('//*[@id="ABOVE_THE_FOLD"]/div[1]/div[2]/div/div[2]/div/div[1]/div/div/div/div[1]/div/text()')[0].extract()
        item['ilce'] = response.xpath('//*[@id="BREADCRUMBS"]/li[5]/a/span/text()')[0].extract()

        yield item
Example No. 34
class cqrbSpider(CrawlSpider):
    name = site_name
    allowed_domains = [
        "cqrbepaper.cqnews.net",
    ]
    start_urls = url_list

    rules = (
        Rule(LinkExtractor(allow=('/cqrb/html/\d{4}-\d{2}/\d{2}/content.+$')), \
        callback='parse_data', follow=True,),
    )

    def parse_data(self, response):
        # get the publish time and store the files by year/month
        time = response.url.split('/')[5]
        year = time[0:4]
        month = time[5:7]
        path = data_dir + '/' + year + '/' + month
        if not os.path.exists(path):
            os.makedirs(path)
        # Get the title
        title = response.xpath('//tr/td/strong/text()').extract()[1].strip().encode('utf-8')
        # get the content
        content_list = response.xpath('//*[@id="ozoom"]/founder-content//text()').extract()
        content = "".join(content_list).strip().encode("utf-8")
        # If the title or the content is empty, we got the wrong page,
        # so do not create the file
        if title and content:
            filename = path + '/' +  title + '.txt'
            with open(filename, 'wb') as f:
                f.write(content)
Example No. 35
class ProxySpider(CrawlSpider):
    name = "proxy"
    #allowed_domains = ['xici.net.co', 'youdaili.net']
    start_urls = [
        r"http://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&tn=baidu&wd=ip%20proxy",
        r"http://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&tn=baidu&wd=ip%20proxy&pn=10",
        r"http://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&tn=baidu&wd=ip%20proxy&pn=20",
        r"http://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&tn=baidu&wd=ip%20proxy&pn=30",
        r"http://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&tn=baidu&wd=ip%20proxy&pn=40",
        r"http://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&tn=baidu&wd=ip%20proxy&pn=50",
        r"http://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&tn=baidu&wd=ip%20proxy&pn=60",
        r"http://www.gfsoso.net/?q=ip+proxy&t=1",
        r"http://www.gfsoso.net/?q=ip+proxy&pn=10",
        r"http://www.gfsoso.net/?q=ip+proxy&pn=20",
        r"http://www.gfsoso.net/?q=ip+proxy&pn=30",
        r"http://www.gfsoso.net/?q=ip+proxy&pn=40",
        r"http://www.gfsoso.net/?q=ip+proxy&pn=50",
        r"http://www.gfsoso.net/?q=ip+proxy&pn=60",
    ]

    rules = (Rule(LinkExtractor(allow=(r'', )), callback='parse_item'), )

    def parse_item(self, response):
        soup = BeautifulSoup(response.body)
        str_list = [tag.string or '' for tag in soup.find_all(True)]
        body_str = ' '.join(str_list)
        # items = [ GoproxyItem(ip=group[0], port=group[7], protocol='HTTP') for group in re.findall(REG_IP, body_str) ]
        for group in re.findall(REG_IP, body_str):
            proxy_item, created = ProxyItem.objects.update_or_create(
                ip=group[0])
            proxy_item.port = group[7]
            proxy_item.save()
Example No. 36
class QuotesSpider(scrapy.Spider):

    name = "express"  # name of spider to be called during execution

    start_urls = ['http://indianexpress.com/section/india/']
    rules = (Rule(LinkExtractor(
        allow=(), restrict_css=('.yt-uix-button-content a ::attr(href)')),
                  callback="parse_page",
                  follow=True), )

    def parse(self, response):

        for quote in response.css(
                'div.articles'
        ):  # content taken from each article block (div.articles)
            yield {
                'Date': quote.css('div.date::text').extract_first(
                ),  # date to be taken from section-teaserlist of website
                'Headlines': quote.css('div.title a::text').extract_first(),
                'link': quote.css('div.title a::attr(href)').extract()
            }

        next_page = response.css(
            'div.pagination a::attr(href)').extract_first()
        if next_page:
            yield scrapy.Request(response.urljoin(next_page),
                                 callback=self.parse)
Example No. 37
class BbcSpider(CrawlSpider):
    name = "bbc"
    # The crawler will not go beyond these domains
    allowed_domains = ["bbc.com"]
    #Where to start crawling
    start_urls = ('http://www.bbc.com/', )
    rules = (
        # Change rule as per the requirements otherwise it will take hours
        #Rule(LinkExtractor(allow=r'/news/[A-Za-z0-9]'), callback='parse_item', follow=True),
        Rule(LinkExtractor(allow=r'/news/[a-z]'),
             callback='parse_item',
             follow=True), )

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        item = CrawlerItem()
        #Use xpath or class selector
        #item['text'] = response.xpath('//*[@id="page"]/div[2]/div[2]/div/div[1]/div[1]/div[3]/p[1]//text()').extract()[0]
        item['text'] = hxs.select(
            '//p[@class="story-body__introduction"]/text()').extract()
        item['author'] = "BBC News Media"
        #item['headline'] = response.xpath('//*[@id="page"]/div[2]/div[2]/div/div[1]/div[1]/h1//text()').extract()[0]
        item['headline'] = hxs.select(
            '//h1[@class="story-body__h1"]/text()').extract()
        item['url'] = response.url
        yield item
Example No. 38
    def parse(self, response):
        print 'startPageSpider==========================>',response.url
#         log.msg(format='%(iPid)s, %(url)s, %(project)s ', iPid = self.taskId, url = response.url, project=self.project)
        listQuqueCount = self.redis.llen('scrapy:startPageSpider:listQuque:%s' % self.taskId)
        if listQuqueCount == 1:
            self._crawler.signals.send_catch_log('writeListQuque')
        elif listQuqueCount == 0:
            self._crawler.signals.send_catch_log('emptyListQuque')
            print 'startPageSpider---------send_catch_log->emptyListQuque'
        if response.url not in self.hasCrawlSet:
            pattern = re.compile(r'%s' % self.project['szStartUrlReg'])
            self.hasCrawlSet.add(response.url)
            if pattern.match(response.url) and response.url not in self.hasInsertSet:
                title = "|".join(response.xpath('/html/head/title/text()').extract())
                insertSql = 'INSERT INTO project_start_page(iPid, szUrl, szTitle,dtLastScrapyTime) VALUES(%d, "%s", "%s", "%s")' % (self.taskId, response.url,  title,  time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
                self.dbUtils.insert(insertSql)
                self.hasInsertSet.add(response.url)
                self.redis.lpush('scrapy:startPageSpider:listQuque:%s' % self.taskId, response.url)
                #self.redis.sadd('scrapy:startPageSpider:startPage:2', response.url)
                log.msg(format='spider=startPageSpider iPid=%(i)s, title=%(t)s url=%(u)s', i = self.taskId, t=title, u=response.url)

            _allow = ( _allow for _allow in self.project['szStartUrlReg'].split('~'))
            self.linkExtractor = LinkExtractor(allow_domains=self.domain, allow=_allow)
            links = [ link for link in self.linkExtractor.extract_links(response) if link.url not in self.hasCrawlSet ]
            for link in links:
                yield self.make_requests_from_url(link.url)
Example No. 39
class solidotSpider(CrawlSpider):
    name = site_name
    allowed_domains = [
        "solidot.org",
    ]
    start_urls = url_list

    rules = (
        Rule(LinkExtractor(allow=('story\?sid=\d+$')), \
        callback='parse_data', follow=True,),
    )

    def parse_data(self, response):
        # get the publish time and store the files by year/month
        date_string = response.xpath('//div[@class="talk_time"]/text()')\
        .extract()[2].split(' ')[2]
        year = date_string[0:4]
        month = date_string[5:7]
        path = data_dir + '/' + year + '/' + month
        if not os.path.exists(path):
            os.makedirs(path)
        # Get the title
        title = response.xpath(
            '//div[@class="bg_htit"]/h2/text()').extract()[0]
        # get the content
        content_list = response.xpath(
            '//div[@class="p_mainnew"]/text()').extract()
        content = "".join(content_list).strip().encode("utf-8")
        # If the title or the content is empty, we got the wrong page,
        # so do not create the file
        if title and content:
            filename = path + '/' + title + '.txt'
            with open(filename, 'wb') as f:
                f.write(content)
Example No. 40
class CountrySpider(CrawlSpider):
    name = 'country'
    allowed_domains = ['example.webscraping.com']
    start_urls = ['http://example.webscraping.com/']

    rules = (Rule(LinkExtractor(allow='/index/', deny='/user/'), follow=True),
             Rule(LinkExtractor(allow='/view/', deny='/user/'),
                  callback='parse_item'))

    def parse_item(self, response):
        item = ExampleItem()
        item['name'] = response.css(
            'tr#places_country__row td.w2p_fw::text').extract()
        item['population'] = response.css(
            'tr#places_population__row td.w2p_fw::text').extract()
        return item
Example No. 41
class FollowAllSpider(Spider):

    name = 'followall'

    def __init__(self, **kw):
        super(FollowAllSpider, self).__init__(**kw)
        url = kw.get('url') or kw.get('domain') or 'http://scrapinghub.com/'
        if not url.startswith('http://') and not url.startswith('https://'):
            url = 'http://%s/' % url
        self.url = url
        self.allowed_domains = [re.sub(r'^www\.', '', urlparse(url).hostname)]
        self.link_extractor = LinkExtractor()
        self.cookies_seen = set()

    def start_requests(self):
        return [Request(self.url, callback=self.parse, dont_filter=True)]

    def parse(self, response):
        """Parse a PageItem and all requests to follow

        @url http://www.scrapinghub.com/
        @returns items 1 1
        @returns requests 1
        @scrapes url title foo
        """
        page = self._get_item(response)
        r = [page]
        r.extend(self._extract_requests(response))
        return r

    def _get_item(self, response):
        item = Page(url=response.url, size=str(len(response.body)),
            referer=response.request.headers.get('Referer'))
        self._set_title(item, response)
        self._set_new_cookies(item, response)
        return item

    def _extract_requests(self, response):
        r = []
        if isinstance(response, HtmlResponse):
            links = self.link_extractor.extract_links(response)
            r.extend(Request(x.url, callback=self.parse) for x in links)
        return r

    def _set_title(self, page, response):
        if isinstance(response, HtmlResponse):
            title = Selector(response).xpath("//title/text()").extract()
            if title:
                page['title'] = title[0]

    def _set_new_cookies(self, page, response):
        cookies = []
        for cookie in [x.split(';', 1)[0] for x in response.headers.getlist('Set-Cookie')]:
            if cookie not in self.cookies_seen:
                self.cookies_seen.add(cookie)
                cookies.append(cookie)
        if cookies:
            page['newcookies'] = cookies
Example No. 42
    def extract_links(self, response):
        # The parent can do most of it for us
        links = LinkExtractor.extract_links(self, response)
        try:
            good_links = [link for link in links if link.text.isdigit()]
        except TypeError:
            return None

        return good_links
Example No. 43
 def __init__(self, **kw):
     super(FollowAllSpider, self).__init__(**kw)
     url = kw.get('url') or kw.get('domain') or 'http://scrapinghub.com/'
     if not url.startswith('http://') and not url.startswith('https://'):
         url = 'http://%s/' % url
     self.url = url
     self.allowed_domains = [re.sub(r'^www\.', '', urlparse(url).hostname)]
     self.link_extractor = LinkExtractor()
     self.cookies_seen = set()
Ejemplo n.º 44
0
	def parse(self, response):
		# set the allowed domains in link
		ln_extractor = LinkExtractor(allow_domains=("news.sina.cn"),
			 allow = (".*vt=1.*"))
		# get the links from the response
		links = ln_extractor.extract_links(response)
		urls = []
		items = []
		for i in links:
			urls.append(i.url)
			# every URL that has not been visited yet is added to the container and the queue.
			if i.url not in self.g_container_urls:
				self.g_queue_urls.put(i.url)
				self.g_container_urls.add(i.url)
		# issue a request for every URL in the queue
		for j in range(self.g_queue_urls.qsize()):
			tp_url = self.g_queue_urls.get()
			items.append(self.make_requests_from_url(tp_url).
				replace(callback=self.parse_page))
			items.append(self.make_requests_from_url(tp_url))
		return items
Ejemplo n.º 45
0
class startPageSpider(Spider):

    name = 'startPageSpider'

    def __init__(self, taskId, *a, **kw):
        """Constructor"""
        self.name +='_'+str(taskId)
        super(startPageSpider, self).__init__(*a, **kw)
        pool = redis.ConnectionPool(host='localhost', port=6379, db=0)
        self.redis = redis.Redis(connection_pool=pool)
        self.dbUtils = db.dbUtils()
        self.taskId = int(taskId)

        self.project = None
        self.domain = None
        self.hasCrawlSet = set()
        self.hasInsertSet = set()
        

        project = self.dbUtils.queryRow('SELECT * FROM project_setting WHERE iStatus=1 AND iPid=%d' % self.taskId)
        if  project :
            self.project = project
            self.start_urls = str(project['szStartUrl']).split('~')
            self.domain = ".".join(urlparse(project['szDomain']).hostname.split(".")[-2:])

    def parse(self, response):
        print 'startPageSpider==========================>',response.url
#         log.msg(format='%(iPid)s, %(url)s, %(project)s ', iPid = self.taskId, url = response.url, project=self.project)
        listQuqueCount = self.redis.llen('scrapy:startPageSpider:listQuque:%s' % self.taskId)
        if listQuqueCount == 1:
            self._crawler.signals.send_catch_log('writeListQuque')
        elif listQuqueCount == 0:
            self._crawler.signals.send_catch_log('emptyListQuque')
            print 'startPageSpider---------send_catch_log->emptyListQuque'
        if response.url not in self.hasCrawlSet:
            pattern = re.compile(r'%s' % self.project['szStartUrlReg'])
            self.hasCrawlSet.add(response.url)
            if pattern.match(response.url) and response.url not in self.hasInsertSet:
                title = "|".join(response.xpath('/html/head/title/text()').extract())
                insertSql = 'INSERT INTO project_start_page(iPid, szUrl, szTitle,dtLastScrapyTime) VALUES(%d, "%s", "%s", "%s")' % (self.taskId, response.url,  title,  time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
                self.dbUtils.insert(insertSql)
                self.hasInsertSet.add(response.url)
                self.redis.lpush('scrapy:startPageSpider:listQuque:%s' % self.taskId, response.url)
                #self.redis.sadd('scrapy:startPageSpider:startPage:2', response.url)
                log.msg(format='spider=startPageSpider iPid=%(i)s, title=%(t)s url=%(u)s', i = self.taskId, t=title, u=response.url)

            _allow = ( _allow for _allow in self.project['szStartUrlReg'].split('~'))
            self.linkExtractor = LinkExtractor(allow_domains=self.domain, allow=_allow)
            links = [ link for link in self.linkExtractor.extract_links(response) if link.url not in self.hasCrawlSet ]
            for link in links:
                yield self.make_requests_from_url(link.url)
Ejemplo n.º 46
0
 def parse(self, response):
     #self.redis.sadd('scrapy:startPageSpider:startPage:3', response.url)
     if response.url not in self.hasCrawlSet:
         #self.redis.sadd('scrapy:startPageSpider:startPage:4', response.url)
         self.hasCrawlSet.add(response.url)
         _allow = ( _allow for _allow in self.project['szUrlReg'].split('~'))
         self.linkExtractor = LinkExtractor(allow_domains=self.domain, allow=_allow)
         links = [ link for link in self.linkExtractor.extract_links(response) if link.url not in self.hasInsertSet ]
         #self.redis.hset('scrapy:startPageSpider:listPage:count', response.url, len(links))
         for link in links:
             if link.url in self.hasInsertSet : continue
             insertSql = 'INSERT INTO project_list_page(iPid, szUrl, szTitle, szSourceUrl,dtLastScrapyTime) VALUES(%d, "%s", "%s", "%s", "%s")' % (self.taskId,link.url, link.text, response.url, time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
             self.dbUtils.insert(insertSql)
             self.hasInsertSet.add(link.url)
             log.msg(format='spider=listPageSpider iPid=%(i)s, title=%(t)s url=%(u)s', i = self.taskId, t=link.text, u=link.url)
Ejemplo n.º 47
0
 def parse(self, response):
     xlink = LinkExtractor()
     itemre = re.compile(self.itemurl_re)
     for link in xlink.extract_links(response):
         if itemre.search(link.url):
             yield Request(url=link.url, callback=self.parse_item)
 def __init__(self):
     LinkExtractor.__init__(self, restrict_xpaths='//div[@id="paging-bottom"]')
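The parse method and the __init__ fragment above suggest a custom LinkExtractor restricted to the bottom paging block; one way the two fragments could fit together (the class name is an assumption, and the numeric-text filter mirrors the extract_links override shown in Example No. 42) is:

class PagingLinkExtractor(LinkExtractor):
    # Hypothetical subclass: restrict extraction to the bottom paging block
    # and keep only links whose anchor text is a page number.
    def __init__(self):
        LinkExtractor.__init__(self,
                               restrict_xpaths='//div[@id="paging-bottom"]')

    def extract_links(self, response):
        links = LinkExtractor.extract_links(self, response)
        return [link for link in links if link.text and link.text.isdigit()]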
Example No. 49
class listPageSpider(Spider):

    name = 'listPageSpider'

    def __init__(self, taskId, *a, **kw):
        """Constructor"""
        self.name +='_'+str(taskId)
        super(listPageSpider, self).__init__(*a, **kw)
        pool = redis.ConnectionPool(host='localhost', port=6379, db=0)
        self.redis = redis.Redis(connection_pool=pool)
        self.dbUtils = db.dbUtils()
        self.taskId = int(taskId)
        self.domain = None
        self.project = None
        self.hasCrawlSet = set()
        self.hasInsertSet = set()
        self.isExit = 0
        project = self.dbUtils.queryRow('SELECT * FROM project_setting WHERE iStatus=1 AND iPid=%d' % self.taskId)
        if project :
            self.project = project
            self.domain = ".".join(urlparse(project['szDomain']).hostname.split(".")[-2:])
#             self.start_urls = ['http://www.ty2016.com/cn/2.html', 'http://www.ty2016.com/cn/3.html', 'http://www.ty2016.com/cn/4.html']


    def stopSpider(self):
        self.isExit = 1

    def getStartUrl(self):
        url = self.redis.rpop('scrapy:startPageSpider:listQuque')
        if not url:
            return self.getStartUrl()
        return url


    def start_requests(self):
#         url = self.getStartUrl()
#         print '=====================>',url
#         yield self.make_requests_from_url(url)
        while True :
            #if self._crawler.engine is not None:
                #if self._crawler.engine.paused: break
                #if not self._crawler.engine.running: break
            url = self.redis.rpop('scrapy:startPageSpider:listQuque:%s' % self.taskId)
            #print 'listPageSpider==========================>',url
            if url:
                #self.redis.sadd('scrapy:startPageSpider:startPage:1', url)
                yield self.make_requests_from_url(url)
            #else:
                #self._crawler.signals.send_catch_log('emptyListQuque')
                #print 'listPageSpider---------send_catch_log->emptyListQuque'


    def parse(self, response):
        #self.redis.sadd('scrapy:startPageSpider:startPage:3', response.url)
        if response.url not in self.hasCrawlSet:
            #self.redis.sadd('scrapy:startPageSpider:startPage:4', response.url)
            self.hasCrawlSet.add(response.url)
            _allow = ( _allow for _allow in self.project['szUrlReg'].split('~'))
            self.linkExtractor = LinkExtractor(allow_domains=self.domain, allow=_allow)
            links = [ link for link in self.linkExtractor.extract_links(response) if link.url not in self.hasInsertSet ]
            #self.redis.hset('scrapy:startPageSpider:listPage:count', response.url, len(links))
            for link in links:
                if link.url in self.hasInsertSet : continue
                insertSql = 'INSERT INTO project_list_page(iPid, szUrl, szTitle, szSourceUrl,dtLastScrapyTime) VALUES(%d, "%s", "%s", "%s", "%s")' % (self.taskId,link.url, link.text, response.url, time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
                self.dbUtils.insert(insertSql)
                self.hasInsertSet.add(link.url)
                log.msg(format='spider=listPageSpider iPid=%(i)s, title=%(t)s url=%(u)s', i = self.taskId, t=link.text, u=link.url)
Example No. 50
 def parse_urls(self, response):
     extractor = LinkExtractor(restrict_xpaths=('//table[contains(@class,ibm-data-table)]/tbody',))
     links = extractor.extract_links(response)
     for link in links:
         url = link.url
         yield Request(url, callback=self.parse_items)
Example No. 51
class TopicalFinder(SplashSpiderBase):
    name = 'topical_finder'

    save_html = None
    use_splash = None

    def __init__(self, seed_urls=None, save_html=1, use_splash=1, screenshot_dir='/memex-pinterest/ui/static/images/screenshots', op_time=10, **kwargs):
        '''
        Constructs a spider instance from the command line or the scrapyd daemon.

        :param seed_urls: comma-separated list of URLs; if empty, the crawler follows not-yet-crawled URLs from storage
        :param save_html: boolean 0/1
        :param use_splash: boolean 0/1
        :param screenshot_dir: used only when use_splash=1
        :param op_time: operating time in minutes; a negative value disables this constraint
        :param kwargs:
        :return:
        '''
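        # Example invocation (sketch; the argument values are only an illustration):
        #   scrapy crawl topical_finder -a seed_urls=http://example.com -a use_splash=0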
        super(TopicalFinder, self).__init__(screenshot_dir=screenshot_dir, **kwargs)
        self.screenshot_dir = screenshot_dir
        log.msg("SCREENSHOT DIR IS SET TO: %s" % str(screenshot_dir), _level=log.DEBUG)

        if seed_urls:
            self.start_urls = [add_scheme_if_missing(url) for url in seed_urls.split(',')]
        self.ranker = Ranker.load()
        self.linkextractor = LinkExtractor()
        self.save_html = bool(save_html)
        self.use_splash = bool(use_splash)
        self.operating_time = int(op_time) * 60

        self.start_time = datetime.utcnow()
        self.finishing = False

    def start_requests(self):
        for url in self.start_urls:
            yield self.make_requests_from_url(url, is_seed=True)

    def make_requests_from_url(self, url, is_seed=False):
        if self.use_splash:
            r = self._splash_request(url)
        else:
            r = super(TopicalFinder, self).make_requests_from_url(url)
        r.meta['score'] = 0.0
        r.meta['is_seed'] = False

        if is_seed:
            r.meta['is_seed'] = True
            r.meta['score'] = 1.0  # setting maximum score value for seeds

        log.msg("Making request to %s with meta: %s" % (r.url, str(r.meta)), _level=log.DEBUG)

        return r

    def set_crawler(self, crawler):
        super(TopicalFinder, self).set_crawler(crawler)
        self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)

    def spider_idle(self):
        log.msg("Spider idle signal caught.", _level=log.DEBUG)
        raise DontCloseSpider

    def parse(self, response):
        ld = self._load_webpage_item(response, is_seed=response.meta['is_seed'])
        if self.use_splash:
            self._process_splash_response(response, ld)
        yield ld.load_item()

        if self.finishing:
            return

        now = datetime.utcnow()
        if self.operating_time > 0 and (now - self.start_time).total_seconds() > self.operating_time:
            log.msg("Reached operating time constraint. Waiting for Scrapy queue to exhaust.")
            self.finishing = True
            self.crawler.stop()
            return

        if not isinstance(response, TextResponse):
            return

        body = response.body_as_unicode().strip().encode('utf8') or '<html/>'
        score = self.ranker.score_html(body)
        log.msg("TC: %s has score=%f" % (response.url, score), _level=log.DEBUG)

        if score > 0.5:

            #!for some reason this is returning the raw splash response JSON
            #!and not the rendered HTML from splash
            #log.msg(u"\n\n\n****---Response body:\n %s----***\n\n\n" % response.body_as_unicode(), _level=log.DEBUG)

            #for link in self.linkextractor.extract_links(response):
            #can something like the line below fix it? Seems like a hack...
            for link in self.linkextractor.extract_links(response):

                log.msg("****---LINK EXTRACED: %s----***" % str(link.url), _level=log.DEBUG)

                if self.use_splash:
                    r = self._splash_request(url=link.url)
                else:
                    r = Request(url=link.url)

                external = is_external_url(response.url, link.url)
                depth = response.meta.get('link_depth', 0)
                r.meta.update({
                    'link': {
                        'url': link.url,
                        'text': link.text,
                        'fragment': link.fragment,
                        'nofollow': link.nofollow},
                    'link_depth': 0 if external else depth + 1,
                    'referrer_depth': depth,
                    'referrer_url': response.url,
                })

                url_parts = urlparse_cached(r)
                path_parts = url_parts.path.split('/')
                r.meta['score'] = 1.0 / len(path_parts)
                r.meta['is_seed'] = False
                yield r

    def _load_webpage_item(self, response, is_seed):
        depth = response.meta.get('link_depth', 0)
        ld = WebpageItemLoader(response=response)
        ld.add_value('url', response.url)
        ld.add_value('host', get_domain(response.url))
        ld.add_xpath('title', '//title/text()')
        ld.add_value('depth', depth)
        ld.add_value('total_depth', response.meta.get('depth'))
        ld.add_value('crawled_at', datetime.utcnow())
        ld.add_value('is_seed', is_seed)
        ld.add_value('crawler_score', response.meta['score'])

        if self.save_html:
            ld.add_value('html', response.body_as_unicode())

        if 'link' in response.meta:
            link = response.meta['link']
            ld.add_value('link_text', link['text'])
            ld.add_value('link_url', link['url'])
            ld.add_value('referrer_url', response.meta['referrer_url'])
            ld.add_value('referrer_depth', response.meta['referrer_depth'])
        return ld
Example No. 52
 def parse_categories(self, response):
     l = LinkExtractor(restrict_xpaths='.//div[@class="categoryListContainer"]')
     links = l.extract_links(response)
     for link in links:
         yield Request(url=link.url, callback=self.parse_items_links)
Example No. 53
	def parse_level3_contents(self, response):
		baseurl = response.xpath('//base/@href').extract()[0]
		le = LinkExtractor()
		for link in le.extract_links(response):
			if self.allowed_domains[0] in link.url:
				yield Request(link.url, callback=self.final_contents)
Example No. 54
 def parse_urls(self, response):
     extractor = LinkExtractor(restrict_xpaths=('//div[contains(@class, "news_type2")]/h2',))
     links = extractor.extract_links(response)
     for link in links:
         url = link.url
         yield Request(url, callback=self.parse_items)