Example #1
    def parse_sesja(self, response):
        # uchwaly (resolutions)
        uchwaly_le = LinkExtractor(allow=FindReportsSpider.UCHWALA_RE, restrict_xpaths="//table")
        links = uchwaly_le.extract_links(response)
        self.print_links("uchwaly", links)
        cnt = 0
        for link in links:
            yield scrapy.Request(link.url, callback=self.parse_uchwala)
            k = items.PageItem()
            k["text"] = link.text.encode("utf8")
            k["url"] = link.url
            k["ref"] = response.url
            k["order"] = cnt
            yield k
            if cnt >= DEBUG_CNT and DEBUG:
                break
            cnt += 1

        # files (glosowania = votes, obecnosc = attendance)
        le = LinkExtractor(allow=FindReportsSpider.PLIK_RE)
        links = le.extract_links(response)
        self.print_links("glosowania", links)
        cnt = 0
        for link in links:
            fi = items.FiledownloadItem()
            fi["file_urls"] = [link.url]
            fi["text"] = link.text.encode("utf8")
            fi["url"] = link.url
            fi["ref"] = response.url
            fi["order"] = cnt
            yield fi
            if cnt >= DEBUG_CNT and DEBUG:
                break
            cnt += 1
Example #2
 def parse(self, response):
     # Extract the link to each book on the listing page
     le = LinkExtractor(restrict_css='article.product_pod h3')
     for link in le.extract_links(response):
         yield scrapy.Request(link.url, callback=self.parse_book)

     # Extract the link to the next page
     le = LinkExtractor(restrict_css='ul.pager li.next')
     links = le.extract_links(response)
     if links:
         next_url = links[0].url
         yield scrapy.Request(next_url, callback=self.parse)
Example #3
class MySpider(scrapy.Spider):
    # Your spider definition
    name="fetch_data"

    def __init__(self, *args, **kwargs):
        super(MySpider, self).__init__(*args, **kwargs)
        self.start_urls = [kwargs.get('start_url')]
        self.link_extractor = LinkExtractor()
        urls = self.start_urls

    def parse(self, response):
        item = WebpageScraperItem()

        item['key'] = self.start_urls
        item['title'] = response.xpath('//title/text()').extract()
        item['paragraphs'] = response.xpath('//p/text()').extract()
        item['headings'] = response.xpath('//h1/text()').extract()

        links = self.link_extractor.extract_links(response)
        item['links'] = [x.url for x in links]

        img_urls = []
        img_url = response.xpath('//img/@src').extract()
        for img in img_url:
            parse_url = urlparse.urlparse(img)
            parsed_url = parse_url._replace(scheme="http")
            img_urls.append(parsed_url.geturl())

        item['image_urls'] = img_urls
        return item
Example #4
    def parse_state(self, response):
        """ Yields a scrapy.Request object for each city with a store in the state """
        state_url = 'stores.joann.com/{}*'.format(response.meta['state'])
        extractor = LinkExtractor(allow=state_url)

        for link in extractor.extract_links(response):
            yield scrapy.Request(link.url, callback=self.parse_city, headers=HEADERS)
Example #5
class BCSpider(Spider):
    name = 'bc'

    def __init__(self, *args, **kwargs):
        super(BCSpider, self).__init__(*args, **kwargs)
        self.le = LinkExtractor()

    def parse(self, response):
        if not isinstance(response, HtmlResponse):
            return

        for link in self.le.extract_links(response):
            r = Request(url=link.url)
            r.meta.update(link_text=link.text)
            yield r

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(BCSpider, cls).from_crawler(crawler, *args, **kwargs)
        spider._set_crawler(crawler)
        spider.crawler.signals.connect(spider.spider_idle, signal=signals.spider_idle)
        return spider

    def spider_idle(self):
        self.log("Spider idle signal caught.")
        raise DontCloseSpider
Example #6
 def parse(self, response):
     le = LinkExtractor()
     for link in le.extract_links(response):
         yield scrapy.Request(link.url, self.parse_link, meta={
             'splash': {
                 'args': {'har': 1, 'html': 0},
             }
         })
 def parse_link(self, response):
     # log
     self.logger.info('Hi, this is an item page! %s', response.url)
     # parse link
     linkExtractor = LinkExtractor(allow=r".+\.shtml", restrict_css='div.list > ul', unique=True)
     links = linkExtractor.extract_links(response)
     for link in links:
         yield scrapy.Request(link.url, callback=self.parse_content)
Example #8
 def parse(self,response):
     extractor = LinkExtractor(allow="/article/*")
     links = extractor.extract_links(response)
     for link in links:
         item = XiubaiItem()
         req = Request(link.url, self.parse_detail_page)
         req.meta['item'] = item
         yield req
Example #9
 def parse(self, response):
      # ❶ Print the URL of the fetched page
      print(response.url)
      # ❷ Create a LinkExtractor object for extracting the links in the page
      le = LinkExtractor()
      # ❸ Extract the links in the page
      for link in le.extract_links(response):
          # ❹ Build and yield a Request object from each extracted link
         yield response.follow(link.url, self.parse)
Example #10
    def parse(self, response):
        le_area = LinkExtractor(
            allow=r'/[a-zA-Z0-9]+/$',
            restrict_xpaths="//div[@data-role='ershoufang']/div[1]")
        links = le_area.extract_links(response)

        # Crawl each area/district page
        for link in links:
            yield scrapy.Request(link.url, callback=self.parse_node)
Example #11
 def parse(self, response):
     keyword = self._parse_keyword(response.url)
     le = LinkExtractor(allow=r'http://detail.zol.com.cn/cell_phone/.*')
     for link in le.extract_links(response):
         item = BaiduSearchItem()
         item['keyword'] = keyword
         item['link'] = link.url
         item['name'] = link.text
         yield item
Example #12
    def parse_state(self, response):
        """ Yields a scrapy.Request object for each city with a store in the state """
        state_url = 'stores.joann.com/{}*'.format(response.meta['state'])
        extractor = LinkExtractor(allow=state_url)

        for link in extractor.extract_links(response):
            yield scrapy.Request(link.url,
                                 callback=self.parse_city,
                                 headers=HEADERS)
Example #13
File: jd.py Project: YasinL/tb_shop
 def parse(self, response):
     # Parse the list-page links
     pattern = r"https://list\.jd\.com/list\.html\?cat=.*"
     le = LinkExtractor(allow=pattern)
     links = le.extract_links(response)
     print("Found %s list pages" % len(links))
     for i in links:
         print("-------------------->%s" % i.url)
         yield scrapy.Request(i.url, callback=self.next_page)
 def parse(self, response):
     link = LinkExtractor(allow=r'http://www\.taitung\.gov\.tw/opendata/OD_OpenData_DealData.aspx\?s=\w+')
     links = link.extract_links(response)
     for lin in links:
         i = {}
         i["link"] = lin.url
         i["title"] = lin.text
         response = requests.get(i["link"])
         self.getDataByRequest(response,i)
Example #15
 def parse_region(self, response):
     print('parse_region response.url:' + response.url)
     self.logger.debug('parse_region response.url:' + response.url)
     yield Request(response.url, callback=self.parse_list)
     le = LinkExtractor(restrict_css='div.item-list.area-bd > div.filter-sub')
     print('2' * 40)
     for link in le.extract_links(response):
         print(link, link.url, link.text)
         yield Request(link.url, callback=self.parse_price)
Example #16
 def parse(self, response):
     le = LinkExtractor(restrict_css="div.toctree-wrapper.compound",
                        deny='/index.html$')
     # print(len(le.extract_links(response)))
     for link in le.extract_links(response):
         yield scrapy.Request(
             link.url,
             callback=self.parse_detail,
         )
Example #17
 def parse(self, response):
     print('parse response.url:' + response.url)
     self.logger.debug('parse response.url:' + response.url)
     yield Request(response.url, callback=self.parse_list)
     le = LinkExtractor(restrict_css='.search-area-detail')
     print('1' * 20)
     for link in le.extract_links(response):
         print(link, link.url, link.text)
         yield Request(link.url, callback=self.parse_region)
 def parse(self, response):
     e = LinkExtractor()
     urls = [link.url for link in e.extract_links(response)]
     for url in urls:
         parsed = urlparse.urlsplit(url)
         qs = urlparse.parse_qs(parsed.query)
         if qs and 'Url' in qs:
             event_url = qs['Url'][0]
             yield self.add_url(event_url)
Example #19
 def parse(self, response):
     #le = LinkExtractor(restrict_css='div.toctree-wrapper.compound li.toctree-l2')
     le = LinkExtractor(
         restrict_css='div.toctree-wrapper.compound li.toctree-l1',
         deny='/index.html$')
     #pdb.set_trace()
     for link in le.extract_links(response):
         #print link.url
         yield scrapy.Request(link.url, callback=self.parse_url)
Example #20
    def parse(self, response):
        story_link_regex = r'http://prntly\.com/[0-9]{,4}/[0-9]{,2}/[0-9]{,2}/[a-z\-]+/'
        page_link_regex = r'http://prntly\.com/page/[0-9]+/'

        story_link_extractor = LinkExtractor(canonicalize=True,
                                             unique=True,
                                             allow=story_link_regex)
        story_links = story_link_extractor.extract_links(response)
        with open('prntly.com.txt', 'a') as f:
            for link in story_links:
                f.write(link.url + '\n')

        page_link_extractor = LinkExtractor(canonicalize=True,
                                            unique=True,
                                            allow=page_link_regex)
        page_links = page_link_extractor.extract_links(response)
        for link in page_links:
            yield scrapy.Request(url=link.url, callback=self.parse)
Example #21
 def parse_region(self, response):
     print('parse_region response.url:' + response.url)
     self.logger.debug('parse_region response.url:' + response.url)
     yield Request(response.url, callback=self.parse_list)
     le = LinkExtractor(restrict_css='#region-nav-sub')
     print('2' * 100)
     for link in le.extract_links(response):
         print(link, link.url, link.text)
         yield Request(link.url, callback=self.parse_classfy)
Example #22
 def parse(self, response):
     le = LinkExtractor(restrict_xpaths='/html/body/div[2]/div[4]/ul/li/a')
     le_list = le.extract_links(response)
     for link in le_list:
         print(link)
         yield scrapy.Request(link.url,
                              callback=self.parsemore,
                              dont_filter=True)
Example #23
 def parse_hall(self, response):
     print('parse_hall response.url:' + response.url)
     self.logger.debug('parse_hall response.url:' + response.url)
     yield Request(response.url, callback=self.parse_list)
     le = LinkExtractor(restrict_css='div.filter-mod > div:nth-child(3) > div')
     print('4' * 160)
     for link in le.extract_links(response):
         print(link, link.url, link.text)
         yield Request(link.url, callback=self.parse_list)
Example #24
    def parse(self, response):
        if response.status != 200 or not response.body:
            return

        ads_links = response.xpath("//a[img]")
        for ads_link in ads_links:
            link_href = ads_link.xpath("@href").extract_first()
            if self._from_same_site(response.url, link_href):
                continue

            ads_profile = AdsProfileItem()
            ads_profile["ads_host"] = response.url
            ads_profile["ads_present_mode"] = "normal_1"
            ads_profile["ads_target_url"] = link_href
            img_src = response.urljoin(ads_link.xpath("img/@src").extract_first())
            ads_profile["ads_content_url"] = img_src
            ads_profile["ads_content_frame"] = ""
            ads_profile["ads_host_domain"] = urlparse(response.url).netloc
            ads_profile["ads_target_domain"] = urlparse(link_href).netloc
            yield ads_profile

        if isinstance(response, SplashJsonResponse):
            if "childFrames" in response.data:
                frames = self._get_all_child_frames(response)
                print "Get %s childFrames in %s" % (len(frames), response.url)
                for frame_response in frames:
                    if not self._is_valid_frame(frame_response.url):
                        continue
                    ads_links = frame_response.xpath("//a[img]")
                    for ads_link in ads_links:
                        link_href = ads_link.xpath("@href").extract_first()
                        if self._from_same_site(response.url, link_href):
                            continue

                        ads_profile = AdsProfileItem()
                        ads_profile["ads_host"] = response.url
                        ads_profile["ads_present_mode"] = "normal_1"
                        ads_profile["ads_target_url"] = link_href
                        img_src = frame_response.urljoin(ads_link.xpath("img/@src").extract_first())
                        ads_profile["ads_content_url"] = img_src
                        ads_profile["ads_content_frame"] = frame_response.url
                        ads_profile["ads_host_domain"] = urlparse(response.url).netloc
                        ads_profile["ads_target_domain"] = urlparse(link_href).netloc
                        yield ads_profile

        link_extractor = LinkExtractor()
        all_links = link_extractor.extract_links(response)
        for link in all_links:
            request = SplashRequest(
                response.urljoin(link.url),
                self.parse,
                endpoint="render.json",
                slot_policy=SlotPolicy.PER_DOMAIN,
                args={"html": 1, "iframes": 1},
            )
            request.headers.setdefault("User-Agent", self.ua_generater.get_user_agent())
            yield request
Example #25
 def parse(self, response):
     for le in response.css('.content'):
         url = le.xpath('./h2/a/@href').extract_first()
         yield scrapy.Request(url, callback=self.pares_detail)
     le = LinkExtractor(restrict_css='.current+a')
     links = le.extract_links(response)
     if links:
         next_url = links[0].url
         yield scrapy.Request(next_url, callback=self.parse)
Example #26
 def parse_classfy(self, response):
     print('parse_classfy response.url:' + response.url)
     self.logger.debug('parse_classfy response.url:' + response.url)
     yield Request(response.url, callback=self.parse_list)
     le = LinkExtractor(restrict_css='#classfy')
     print('3' * 150)
     for link in le.extract_links(response):
         print(link, link.url, link.text)
         yield Request(link.url, callback=self.parse_list)
Example #27
 def parse_nvyou_info(self, response):
     le = LinkExtractor(restrict_xpaths='//*[@id="waterfall"]')
     links = le.extract_links(response)
     for link in links:
         yield scrapy.Request(url=link.url, callback=self.parse_info)
     nextpage = response.xpath('//*[@id="next"]/@href').extract_first()
     if nextpage:
         nextpage = self.web + nextpage
         yield scrapy.Request(url=nextpage, callback=self.parse_nvyou_info)
Example #28
    def parse(self, response):
        '''
        Parse the link to each book's detail page, plus the link to the next page.
        :param response:
        :return:
        '''
        # Link to each book's detail page
        le = LinkExtractor(restrict_css='article.product_pod h3')
        links = le.extract_links(response)
        if links:
            for link in links:
                yield scrapy.Request(url=link.url, callback=self.parse_book)

        # Link to the next page (there is only one)
        le = LinkExtractor(restrict_xpaths='//li[@class="next"]/a')
        links = le.extract_links(response)
        if links:
            next_url = links[0].url
            yield scrapy.Request(url=next_url, callback=self.parse)
Example #29
    def parse_list(self, response):
        print('parse_list response.url:' + response.url)
        self.logger.debug('parse_list response.url:' + response.url)
        item = SportsItem()

        li = response.css('#shop-all-list>ul>li')
        print('parse_list li:{}  response.url: {}'.format(
            li.css('.txt>.tit h4::text').extract(), response.url))
        self.logger.debug('parse_list li:{}  response.url: {}'.format(
            li.css('.txt>.tit h4::text').extract(), response.url))
        for i in li:
            item['title'] = i.css('.txt>.tit h4::text').extract_first().strip()
            item['url'] = i.css('.txt>.tit>a::attr(href)').extract_first()
            if i.css('.shop-branch::text'):
                item['branch'] = i.css(
                    '.shop-branch::attr(href)').extract_first()
            item['img'] = i.css('img::attr(data-src)').extract_first()
            item['star'] = float(
                i.css('.sml-rank-stars::attr(class)').re_first(
                    r'[1-9]\d*|0')) / 10
            if i.css('.review-num b::text'):
                print('review-num : {}'.format(
                    i.css('.review-num>b::text').extract_first()))
                item['review_num'] = int(
                    i.css('.review-num>b::text').extract_first())
            if i.css('.mean-price b::text'):
                item['mean_price'] = int(
                    i.css('.mean-price b::text').extract_first().strip('¥'))
            print('1111111 score environment service: {}'.format(
                i.css('.comment-list b::text').extract()))
            if i.css('.comment-list b::text').extract():
                print('222222 score environment service: {}'.format(
                    i.css('.comment-list b::text').extract()))
                item['score'] = float(
                    i.css('.comment-list b::text').extract()[0])
                item['environment'] = float(
                    i.css('.comment-list b::text').extract()[1])
                item['service'] = float(
                    i.css('.comment-list b::text').extract()[2])
            print('type location 1: {}'.format(
                i.css('.tag-addr span::text').extract()))
            item['type'] = i.css('.tag-addr span::text').extract()[0].strip()
            item['location'] = i.css(
                '.tag-addr span::text').extract()[1].strip()
            item['address'] = i.css('.addr::text').extract_first().strip()
            getlocation(item)
            item['number'] = item['url'].split('/')[-1]
            yield item

        le = LinkExtractor(restrict_css='div.page > a.next')
        print('4' * 200)
        links = le.extract_links(response)
        if links:
            next_url = links[0].url
            print('next_url:', next_url)
            self.logger.debug('next_url:' + next_url)
            yield Request(next_url, callback=self.parse_list)
Example #30
class LinkSpider(scrapy.Spider):
    name = 'links'

    def __init__(self, *args, **kwargs):

        self.start_url = kwargs['start_urls'][0]
        self.data_paths = get_data_paths(self.start_url)
        self.main_domain = [self.data_paths['domain']]
        self.start_urls = [self.data_paths['base_url']]
        kwargs['start_urls'] = self.start_urls
        self.max_to_scrap = int(kwargs.get('max_to_scrap', 10))
        print(f'Start url: {self.start_urls[0]}, Domain: {self.main_domain}, Max to positives: {self.max_to_scrap}')

        #self.exported_data_path = os.path.join(DATA_OUTPUT_PATH, domain)
        #self.main_domain = kwargs.get('main_domain')
        self.num_scraped = 0
        self.link_extractor = LinkExtractor(allow_domains=self.main_domain)
        super(LinkSpider, self).__init__(*args, **kwargs)

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(LinkSpider, cls).from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.item_scraped, signal=signals.item_scraped)
        return spider

    def item_scraped(self, item):
        self.num_scraped = self.num_scraped + 1

    #def closed(self, reason):
    #    Path(self.data_paths['done']).touch()

    def parse(self, response):
        if self.num_scraped >= self.max_to_scrap:
            # Stop yielding request and items to stop the crawling
            return
        depth = response.meta['depth']
        print(f'current url: {response.url}, depth: {depth}')
        headers = response.headers
        # print(headers['Content-Type'])
        if 'text/html' in str(headers.get('Content-Type', '')):
            # Link extractor by default avoids most extensions
            # but sometimes extensions are not part of the URL
            # For now I put all under this if, but a better way is to
            # implement a Download Middleware (way to go is add more logic to avoid certain content)
            extracted_links = self.link_extractor.extract_links(response)
            for link in extracted_links:
                item = LinkItem()
                item['link'] = link.url
                item['text'] = re.sub(r'\s+', ' ', link.text)
                item['depth'] = depth
                yield item
                # print(f'extracted_link: {link.url}, text: {link.text}')
                # Here we can add more conditions to discard URLs we don't want
                # explore further.
                yield scrapy.Request(
                    response.urljoin(link.url),
                    callback=self.parse)
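
The comments in parse() above note that filtering by Content-Type would be better done in a downloader middleware. A minimal sketch of that idea, assuming a hypothetical HtmlOnlyMiddleware registered in DOWNLOADER_MIDDLEWARES (the class name and message text are illustrative, not from the original project):

from scrapy.exceptions import IgnoreRequest

class HtmlOnlyMiddleware:
    # Hypothetical downloader middleware: drop non-HTML responses before
    # they reach spider callbacks, instead of checking headers in parse().
    def process_response(self, request, response, spider):
        content_type = response.headers.get('Content-Type', b'').decode('latin-1')
        if 'text/html' not in content_type:
            raise IgnoreRequest('skipping non-HTML content: %s' % content_type)
        return response
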
 def parse_region(self, response):
     print('parse_region response.url:' + response.url)
     self.logger.debug('parse_region response.url:' + response.url)
     yield Request(response.url, callback=self.parse_list)
     le = LinkExtractor(restrict_css='#qySelectSecond')
     print('2' * 40)
     for link in le.extract_links(response):
         print(link, link.url, link.text)
         yield Request(link.url, callback=self.parse_list)
Example #32
 def parse_region(self, response):
     print('parse_region response.url:' + response.url)
     self.logger.debug('parse_region response.url:' + response.url)
     yield Request(response.url, callback=self.parse_list_first)
     le = LinkExtractor(restrict_css='div[data-role="ershoufang"] > div:nth-child(2)')
     print('2' * 40)
     for link in le.extract_links(response):
         print(link, link.url, link.text)
         yield Request(link.url, callback=self.parse_list_first)
Example #33
 def parse(self, response):
     print('parse response.url:' + response.url)
     self.logger.debug('parse response.url:' + response.url)
     yield Request(response.url, callback=self.parse_list)
     le = LinkExtractor(restrict_css='.sub-filter-wrapper')
     print('1' * 50)
     for link in le.extract_links(response):
         print(link, link.url, link.text)
         yield Request(link.url, callback=self.parse_list_first)
Example #34
class OffersPlusSpider(scrapy.Spider):
    name = 'offers-plus'
    allowed_domains = ['offers-plus.com']
    start_urls = ['http://offers-plus.com/']
    url = 'http://www.offers-plus.com/categories.php?category=Clothing-%2C-Shoes-%26-Apparel&page={}&sort=newest'
    headline_xpath = '//*[@id="ProductDetails"]/div/h2/text()'
    img_selector = '[class^="ProductThumb"]'
    img_xpath = 'a/img/@src'
    date_grid_selector = '[class^="ProductDetailsGrid"]'
    date_xpath = 'dd[5]/text()'
    domain = 'offers-plus.com'
    MAX_ENTRIES = settings.MAX_ENTRIES

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.extractor = LinkExtractor(
            allow=r'.*/products\.php\?product=.*',
            restrict_xpaths=['//*[@id="frmCompare"]/ul'],
            unique=True)

    def start_requests(self):
        meta = {'index': 1, 'count': 0}
        yield Request(self.url.format(meta['index']),
                      callback=self.parse_outer,
                      meta=meta)

    def parse_outer(self, response):
        count = response.meta['count']
        entries_links = self.extractor.extract_links(response)
        limit = self.MAX_ENTRIES - count
        limit = limit if limit > 0 else 0
        for entry in entries_links[:limit]:
            yield Request(entry.url, callback=self.parse_coupon)
        count += len(entries_links)
        if len(entries_links) == 0 or count > self.MAX_ENTRIES:
            return
        meta = {'index': response.meta['index'] + 1, 'count': count}
        yield Request(self.url.format(meta['index']),
                      callback=self.parse_outer,
                      meta=meta)

    def parse_coupon(self, response):
        item = {'domain': self.domain, 'url': response.url}
        item['headline'] = response.xpath(self.headline_xpath).extract()[0]
        item['date'] = self.get_date(response)
        item['img'] = response.css(self.img_selector).xpath(
            self.img_xpath).extract()[0]
        yield item

    def get_date(self, response):
        date_grid = response.css(self.date_grid_selector)
        date_text = date_grid.xpath(self.date_xpath).extract()[0]
        try:
            return parse(date_text).date()
        except TypeError:
            return None
Example #35
 def parse_region(self, response):
     print('parse_region response.url:' + response.url)
     self.logger.debug('parse_region response.url:' + response.url)
     yield Request(response.url, callback=self.parse_list)
     le = LinkExtractor(
         restrict_css='.items-mod > div:nth-child(1) > div > div.sub-items')
     print('2' * 40)
     for link in le.extract_links(response):
         print(link, link.url, link.text)
         yield Request(link.url, callback=self.parse_list)
Example #36
    def parse(self, response):
        le = LinkExtractor()
        user_profiles = []
        for link in le.extract_links(response):
            result = re.search(r'.*(http://www.last.fm/user/.*)', link.url)
            if result:
                user_profiles.append(result.group(1))

        for user_profile in user_profiles:
            print(user_profile)
Example #37
 def parse(self, response):
     print('parse response.url:' + response.url)
     self.logger.debug('parse response.url:' + response.url)
     yield Request(response.url, callback=self.parse_list)
     le = LinkExtractor(restrict_css='div.screen_al > ul > li:nth-child(1) > ul')
     print('1' * 20)
     for link in le.extract_links(response):
         print(link, link.url, link.text)
         self.logger.debug(link)
         yield Request(link.url, callback=self.parse_region)
Example #38
 def parse_region(self, response):
     print('parse_region response.url:' + response.url)
     self.logger.debug('parse_region response.url:' + response.url)
     yield Request(response.url, callback=self.parse_list_decoration)
     le = LinkExtractor(
         restrict_css='#J_shopsearch > div:nth-child(2) > div > ul')
     print('2' * 100)
     for link in le.extract_links(response):
         print(link, link.url, link.text)
         yield Request(link.url, callback=self.parse_classfy)
Example #39
 def parse(self, response):
     item = PageItem()
     extractor = LinkExtractor(allow_domains='davidwatson.org')
     links = extractor.extract_links(response)
     item['url'] = response.url
     item['html'] = response.body
     item['links'] = [link.url for link in links]
     for link in links:
         yield scrapy.Request(link.url, callback=self.parse)
     yield item
    def parse_second(self, response):
        print('2' * 20)
        first_category = response.meta.get('first_category')
        print('parse_second response.url:' + response.url)
        self.logger.debug('parse_second response.url:' + response.url)

        if response.xpath('//*[@id="navigation"]/ul/li[1]/div[1]/text()'
                          ).extract_first() == '分类':
            le = LinkExtractor(
                restrict_xpaths=
                '//*[@id="navigation"]/ul/li[@dd_name="分类"]/div[2]/div[1]/div')
            for link in le.extract_links(response):
                print(link.url, link.text)
                self.logger.debug('second_category {},{}'.format(
                    link.url, link.text))
                second_category = link.text
                yield Request(link.url,
                              callback=self.parse_third,
                              meta={
                                  'first_category': first_category,
                                  'second_category': second_category
                              })
        elif '价格' in response.xpath(
                '//*[@id="navigation"]/ul/li/@dd_name').extract():
            le = LinkExtractor(
                restrict_xpaths=
                '//*[@id="navigation"]/ul/li[@dd_name="价格"]/div[2]/div[1]/div')
            for link in le.extract_links(response):
                print(link.url, link.text)
                # third_category = link.text
                yield Request(link.url,
                              callback=self.parse_books,
                              meta={'first_category': first_category})
        else:
            le = LinkExtractor(
                restrict_xpaths=
                '//*[@id="navigation"]/ul/li[@dd_name="折扣"]/div[2]/div[1]/div')
            for link in le.extract_links(response):
                print(link.url, link.text)
                # third_category = link.text
                yield Request(link.url,
                              callback=self.parse_books,
                              meta={'first_category': first_category})
Example #41
 def parse_hall(self, response):
     print('parse_hall response.url:' + response.url)
     self.logger.debug('parse_hall response.url:' + response.url)
     yield Request(response.url, callback=self.parse_list)
     le = LinkExtractor(restrict_css='#list_D02_12 > ul')
     print('3' * 80)
     for link in le.extract_links(response):
         print(link, link.url, link.text)
         self.logger.debug(link)
         yield Request(link.url, callback=self.parse_list)
Example #42
    def parse_code(self, response):
        # Extract the URL of the source code file
#        le = LinkExtractor(restrict_css='div.bodywrapper p', allow='matplotlib.org/examples')
#        link = le.extract_links(response)
        le = LinkExtractor(restrict_css='a.reference.external')
        link = le.extract_links(response)
        
        file = FilesItem()
        file['file_urls'] = [link[0].url]
        return file
Example #43
 def parse(self, response):
     name = 'example'
     lx = LinkExtractor()
     lst = lx.extract_links(response)  # List contains the list of jobs
     # Call the function which compares between lst and MongoDB. Return Boolean Value
     flag = compare(name, lst)
     # if True, call the function which send an email to users
     if flag:
         notify(name)
     else:
         print("No Update")
Example #44
 def parse(self, response):
     link_extractor = LinkExtractor()
     links = link_extractor.extract_links(response)
     for link in links:
         item = DomainItem()
         item['link'] = link.url
         item['domain'] = self.getHost(link.url)
         yield item
     for link in links:
         if (not db.scrapy_items.find_one({'link': link.url})):
             yield scrapy.Request(link.url, callback=self.parse)
Example #45
 def parse(self, response):
     le = LinkExtractor()
     for link in le.extract_links(response):
         yield SplashRequest(
             link.url,
             self.parse_link,
             endpoint='render.json',
             args={
                 'har': 1,
                 'html': 1,
             }
         )
 def parse(self, response):
     e = LinkExtractor()
     urls = [link.url for link in e.extract_links(response)]
     for url in urls:
         if response.url != url:
             yield self.add_url(url)
     if urls:
         qs = urlparse.parse_qs(urlparse.urlparse(response.url).query)
         qs = dict((k, v[0]) for (k, v) in qs.iteritems())
         qs['p'] = int(qs['p']) + 1
         url = 'http://comeon5678.com/event/list'
         yield scrapy.Request('%s?%s' % (url, urllib.urlencode(qs)))
Example #47
class GeneralSpider(Spider):
    name = "general"

    def __init__(self, *args, **kwargs):
        super(GeneralSpider, self).__init__(*args, **kwargs)
        self.le = LinkExtractor()

    def parse(self, response):
        if not isinstance(response, HtmlResponse):
            return

        for link in self.le.extract_links(response):
            r = Request(url=link.url)
            r.meta.update(link_text=link.text)
            yield r
class FundaSpider(CrawlSpider):

    name = "funda_spider"
    allowed_domains = ["funda.nl"]

    def __init__(self, place='amsterdam'):
        self.start_urls = ["http://www.funda.nl/koop/%s/p%s/" % (place, page_number) for page_number in range(1,301)]
        self.base_url = "http://www.funda.nl/koop/%s/" % place
        self.le1 = LinkExtractor(allow=r'%s+(huis|appartement)-\d{8}' % self.base_url)

    def parse(self, response):
        links = self.le1.extract_links(response)
        for link in links:
            if link.url.count('/') == 6 and link.url.endswith('/'):
                item = FundaItem()
                item['url'] = link.url
                if re.search(r'/appartement-',link.url):
                    item['property_type'] = "apartment"
                elif re.search(r'/huis-',link.url):
                    item['property_type'] = "house"
                yield scrapy.Request(link.url, callback=self.parse_dir_contents, meta={'item': item})

    def parse_dir_contents(self, response):
        new_item = response.request.meta['item']
        title = response.xpath('//title/text()').extract()[0]
        postal_code = re.search(r'\d{4} [A-Z]{2}', title).group(0)
        city = re.search(r'\d{4} [A-Z]{2} \w+',title).group(0).split()[2]
        address = re.findall(r'te koop: (.*) \d{4}',title)[0]
        price_dd = response.xpath("//dt[contains(.,'Vraagprijs')]/following-sibling::dd[1]/text()").extract()[0]
        price = re.findall(r' \d+.\d+', price_dd)[0].strip().replace('.','')
        year_built_dd = response.xpath("//dt[contains(.,'Bouwjaar')]/following-sibling::dd[1]/text()").extract()[0]
        year_built = re.findall(r'\d+', year_built_dd)[0]
        area_dd = response.xpath("//dt[contains(.,'Woonoppervlakte')]/following-sibling::dd[1]/text()").extract()[0]
        area = re.findall(r'\d+', area_dd)[0]
        rooms_dd = response.xpath("//dt[contains(.,'Aantal kamers')]/following-sibling::dd[1]/text()").extract()[0]
        rooms = re.findall(r'\d+ kamer', rooms_dd)[0].replace(' kamer', '')
        bedrooms = re.findall(r'\d+ slaapkamer', rooms_dd)[0].replace(' slaapkamer', '')

        new_item['postal_code'] = postal_code
        new_item['address'] = address
        new_item['price'] = price
        new_item['year_built'] = year_built
        new_item['area'] = area
        new_item['rooms'] = rooms
        new_item['bedrooms'] = bedrooms
        new_item['city'] = city
        yield new_item
Example #49
    def parse_main(self, response):
        le = LinkExtractor(allow=KADENCJA_RE)
        links = le.extract_links(response)
        self.print_links("kadencje", links)
        cnt = 0

        for link in links:
            yield scrapy.Request(link.url, callback=self.parse_kadencja)
            k = items.PageItem()
            k["text"] = link.text.encode("utf8")
            k["url"] = link.url
            k["ref"] = response.url
            k["order"] = cnt
            yield k
            if cnt >= DEBUG_CNT and DEBUG:
                break
            cnt += 1
 def print_url(self, response):
     """
         @url http://www.ura.org.hk/en/schemes-and-policies/redevelopment/ura-implemented-projects/reimbursement.aspx
         @returns items 1 1
         @returns requests 0 0
         @scrapes title link html text last_updated file_urls
     """
     l = ItemLoader(item=UrbanRenewalItem(), response=response)
     l.add_xpath('title', '//title')
     l.add_value('link', response.url)
     l.add_xpath('text', '//div[@id="content"]')
     l.add_xpath('html', '/html')
     l.add_xpath('last_updated', '//div[@class="lastUpdated"]')
      lx = LinkExtractor(allow=[r'\.' + ext for ext in file_extension],
                        deny_extensions=())
     l.add_value('file_urls', [link.url for link in lx.extract_links(response)])
     return l.load_item()
Example #51
0
    def parse(self, response):
        for sel in response.css('article.product_pod'):
            book = BookstoresItem()
            book['name'] = sel.xpath('./h3/a/@title').extract_first()
            book['price'] = sel.css('p.price_color::text').extract_first()
            yield book
            
        # Extract the next-page link
#        next_url = response.css('ul.pager li.next a::attr(href)').extract_first()
#        if next_url:
#            next_url = response.urljoin(next_url)
#            yield scrapy.Request(next_url,callback=self.parse)
        le = LinkExtractor(restrict_css='ul.pager li.next')
        links = le.extract_links(response)
        if links:
            next_url = links[0].url
            yield scrapy.Request(next_url, callback=self.parse)
Example #52
0
 def parse_uchwala(self, response):
     # generate list of files to download
     le = LinkExtractor(allow=FindReportsSpider.PLIK_RE)
     links = le.extract_links(response)
     self.print_links("files", links)
     cnt = 0
     for link in links:
         fi = items.FiledownloadItem()
         fi["file_urls"] = [link.url]
         fi["text"] = link.text.encode("utf8")
         fi["url"] = link.url
         fi["ref"] = response.url
         fi["order"] = cnt
         yield fi
         if cnt >= DEBUG_CNT and DEBUG:
             break
         cnt += 1
Example #53
    def parse(self, response):
        print(response.url)

        # Extract internal links from webpage
        IGNORED_EXTENSIONS.append('gz')
        IGNORED_EXTENSIONS.append('tar')
        urlextract = LinkExtractor(allow_domains=self.allowed_domains)

        # Store internal links
        links = urlextract.extract_links(response)
        links = [l.url for l in links]
        if response.url not in self.data:
            self.data[response.url] = links
        yield

        # Follow internal links
        for url in links:
            yield scrapy.Request(url, self.parse)
Example #54
 def parse_kadencja(self, response):
     #    'LIX Sesja Rady Miasta 24 września 2014 r.'
     #    'http://www.bip.olsztyn.eu/bip/dokument/305103/lix_sesja_rady_miasta_24_wrzesnia_2014_r_/'
     le = LinkExtractor(allow=FindReportsSpider.SESJA_RE)
     links = le.extract_links(response)
     self.print_links("sesje", links)
     cnt = 0
     for link in links:
         yield scrapy.Request(link.url, callback=self.parse_sesja)
         k = items.PageItem()
         k["text"] = link.text.encode("utf8")
         k["url"] = link.url
         k["ref"] = response.url
         k["order"] = cnt
         yield k
         if cnt >= DEBUG_CNT and DEBUG:
             break
         cnt += 1
Example #55
 def parse_item(self, response):
     self.write_response(response.url, response)
     
     print("----------------------------------", response.real_url, response.url)
     
     le = LinkExtractor()
     for link in le.extract_links(response):
         splashRequestObj = SplashRequest(
             link.url,
             self.parse_item,
             endpoint='render.html',
             args={
                 'wait':0.8,
                 'html': 1,
             }
         )
         
         yield splashRequestObj
Example #56
 def parse(self, response):
     self.write_response(response.url, response)
     
      url = response.url.lower()
      if "cisco.com/en/us/docs" in url or "cisco.com/c/en/us/td/docs" in url or "register" in url:
          return
     
     le = LinkExtractor()
     for link in le.extract_links(response):
         splashRequestObj = SplashRequest(
             link.url,
             self.parse,
             endpoint='render.html',
             args={
                 'wait':0.8,
                 'html': 1,
             }
         )
         
         yield splashRequestObj
class NumberOfPagesSpider(CrawlSpider):
    name = "number_of_pages"
    allowed_domains = ["funda.nl"]

    def __init__(self, place='amsterdam'):
        self.start_urls = ["http://www.funda.nl/koop/%s/" % place]
        self.le_maxpage = LinkExtractor(allow=r'%s+p\d+' % self.start_urls[0])
        rules = (Rule(self.le_maxpage, ),)

    def parse(self, response):
        links = self.le_maxpage.extract_links(response)
        max_page_number = 0                                                 # Initialize the maximum page number
        for link in links:
            if link.url.count('/') == 6 and link.url.endswith('/'):         # Select only pages with a link depth of 3
                page_number = int(link.url.split("/")[-2].strip('p'))       # For example, get the number 10 out of the string 'http://www.funda.nl/koop/amsterdam/p10/'
                if page_number > max_page_number:
                    max_page_number = page_number                           # Update the maximum page number if the current value is larger than its previous value
        filename = "max_pages.txt"                         # File name with as prefix the place name
        with open(filename,'wb') as f:
            f.write('max_page_number = %s' % max_page_number)               # Write the maximum page number to a text file
class GeneralSpider(Spider):
    name = 'general'

    def __init__(self, *args, **kwargs):
        super(GeneralSpider, self).__init__(*args, **kwargs)
        f = open("seeds_es_smp.txt")
        la = [urlparse(url.strip()).netloc for url in f.readlines()]
        f.close()
        self.la = la
        self.le = LinkExtractor()

    def parse(self, response):
        if not isinstance(response, HtmlResponse):
            return

        for link in self.le.extract_links(response):
            netloc = urlparse(link.url).netloc
            if netloc in self.la:
                r = Request(url=link.url)
                r.meta.update(link_text=link.text)
                yield r
Example #59
  def parse_item(self, response):
    internal_item = InternalItem()
    internal_item["url"] = response.url
    yield internal_item

    #Use the inbuilt LinkExtractor to find urls, filtering out internal urls
    extractor_external = LinkExtractor(deny_domains=self.allowed_domains)
    external_links = extractor_external.extract_links(response)
    for link in external_links:
      external_item = ExternalItem()
      external_item["url"] = link.url
      yield external_item

    for src in response.css("img::attr('src')"):
      asset_item = AssetItem()
      asset_item["url"] = response.urljoin(src.extract())
      yield asset_item

    for src in response.css("script::attr('src')"):
      asset_item = AssetItem()
      asset_item["url"] = response.urljoin(src.extract())
      yield asset_item
Example #60
def extract_links(response, xpaths, tag=None, attr=None):
    """Extract links on a page matching given XPaths.

    :param response:    Scrapy response whose body contains links to extract
    :type response:     :class:`scrapy.http.Response`
    :param xpaths:      a single XPath or an iterable of XPaths matching the
                        links to extract
    :type xpaths:       `unicode` or `iterable` of `unicode`
    :param tag:         tag name from which to extract links
    :type tag:          `unicode`
    :param attr:        attribute name in the :data:`tag` tag from which to
                        extract links
    :type attr:         `unicode`
    :yield:             extracted links (canonicalized URLs), directly usable
                        as :data:`scrapy.http.Request.url` parameters
    :rtype:             `generator` of `unicode`

    """
    # Construct LinkExtractor parameters
    extractor_attrs = {
        'restrict_xpaths': xpaths,
        'canonicalize': True,
        }
    if tag:
        extractor_attrs['tags'] = (tag,)
    if attr:
        extractor_attrs['attrs'] = (attr,)

    # Extract links
    link_extractor = LinkExtractor(**extractor_attrs)
    links = link_extractor.extract_links(response)

    # Generate links
    for link in links:
        yield link.url
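
A minimal usage sketch for the helper above, as it might be called from a spider callback (the spider name, start URL, and XPath are illustrative assumptions):

import scrapy

class DocsSpider(scrapy.Spider):
    # Hypothetical spider that follows every link found by extract_links().
    name = 'docs'
    start_urls = ['https://docs.example.org/index.html']  # placeholder URL

    def parse(self, response):
        # Restrict extraction to anchors inside the sidebar navigation.
        for url in extract_links(response, '//div[@class="sidebar"]', tag='a', attr='href'):
            yield scrapy.Request(url, callback=self.parse)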