Code Example #1
    def parse(self, response):
        # The index page lays its movie links out in two fixed-width table
        # columns; grab each column's HTML and pull the anchors out of it.
        movies_1 = response.xpath('//td[@width="313"]').extract()[0]
        movie_links1 = Selector(text=movies_1).xpath('//a/@href').extract()

        movies_2 = response.xpath('//td[@width="252"]').extract()[0]
        movie_links2 = Selector(text=movies_2).xpath('//a/@href').extract()

        # Combine both columns and schedule a request per movie page.
        movie_links1.extend(movie_links2)
        for link in movie_links1:
            next_link = 'http://www.onthesetofnewyork.com/' + link
            yield Request(next_link, callback=self.other_parse_meth)
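The snippet above is a bare parse method; its imports and enclosing class were stripped by the example extractor. A minimal sketch of the scaffolding it assumes follows (the spider class name, spider name, and start URL are illustrative assumptions, not from the original project):

import scrapy
from scrapy import Request, Selector

class MovieLinkSpider(scrapy.Spider):
    # Hypothetical wrapper; only parse() above comes from the example.
    name = 'onthesetofnewyork'  # assumed name
    start_urls = ['http://www.onthesetofnewyork.com/']

    def other_parse_meth(self, response):
        # Stand-in for the per-movie callback referenced by parse();
        # the real implementation is not shown in the example.
        pass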
Code Example #2
File: stack_spider.py  Project: XLab-Tongji/OpenStack
    def parse(self, response):
        # GET URL
        # questions = Selector(response).xpath('//div[@class="summary"]/h3')
        #
        # for question in questions:
        #     item = QLinkItem()
        #     item['title'] = question.xpath(
        #         'a[@class="question-hyperlink"]/text()').extract()[0]
        #     item['url'] = question.xpath(
        #         'a[@class="question-hyperlink"]/@href').extract()[0]
        #     yield item
        # questions = Selector(response).xpath('//div[@class="summary"]/h3')

        ###### get q info
        # item = QuestionItem()
        # title = Selector(response).xpath('//div[@id="question-header"]/h1/a/text()').extract()[0]
        # url = Selector(response).xpath('//div[@id="question-header"]/h1/a/@href').extract()[0]
        # mainbar = Selector(response).xpath('//div[@id="question"]')
        # # print mainbar
        # votes = Selector(response).xpath('//div[@id="question"]/table/tr/td/div/span/text()').extract()[0]
        # desc = str(Selector(response).xpath('//td[@class="postcell"]/div/div[@class="post-text"]').extract()[0]).strip()
        # tags = Selector(response).xpath('//td[@class="postcell"]/div/div[@class="post-taglist"]/a/text()').extract()
        # start_time = Selector(response).xpath('//td[@class="post-signature owner"]/div/div/span[@class="relativetime"]/@title').extract()[0]
        # update_time = Selector(response).xpath('//div[@class="user-action-time"]/span[@class="relativetime"]/@title').extract()[0]
        # user_id = Selector(response).xpath(
        #     '//td[@class="post-signature owner"]/div/div[@class="user-details"]/a/@href').extract()
        # item['source'] = 'stackoverflow'
        # item['title'] = title
        # item['url'] = url
        # item['votes'] = votes
        # item['desc'] = desc
        # item['tag'] = tags
        # item['user_id'] = user_id
        # item['start_time'] = start_time
        # item['update_time'] = update_time
        # yield item

        ##### get answer info
        answers = Selector(response).xpath('//div[@class="answer"]')
        ac_answer = Selector(response).xpath('//div[@class="answer accepted-answer"]')
        answers.extend(ac_answer)
        url = Selector(response).xpath('//div[@id="question-header"]/h1/a/@href').extract()[0]
        print "!!!!!!!!!!!"
        print len(answers)
        for answer in answers:
            item = AnswerItem()
            votes = answer.xpath('table/tr/td/div/span/text()').extract()[0]
            desc = answer.xpath('table/tr/td[@class="answercell"]/div[@class="post-text"]').extract()[0]
            user_ids = answer.xpath('table/tr/td[@class="answercell"]/table/tr/td[@class="post-signature"]/div/div[@class="user-details"]/a/@href').extract()
            action_time = answer.xpath('table/tr/td[@class="answercell"]/table/tr/td[@class="post-signature"]/div/div[@class="user-action-time"]/span/@title').extract()
            self.logger.debug('action_time: %s', action_time)
            # Default both timestamps to None so the item assignments below
            # never hit a NameError when extraction finds nothing.
            start_time = update_time = None
            if len(action_time) == 1:
                start_time = action_time[0]
                update_time = action_time[0]
            elif len(action_time) == 2:
                # An edited answer lists its edit time first in the markup.
                start_time = action_time[1]
                update_time = action_time[0]
            if len(user_ids) == 1:
                user_id = user_ids[0]
            elif len(user_ids) == 2:
                user_id = user_ids[1]
            else:
                user_id = ''
            item['votes'] = votes
            item['desc'] = desc
            item['user_id'] = user_id
            item['start_time'] = start_time
            item['update_time'] = update_time
            item['url'] = url
            yield item
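The loop above fills an AnswerItem, and the commented-out block fills a QuestionItem, both defined elsewhere in the project. Below is a minimal sketch of item classes consistent with the fields this snippet uses; the actual definitions in XLab-Tongji/OpenStack may differ:

import scrapy

class AnswerItem(scrapy.Item):
    # Fields populated by the answer loop above.
    votes = scrapy.Field()
    desc = scrapy.Field()
    user_id = scrapy.Field()
    start_time = scrapy.Field()
    update_time = scrapy.Field()
    url = scrapy.Field()

class QuestionItem(scrapy.Item):
    # Fields referenced by the commented-out question block.
    source = scrapy.Field()
    title = scrapy.Field()
    url = scrapy.Field()
    votes = scrapy.Field()
    desc = scrapy.Field()
    tag = scrapy.Field()
    user_id = scrapy.Field()
    start_time = scrapy.Field()
    update_time = scrapy.Field()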
Code Example #3
    def parse_link(self, response):
        """Find all resources that coincide with what was specified and also 
        find all followable links.
        
        Arguments:
            response -- returned Request object

        Returns:
            generator -- yields Resource Items followed by followable Requests

        """
        url = response.url
        if url in self.parsed:
            return
        else:
            self.parsed.add(url)

        mimetype = response.headers['Content-Type']

        # Extract links.
        if 'text/javascript' in mimetype:
            # Perhaps other cases as well, but I've seen JS text files get
            # parsed as HTML when they contained markup, so skip them.
            resources = Selector(text='')
        else:
            try:
                # Consider also collecting background-image URLs from [style]
                # attributes (only inline styles are reachable here); strip
                # the 'url(' prefix and ')' suffix if that gets implemented.
                resources = response.css('[href],[src]')
            except AttributeError:
                resources = Selector(text='')
        # We only want to crawl normal stuff.  hrefs will be used to filter
        # returned Requests.
        hrefs = resources.css('[href]::attr(href)').extract()
        resources = resources.css('[src]::attr(src)').extract()
        resources.extend(hrefs)

        requests = set()
        for link in resources:
            link = link.strip()
            linkp = urlparse(link)
            if linkp.scheme in ('mailto', 'tel') or link.startswith(
                    ('#', 'mailto:', 'tel:')):
                # URLs to ignore.
                continue
            elif not linkp.scheme or not linkp.netloc:
                # Resolve relative and scheme-relative URLs against the page
                # they came from; urljoin copes with '..' segments, which the
                # old os.path.join concatenation did not.
                link = urljoin(url, link)

            # Determine if examined this URL before.
            mimetype = size = None
            if link not in self.seen:
                self.seen.add(link)
                # Get URL header information: mimetype, size
                if self.optimize:
                    mimetype, encoding = mimetypes.guess_type(link)
                if mimetype is None:
                    mimetype, size = self.get_header_info(link)

            if mimetype:
                # Yield Items.
                if any(mt in mimetype
                       for mt in self.mimetypes) and link not in self.found:
                    size = ResourceSpider.bytes2human(
                        int(size) if size is not None else size)
                    count = len(self.found) + 1
                    if count == 1:
                        log.msg('%5s %-16s %-8s %-64s' %
                                ('COUNT', 'MIMETYPE', 'SIZE', 'REFERRER'),
                                level=log.INFO)
                    log.msg('%4d: %-16s %-8s %-64s' %
                            (count, mimetype, size, link),
                            level=log.INFO)
                    # MIME type format example: 'text/html; charset=utf-8'
                    self.found.add(link)
                    yield ResourceItem(url=link,
                                       mimetype=mimetype,
                                       size=size,
                                       referrer=url)
                # Build Requests.
                if self.follow and any(href.strip() in link for href in hrefs):
                    requests.add(link)

        # Yield Requests after having yielded Items.
        for link in requests:
            if link not in self.requested and self.isallowed(link):
                self.requested.add(link)
                yield Request(link, callback=self.parse_link)
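parse_link relies on bookkeeping state and helpers defined elsewhere in ResourceSpider. A minimal sketch of what the method assumes follows; the attribute and helper names come from the snippet itself, but every body and default value below is illustrative only:

import mimetypes
import scrapy
from urllib.parse import urlparse, urljoin  # Python 2: from urlparse import urlparse, urljoin

class ResourceSpider(scrapy.Spider):
    name = 'resources'  # assumed name

    def __init__(self, *args, **kwargs):
        super(ResourceSpider, self).__init__(*args, **kwargs)
        # Bookkeeping sets consulted by parse_link().
        self.parsed = set()     # URLs whose bodies were already parsed
        self.seen = set()       # URLs whose headers were already examined
        self.found = set()      # URLs already yielded as ResourceItems
        self.requested = set()  # URLs already scheduled as Requests
        self.mimetypes = ['image/']  # assumed: substrings of wanted MIME types
        self.optimize = True    # guess types locally before remote lookups
        self.follow = True      # whether to follow hyperlinks

    def isallowed(self, link):
        # Illustrative: restrict the crawl to the spider's allowed domains.
        return urlparse(link).netloc in getattr(self, 'allowed_domains', ())

    def get_header_info(self, link):
        # Illustrative stub; the real helper presumably issues a HEAD
        # request and returns (mimetype, size) from the response headers.
        return None, None

    @staticmethod
    def bytes2human(size):
        # Illustrative: render a byte count as 12.3K, 4.0M, and so on.
        if size is None:
            return 'n/a'
        for unit in ('B', 'K', 'M', 'G', 'T'):
            if size < 1024:
                return '%.1f%s' % (size, unit)
            size /= 1024.0
        return '%.1fP' % size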