Ejemplo n.º 1
0
    def parse(self, response):
        """Yield a follow-up request per repository, then paginate.

        Each repository row produces a partially-filled item (name and
        update time); the repo's own page is then requested to gather
        commit/branch/release counts, with the item riding along in meta.
        """
        for entry in response.css('div#user-repositories-list li'):
            item = ShiyanlouItem()
            item['name'] = entry.css(
                'div.d-inline-block.mb-1 h3 a::text').re_first('\S+')
            item['update_time'] = entry.xpath(
                './/div[@class="f6 text-gray mt-2"]/relative-time/@datetime'
            ).extract_first()

            # Hand the unfinished item to parse_info via request.meta.
            follow_up = response.follow(
                entry.css('div.d-inline-block.mb-1 h3 a::attr(href)')[0],
                callback=self.parse_info)
            follow_up.meta['item'] = item
            yield follow_up

        # Find the anchor whose text is "Next"; on the last page no such
        # anchor exists and next_page remains None.
        next_page = None
        hrefs = response.css('div.pagination a::attr(href)').extract()
        texts = response.css('div.pagination a::text').extract()
        for href, text in zip(hrefs, texts):
            if text == 'Next':
                next_page = href
        if next_page:
            yield response.follow(next_page, callback=self.parse)
Ejemplo n.º 2
0
	def parse(self, response):
		"""Emit name/update_time for every repository row on the page."""
		for repo_sel in response.css('li.col-12'):
			yield ShiyanlouItem({
				'name': repo_sel.css('div.d-inline-block a::text').re_first('\s*([-\w]+)\s*'),
				'update_time': repo_sel.css('relative-time::attr(datetime)').extract_first(),
			})
Ejemplo n.º 3
0
    def parse(self, response):
        """Yield one item per owned repository, then follow the Next page.

        Changes: removed the block of commented-out dead pagination code, and
        guarded the name extraction so a missing anchor text yields an empty
        string instead of raising AttributeError on ``.strip()``.
        """
        repos = response.xpath('//li[@itemprop="owns"]')

        for repo in repos:
            item = ShiyanlouItem()
            # default='' keeps .strip() from crashing when no text matches.
            item["repo_name"] = repo.xpath(
                ".//a[@itemprop='name codeRepository']/text()").extract_first(
                default='').strip()
            item["update_time"] = repo.xpath(
                ".//relative-time/@datetime").extract_first()

            yield item

        # The paginator's last anchor reads "Next" only when more pages exist.
        pages = response.css("div.paginate-container a:last-child::text")
        if pages.extract_first() == "Next":
            next_url = response.css(
                "div.paginate-container a:last-child::attr(href)"
            ).extract_first()

            yield response.follow(next_url, callback=self.parse)
Ejemplo n.º 4
0
 def parse_item(self, response):
     """Yield a fresh item per repository row.

     Bug fixed: the original created a single ShiyanlouItem before the loop
     and yielded the same instance repeatedly, so later iterations mutated
     items already handed to the pipeline. The item is now created inside
     the loop.
     """
     for data in response.css('li.col-12'):
         item = ShiyanlouItem()
         item['repo_name'] = data.xpath(
             './/h3/a/text()').extract_first().strip()
         item['update_time'] = data.xpath(
             './/relative-time/@datetime').extract_first()
         yield item
Ejemplo n.º 5
0
 def parse(self, response):
     """One item (name, update_time) per repository list entry."""
     for entry in response.css('li.col-12'):
         name = entry.css('a::text').extract_first().strip()
         updated = entry.css('relative-time::attr(datetime)').extract_first()
         yield ShiyanlouItem({'name': name, 'update_time': updated})
Ejemplo n.º 6
0
 def parse(self, response):
     """Scrape repo name and last-update timestamp for each owned repo."""
     for owned in response.xpath('//li[@itemprop="owns"]'):
         yield ShiyanlouItem({
             'repo_name': owned.xpath(".//a[@itemprop='name codeRepository']/text()").extract_first().strip(),
             'update_time': owned.xpath(".//relative-time/@datetime").extract_first(),
         })
 def parse(self, response):
     """Pair the name and datetime node lists positionally into items."""
     names = response.xpath('//*[@id="user-repositories-list"]/ul//div[1]/h3/a/text()').re(r'\s*(.+)')
     times = response.xpath('//*[@id="user-repositories-list"]/ul/li/div[3]/relative-time/@datetime').extract()
     for repo_name, repo_time in zip(names, times):
         yield ShiyanlouItem({'name': repo_name, 'update_time': repo_time})
Ejemplo n.º 8
0
 def parse(self, response):
     """Build one item per repository entry."""
     for entry in response.css('li.col-12'):
         item = ShiyanlouItem()
         item['name'] = entry.xpath(".//h3/a/text()").extract_first().strip()
         item['update_time'] = entry.xpath('.//relative-time/@datetime').extract_first()
         yield item
Ejemplo n.º 9
0
 def parse(self, response):
     """Queue a detail-page request per repository, item carried in meta."""
     for row in response.css('li.col-12'):
         item = ShiyanlouItem({
             'name': row.css('div.d-inline-block.mb-1 a::text').re_first('[^\s]+[\w]*'),
             'update_time': row.xpath('.//relative-time/@datetime').extract_first(),
         })
         detail = scrapy.Request(
             response.urljoin(row.xpath('.//a/@href').extract_first()),
             callback=self.parse_info)
         detail.meta['item'] = item
         yield detail
Ejemplo n.º 10
0
 def parse(self, response):
     """Emit name/update_time for every public repository."""
     for repo_sel in response.css('li.public'):
         item = ShiyanlouItem()
         item['name'] = repo_sel.xpath('.//a[@itemprop="name codeRepository"]/text()').re_first("\n\s*(.*)")
         item['update_time'] = repo_sel.xpath('.//relative-time/@datetime').extract_first()
         yield item
Ejemplo n.º 11
0
 def parse(self, response):
     """Yield one item per repository row."""
     for row in response.css('li.col-12'):
         item = ShiyanlouItem()
         item['name'] = row.css('div.d-inline-block.mb-1 a::text').re_first(
             '[^\s]+[\w]*')
         item['update_time'] = row.xpath('.//relative-time/@datetime').extract_first()
         yield item
Ejemplo n.º 12
0
    def parse(self, response):
        """Scrape repository name and last-modified datetime per list row."""
        for entry in response.xpath('//*[@id="user-repositories-list"]/ul/li'):
            yield ShiyanlouItem({
                'repo_name': entry.xpath(
                    './/a[@itemprop="name codeRepository"]/text()'
                ).extract_first().strip(),
                'update_time': entry.xpath(
                    './/@datetime').extract_first().strip(),
            })
Ejemplo n.º 13
0
 def parse(self, response):
     """Queue a per-repo request; the half-built item rides along in meta."""
     for repo_sel in response.css('#user-repositories-list ul li'):
         item = ShiyanlouItem()
         item['name'] = repo_sel.xpath('.//div[1]/h3/a/text()').re_first(
             r'\n\s*([\w_-]*)')
         item['update_time'] = repo_sel.xpath(
             './/div[3]/relative-time/@datetime').extract_first()
         detail_url = response.urljoin(
             repo_sel.xpath('.//div[1]/h3/a/@href').extract_first())
         detail = scrapy.Request(detail_url, callback=self.parse_otherinfo)
         detail.meta['item'] = item
         yield detail
Ejemplo n.º 14
0
 def parse(self, response):
     """For each public repo, emit a request for its detail page."""
     for repo_sel in response.css('li.public'):
         item = ShiyanlouItem({
             'name': repo_sel.xpath(
                 './/a[@itemprop="name codeRepository"]/text()').re_first(
                     '\n\s*(.*)'),
             'update_time': repo_sel.xpath(
                 './/relative-time/@datetime').extract_first(),
         })
         detail = scrapy.Request(
             response.urljoin(repo_sel.xpath('.//a/@href').extract_first()),
             callback=self.parse_repo)
         detail.meta['item'] = item
         yield detail
Ejemplo n.º 15
0
 def parse(self, response):
     """Yield items for this page, then follow any "Next" pagination link."""
     for block in response.xpath('//div[@class="col-9 d-inline-block"]'):
         item = ShiyanlouItem()
         item['repo_name'] = block.xpath('.//h3/a/text()').re_first('\\n\\s*(.+)')
         # The datetime is clipped to 20 chars straight out of the tag markup.
         item['update_time'] = block.xpath('.//relative-time').re_first(
             '<relative-time datetime="(.{20})')
         yield item
     for next_link in response.xpath(
             '//div[@class="pagination"]/a[contains(text(),"Next")]'):
         yield response.follow(next_link, callback=self.parse)
Ejemplo n.º 16
0
 def parse(self, response):
     """Emit an item per fully-qualified public repository <li>."""
     for entry in response.css(
             'li[class="col-12 d-block width-full py-4 border-bottom public source"]'
     ):
         name = entry.css('div[class="d-inline-block mb-1"] a::text').re_first('[^\w]*(\w*)[^\w]*')
         updated = entry.css(
             'div[class="f6 text-gray mt-2"] relative-time::attr(datetime)'
         ).extract_first()
         yield ShiyanlouItem({'name': name, 'update_time': updated})
Ejemplo n.º 17
0
 def parse(self, response):
     """Send a follow-up request per public repo with the item in meta."""
     for entry in response.css('li.public'):
         item = ShiyanlouItem()
         item['name'] = entry.css(
             'div[class="d-inline-block mb-1"] a::text').re_first(
                 '[^\w]*(\w*)[^\w]*')
         item['update_time'] = entry.css(
             'div[class="f6 text-gray mt-2"] relative-time::attr(datetime)'
         ).extract_first()
         detail = scrapy.Request(
             response.urljoin(entry.xpath('.//a/@href').extract_first()),
             callback=self.parse_repo)
         detail.meta['item'] = item
         yield detail
Ejemplo n.º 18
0
 def parse(self, response):
     """Yield one item per course card.

     Bug fixed: the original's ``yield item`` sat outside the for loop, so
     only the last course was ever emitted — and a page with no matching
     cards raised NameError on the unbound ``item``. The yield now happens
     inside the loop, once per course.
     """
     for course in response.css('div.course-body'):
         item = ShiyanlouItem({
             'name':
             course.css('div.course-name::text').extract_first(),
             'description':
             course.css('div.course-desc::text').extract_first(),
             'type':
             course.css('div.course-footer span.pull-right::text').
             extract_first(default='mf'),
             'students':
             course.xpath('.//span[contains(@class, "pull-left")]/text()[2]'
                          ).re_first('[^\d]*(\d*)[^\d]*')
         })
         yield item
Ejemplo n.º 19
0
 def parse(self, response):
     """Request each repository page, carrying the partial item in meta."""
     for entry in response.css('li.col-12'):
         item = ShiyanlouItem()
         item['name'] = entry.xpath(
             './/div[@class="d-inline-block mb-1"]/h3/a/text()').re_first(
                 r'\s*(.+)')
         item['update_time'] = entry.xpath(
             './/div[@class="f6 text-gray mt-2"]/relative-time/@datetime'
         ).extract_first()
         detail_url = response.urljoin(entry.xpath(
             './/div[@class="d-inline-block mb-1"]/h3/a/@href').extract_first())
         detail = scrapy.Request(detail_url, callback=self.parse_repo)
         detail.meta['item'] = item
         yield detail
Ejemplo n.º 20
0
 def parse(self, response):
     """Walk repository rows li[1]..li[30] by absolute XPath and yield an
     item (name, update_time) for each row that exists.

     NOTE(review): the 30-row cap is hard-coded — presumably matches the
     site's per-page repo count; confirm against the crawled pages.
     """
     i = 0
     while i < 30:
         i += 1
         # Absolute XPath to the i-th <li>; reused as a prefix below because
         # the inner queries also address nodes from the document root.
         path = ('//*[@id="user-repositories-list"]/ul/li[{}]').format(i)
         for text in response.xpath(path):
             item = ShiyanlouItem({
                 # join(...).strip() flattens the matched text fragments
                 # into one whitespace-trimmed name string.
                 'name': ("".join(
                     text.xpath(path +
                                '/div[1]/h3/a/text()').re('(.+)'))).strip(),
                 # The datetime is pulled out of the raw tag markup by regex.
                 'update_time':
                 "".join(
                     text.xpath(path + '/div[3]/relative-time').re(
                         'datetime="(.+)"'))
             })
             yield item
Ejemplo n.º 21
0
 def parse(self, response):
     """Yield a detail-page request per public repo, then paginate."""
     for repo_sel in response.css('li.public'):
         item = ShiyanlouItem()
         item['name'] = repo_sel.xpath(
             './/a[@itemprop="name codeRepository"]/text()').re_first(
                 r'\n\s*(.*)')
         item['update_time'] = repo_sel.xpath(
             './/relative-time/@datetime').extract_first()
         detail = scrapy.Request(
             response.urljoin(repo_sel.xpath('.//a/@href').extract_first()),
             callback=self.parse_content)
         detail.meta['item'] = item
         yield detail
     # A disabled "Next" span appears only on the last page.
     disabled = response.css('div.pagination span.disabled::text')
     if not disabled or disabled[-1].extract() != 'Next':
         next_url = response.css(
             'div.pagination a:last-child::attr(href)').extract_first()
         yield response.follow(next_url, callback=self.parse)
Ejemplo n.º 22
0
    def parse(self, response):
        """Yield an item per owned repo; follow pagination while enabled."""
        for repo_sel in response.xpath('//li[@itemprop="owns"]'):
            item = ShiyanlouItem()
            item['repo_name'] = repo_sel.xpath(
                ".//a[@itemprop='name codeRepository']/text()").extract_first(
                ).strip()
            item['update_time'] = repo_sel.xpath(
                ".//relative-time/@datetime").extract_first()
            yield item

        # A disabled "Next" span marks the final page; otherwise follow it.
        disabled = response.css('div.pagination span.disabled::text')
        if len(disabled) == 0 or disabled[-1].extract() != 'Next':
            next_url = response.css(
                'div.pagination a:last-child::attr(href)').extract_first()
            yield response.follow(next_url, callback=self.parse)
Ejemplo n.º 23
0
 def parse(self, response):
     """Per repository: build a partial item, then request its detail page.

     Bug fixed: the original paired each CSS-matched <li> with a
     document-wide '//h3/a/@href' node list by a running index; when the
     two node sets differ (the CSS selector filters on the 'public' class),
     items were matched with the wrong URL or raised IndexError. The href
     is now read from the matched <li> itself and resolved with urljoin.
     Also removed the stray debug print().
     """
     for course in response.css(
             'li.col-12.d-block.width-full.py-4.border-bottom.public'):
         item = ShiyanlouItem({
             'name':
             course.css('div[class="d-inline-block mb-1"] h3 a::text'
                        ).re_first('\s*(\w+)'),
             'update_time':
             course.css('div[class="f6 text-gray mt-2"] ::attr(datetime)').
             extract_first()
         })
         github_url = response.urljoin(
             course.xpath('.//h3/a/@href').extract_first())
         request = scrapy.Request(github_url, callback=self.parse_more)
         request.meta['item'] = item
         yield request
Ejemplo n.º 24
0
    def parse(self, response):
        """Yield items for this page, then schedule the next-page request.

        Fixes: ``sys.exit()`` inside a spider callback raised SystemExit
        through the engine instead of merely stopping this page's crawl — a
        plain ``return`` ends the generator cleanly. The disabled-span text
        is also queried once instead of twice.
        """
        for repo in response.xpath('//li[@itemprop="owns"]'):
            item = ShiyanlouItem()
            item['repo_name'] = repo.xpath('.//div/h3/a/text()').re_first(
                '\S+')
            item['update_time'] = repo.xpath(
                './/div[3]/relative-time/@datetime').re_first('\S+')
            yield item

        disabled = response.xpath(
            './/div[@class="pagination"]/span[@class="disabled"]/text()'
        ).re_first('\S+')
        if disabled == 'Previous':
            # First page: the only enabled pagination anchor is "Next".
            url = response.xpath(
                './/div[@class="pagination"]/a/@href').extract_first()
        elif disabled == 'Next':
            # Last page: nothing further to crawl.
            return
        else:
            # Middle pages: the second anchor is "Next".
            url = response.xpath(
                './/div[@class="pagination"]/a[2]/@href').extract_first()

        yield Request(url, callback=self.parse)
Ejemplo n.º 25
0
    def parse(self, response):
        """Yield repo_name/update_time for every repository the user owns.

        Changes: removed the large block of commented-out/dead exploratory
        code, and guarded the name extraction so a missing anchor text
        yields an empty string instead of raising AttributeError on
        ``.strip()``.
        """
        repos = response.xpath('//li[@itemprop="owns"]')
        for repo in repos:
            item = ShiyanlouItem()
            # default='' keeps .strip() from crashing on a missing node.
            item['repo_name'] = repo.xpath(
                "./div/div/h3/a/text()").extract_first(default='').strip()
            item['update_time'] = repo.xpath(
                "./div/div/relative-time/@datetime").extract_first()

            yield item
Ejemplo n.º 26
0
 def parse(self, response):
     """One item (name, update_time) per repository row."""
     for row in response.css('li.col-12'):
         item = ShiyanlouItem()
         item['name'] = row.xpath('.//a[@itemprop="name codeRepository"]/text()').re_first("\n\s*(.*)")
         item['update_time'] = row.xpath('.//relative-time/@datetime').extract_first()
         yield item