def parse(self, response):
    """Yield one item per ``li.public`` element of the response.

    Each item carries the stripped text of the element's
    ``name codeRepository`` link as ``name`` and the ``datetime``
    attribute of its ``relative-time`` element as ``update_time``.
    """
    for repository in response.css('li.public'):
        # Query the loop variable itself so each item is built from its
        # own <li>; any other name here is undefined in this scope.
        item = ShiyanlougithubItem({
            'name': repository.css(
                'a[itemprop="name codeRepository"]::text'
            ).extract_first().strip(),
            'update_time': repository.css(
                'relative-time::attr(datetime)'
            ).extract_first(),
        })
        yield item
def parse(self, response):
    """Emit a name/update-time item for every ``li.public`` node."""
    for entry in response.css('li.public'):
        name_nodes = entry.xpath(
            './/a[@itemprop="name codeRepository"]/text()')
        time_nodes = entry.xpath('.//relative-time/@datetime')
        fields = {
            # The regex drops the newline and indentation before the name.
            'name': name_nodes.re_first(r'\n\s*(.*)'),
            'update_time': time_nodes.extract_first(),
        }
        yield ShiyanlougithubItem(fields)
def parse(self, response):
    """Print and yield an item for each matching repository ``<li>``.

    ``name`` is the first run of non-whitespace characters in the
    ``h3 a`` text; ``update_time`` is the ``datetime`` attribute of the
    ``relative-time`` element inside ``div.f6.text-gray.mt-2``.
    """
    selector = ('li[class="col-12 d-block width-full py-4 '
                'border-bottom public source"]')
    for course in response.css(selector):
        item = ShiyanlougithubItem({
            # Raw string: ``\S`` is a regex escape, not a Python one.
            "name": course.css("h3 a::text").re_first(
                r'[^\S]*(\S+)[^\S]*'),
            "update_time": course.css(
                "div.f6.text-gray.mt-2 relative-time::attr(datetime)"
            ).extract_first(),
        })
        print(item)
        yield item
def parse(self, response):
    """Request each repository page, passing a partial item in ``meta``.

    For every ``<li>`` whose ``itemprop`` contains ``owns`` an item with
    ``name`` and ``update_time`` is built, and a request for the
    repository link is yielded with ``self.parse_detail`` as callback.
    """
    for github in response.xpath('.//li[contains(@itemprop,"owns")]'):
        item = ShiyanlougithubItem()
        # No trailing comma after this assignment: a trailing comma would
        # store a one-element tuple instead of the name string.
        item['name'] = github.xpath(
            './/a[contains(@itemprop,"name codeRepository")]/text()[1]'
        ).extract_first().strip()
        item['update_time'] = github.xpath(
            './/div[@class="f6 text-gray mt-2"]/relative-time/@datetime'
        ).extract_first()
        github_url = response.urljoin(github.xpath(
            './/div[@class="d-inline-block mb-1"]/h3/a/@href'
        ).extract_first())
        request = scrapy.Request(github_url, callback=self.parse_detail)
        request.meta['item'] = item
        yield request
def parse(self, response):
    """Yield a name/update-time item for each ``li.py-4`` element."""
    for repos in response.css('li.py-4'):
        yield ShiyanlougithubItem({
            # Raw string: ``\s`` is a regex escape, not a Python one.
            'name': repos.xpath(
                './/a[contains(@itemprop,"codeRe")]/text()'
            ).re_first(r'\s*(.+)'),
            'update_time': repos.xpath(
                './/relative-time/@datetime').extract_first(),
        })
def parse(self, response):
    """Produce one item per ``li.col-12`` element.

    ``name`` is the stripped ``h3/a`` text; ``update_time`` is the
    ``datetime`` attribute with its trailing ``Z`` removed by the regex.
    """
    for entry in response.css('li.col-12'):
        raw_name = entry.xpath('.//h3/a/text()').extract_first()
        stamp = entry.xpath('.//relative-time/@datetime').re_first('(.+)Z')
        yield ShiyanlougithubItem({
            'name': raw_name.strip(),
            'update_time': stamp,
        })
def parse(self, response):
    """Produce an item per ``<li>`` under ``div#user-repositories-list ul``."""
    for entry in response.css('div#user-repositories-list ul li'):
        raw_name = entry.xpath('.//div/h3/a/text()').extract_first()
        updated = entry.xpath(
            './/div/relative-time/@datetime').extract_first()
        yield ShiyanlougithubItem({
            'name': raw_name.strip(),
            'update_time': updated,
        })
def parse(self, response):
    """Request each repository page, passing a partial item in ``meta``.

    For every ``li.col-12`` element an item with ``name`` and
    ``update_time`` is built, and a request for the repository link is
    yielded with ``self.parse_details`` as callback.
    """
    for repo in response.css('li.col-12'):
        item = ShiyanlougithubItem()
        # No trailing comma after this assignment (it would store a
        # one-element tuple), and a raw string so ``\s`` stays a regex
        # escape.
        item['name'] = repo.css(
            'li.col-12 div.d-inline-block a::text'
        ).re_first(r"\n\s* (.+)")
        item['update_time'] = repo.css(
            'li.col-12 div.f6 relative-time::attr(datetime)'
        ).extract_first()
        repository_url = response.urljoin(repo.css(
            'li.col-12 div.d-inline-block a::attr(href)'
        ).extract_first())
        request = scrapy.Request(repository_url, callback=self.parse_details)
        request.meta['item'] = item
        yield request
def parse(self, response):
    """Yield an item for each ``div.d-inline-block.mb-1`` block.

    ``name`` is the ``h3/a`` text with leading whitespace removed;
    ``update_time`` is the ``datetime`` attribute of the ``relative-time``
    element found under the block's parent element.
    """
    for course in response.xpath('//div[@class="d-inline-block mb-1"]'):
        yield ShiyanlougithubItem({
            'name': course.xpath('.//h3/a/text()').re_first(r'^\s*(.*)'),
            # Read the attribute directly. A greedy regex over the
            # serialized element would also swallow any attributes that
            # follow ``datetime`` on the same tag.
            'update_time': course.xpath(
                '..//div[@class="f6 text-gray mt-2"]'
                '/relative-time/@datetime'
            ).extract_first(),
        })
def parse(self, response):
    """Request each repository page, passing a partial item in ``meta``.

    For every ``li.public`` element an item with ``name``,
    ``update_time`` and ``description`` is built, and a request for the
    element's first link is yielded with ``self.parse_repo`` as callback.
    """
    for repository in response.css('li.public'):
        item = ShiyanlougithubItem()
        item['name'] = repository.xpath(
            './/a[@itemprop="name codeRepository"]/text()'
        ).re_first(r"\n\s*(.*)")
        item['update_time'] = repository.xpath(
            './/relative-time/@datetime').extract_first()
        # Search inside the current <li>. Querying the whole response
        # would give every item the first description on the page.
        item['description'] = repository.xpath(
            './/p[@itemprop="description"]/text()'
        ).re_first(r'\n\s*(.*)\s')
        repo_url = response.urljoin(
            repository.xpath('.//a/@href').extract_first())
        request = scrapy.Request(repo_url, callback=self.parse_repo)
        # Carry the partially filled item to the callback via meta.
        request.meta['item'] = item
        yield request
def parse(self, response):
    """Yield a name/update-time item for each ``li.col-12`` element."""
    for repos in response.css('li.col-12'):
        item = ShiyanlougithubItem({
            # Capture the whole whitespace-delimited token. ``\w*`` would
            # cut a name such as ``foo-bar`` or ``foo.js`` at the first
            # ``-`` or ``.``.
            "name": repos.xpath(
                './/a[contains(@itemprop, "name codeRepository")]/text()'
            ).re_first(r'\s*(\S+)'),
            "update_time": repos.xpath(
                './/relative-time/@datetime').extract_first(),
        })
        yield item
def parse(self, response):
    """Yield a name/update-time item for each ``li.col-12`` element."""
    for repo in response.css('li.col-12'):
        item = ShiyanlougithubItem({
            # Raw string: ``\s`` is a regex escape, not a Python one.
            'name': repo.css(
                'li.col-12 div.d-inline-block a::text'
            ).re_first(r"\n\s* (.+)"),
            'update_time': repo.css(
                'li.col-12 div.f6 relative-time::attr(datetime)'
            ).extract_first(),
        })
        yield item
def parse(self, response):
    """Request each repository page, passing a partial item in ``meta``.

    For every ``li.col-12`` element an item with ``name`` and
    ``update_time`` (trailing ``Z`` removed) is built, and a request for
    the repository link is yielded with ``self.parse_course`` as callback.
    """
    for rep in response.css('li.col-12'):
        item = ShiyanlougithubItem()
        item['name'] = rep.xpath('.//h3/a/text()').extract_first().strip()
        item['update_time'] = rep.xpath(
            './/relative-time/@datetime').re_first('(.+)Z')
        # Take the href from the same anchor that holds the name. The
        # <li> itself has no href, so reading ``@href`` on it gives None
        # and urljoin then falls back to the current page URL.
        course_url = response.urljoin(
            rep.xpath('.//h3/a/@href').extract_first())
        request = scrapy.Request(url=course_url, callback=self.parse_course)
        request.meta['item'] = item
        yield request
def parse(self, response):
    """Request each repository page, passing a partial item in ``meta``.

    Iterates the ``<li>`` children of the ``<ul>`` whose
    ``data-filterable-type`` contains ``substring``, builds an item with
    ``name`` and ``update_time``, and yields a request for the element's
    first link with ``self.parse_repo`` as callback.
    """
    rows = response.xpath(
        "//ul[contains(@data-filterable-type, 'substring')]/li")
    for repo in rows:
        # Extract each repository's fields with XPath.
        item = ShiyanlougithubItem({
            # Capture the whole whitespace-delimited token. ``[\d\w]*``
            # would cut a name such as ``foo-bar`` at the ``-``.
            "name": repo.xpath(
                ".//div[1]/h3/a/text()").re_first(r'\s*(\S+)'),
            "update_time": repo.xpath(
                ".//div[3]/relative-time/@datetime").extract_first(),
        })
        # Use the loop variable ``repo``; no other selector name is
        # defined in this scope.
        repo_url = response.urljoin(repo.xpath('.//a/@href').extract_first())
        request = scrapy.Request(repo_url, callback=self.parse_repo)
        request.meta['item'] = item
        yield request
def parse(self, response):
    """Request each repository page, passing a partial item in ``meta``.

    For every ``<li>`` under ``div#user-repositories-list ul`` an item
    with ``name`` and ``update_time`` is built, and a request for the
    element's first link is yielded with ``self.parse_detail`` as
    callback. The response and each target URL are printed.
    """
    print(response)
    for gitlou in response.css('div#user-repositories-list ul li'):
        raw_name = gitlou.css('a::text').extract_first()
        item = ShiyanlougithubItem({
            # Strip the whitespace around the anchor text so the stored
            # name is the bare repository name. A missing anchor is
            # passed through unchanged.
            'name': raw_name.strip() if raw_name else raw_name,
            'update_time': gitlou.css(
                'relative-time::attr(datetime)').extract_first(),
        })
        url = response.urljoin(gitlou.css('a::attr(href)').extract_first())
        print('url', url)
        request = scrapy.Request(url, callback=self.parse_detail)
        request.meta['item'] = item
        yield request
def parse(self, response):
    """Request each repository page, passing a partial item in ``meta``.

    For every ``li.public`` element an item with ``name`` and
    ``update_time`` is built, the target URL is printed, and a request
    with ``dont_filter=True`` is yielded with ``self.parse_pro`` as
    callback.
    """
    for course in response.css('li.public'):
        item = ShiyanlougithubItem()
        # Raw string: ``\S`` is a regex escape, not a Python one.
        item["name"] = course.css("h3 a::text").re_first(
            r'[^\S]*(\S+)[^\S]*')
        item["update_time"] = course.css(
            "div.f6.text-gray.mt-2 relative-time::attr(datetime)"
        ).extract_first()
        url = response.urljoin(course.css("a::attr(href)").extract_first())
        print('#'*20+url)
        request = scrapy.Request(url, callback=self.parse_pro,
                                 dont_filter=True)
        request.meta['item'] = item
        yield request
def parse(self, response):
    """Yield a name/update-time item for each repository ``<li>``.

    Iterates the ``<li>`` children of the ``<ul>`` whose
    ``data-filterable-type`` contains ``substring``.
    """
    rows = response.xpath(
        "//ul[contains(@data-filterable-type, 'substring')]/li")
    for repo in rows:
        # Extract each repository's fields with XPath.
        yield ShiyanlougithubItem({
            # Capture the whole whitespace-delimited token. ``[\d\w]*``
            # would cut a name such as ``foo-bar`` at the ``-``.
            "name": repo.xpath(
                ".//div[1]/h3/a/text()").re_first(r'\s*(\S+)'),
            "update_time": repo.xpath(
                ".//div[3]/relative-time/@datetime").extract_first(),
        })
def parse(self, response):
    """Request each repository page, passing a partial item in ``meta``.

    For every ``li.public`` element an item with ``name`` and
    ``update_time`` is built, and a request for the element's first link
    is yielded with ``self.parse_new`` as callback.
    """
    for repository in response.css('li.public'):
        item = ShiyanlougithubItem()
        # Raw string: ``\s`` is a regex escape, not a Python one.
        item['name'] = repository.xpath(
            './/a[@itemprop="name codeRepository"]/text()'
        ).re_first(r"\n\s*(.*)")
        item['update_time'] = repository.xpath(
            './/relative-time/@datetime').extract_first()
        new_url = response.urljoin(
            repository.xpath('.//a/@href').extract_first())
        request = scrapy.Request(new_url, callback=self.parse_new)
        request.meta['item'] = item
        yield request
def parse(self, response):
    """Schedule a follow-up request per ``li.public`` element.

    The partially filled item (``name``, ``update_time``) is attached to
    the request's ``meta`` under ``'item'``; ``self.parse_repo`` handles
    the response.
    """
    for entry in response.css("li.public"):
        name_text = entry.xpath(
            ".//a[@itemprop='name codeRepository']/text()")
        updated = entry.xpath(".//relative-time//@datetime")
        partial = ShiyanlougithubItem({
            "name": name_text.re_first("\n +(.+)"),
            "update_time": updated.extract_first(),
        })
        href = entry.xpath(".//a/@href").extract_first()
        follow_up = scrapy.Request(
            response.urljoin(href), callback=self.parse_repo)
        follow_up.meta['item'] = partial
        yield follow_up
def parse(self, response):
    """Request each repository page, passing a partial item in ``meta``.

    For every ``div.d-inline-block.mb-1`` block an item with ``name`` and
    ``update_time`` is built, and a request for the ``h3/a`` link is
    yielded with ``self.parse_cbr`` as callback.
    """
    for course in response.xpath('//div[@class="d-inline-block mb-1"]'):
        item = ShiyanlougithubItem()
        item['name'] = course.xpath('.//h3/a/text()').re_first(r'^\s*(.*)')
        # Read the attribute directly. A greedy regex over the serialized
        # element would also swallow any attributes that follow
        # ``datetime`` on the same tag.
        item['update_time'] = course.xpath(
            '..//div[@class="f6 text-gray mt-2"]/relative-time/@datetime'
        ).extract_first()
        reo_url = response.urljoin(
            course.xpath('.//h3/a/@href').extract_first())
        request = scrapy.Request(reo_url, callback=self.parse_cbr)
        request.meta['item'] = item
        yield request
def parse(self, response):
    """Request each repository page, passing a partial item in ``meta``.

    For every ``li.col-12`` element an item with ``name`` and
    ``update_time`` is built, and a request for the
    ``name codeRepository`` link is yielded with ``self.parse_repos`` as
    callback.
    """
    name_link = './/a[contains(@itemprop, "name codeRepository")]'
    for repos in response.css('li.col-12'):
        item = ShiyanlougithubItem()
        # Capture the whole whitespace-delimited token. ``\w*`` would cut
        # a name such as ``foo-bar`` or ``foo.js`` at the first ``-`` or
        # ``.``.
        item['name'] = repos.xpath(
            name_link + '/text()').re_first(r'\s*(\S+)')
        item['update_time'] = repos.xpath(
            './/relative-time/@datetime').extract_first()
        repos_url = response.urljoin(
            repos.xpath(name_link + '/@href').extract_first())
        request = scrapy.Request(url=repos_url, callback=self.parse_repos)
        request.meta['item'] = item
        yield request
def parse(self, response):
    """Request each repository page, passing a partial item in ``meta``.

    For every ``<li>`` directly under ``div#user-repositories-list > ul``
    an item with ``name`` and ``update_time`` is built, and a request for
    the element's first link is yielded with ``self.parse_repo`` as
    callback.
    """
    for repository in response.css('div#user-repositories-list > ul > li'):
        item = ShiyanlougithubItem()
        # Selectors are written as single clean literals. A backslash
        # continuation inside a string literal leaks the backslash or the
        # next line's indentation into the selector text.
        item['name'] = repository.css(
            'div.d-inline-block.mb-1 h3 a::text'
        ).re_first(r'[^\S]*(.+)[^\S]*')
        item['update_time'] = repository.css(
            'div.f6.text-gray.mt-2 relative-time::attr(datetime)'
        ).extract_first()
        repository_url = response.urljoin(
            repository.xpath('.//a/@href').extract_first())
        request = scrapy.Request(repository_url, callback=self.parse_repo)
        request.meta['item'] = item
        yield request
def parse(self, response):
    """Yield items for the current page, then follow the ``Next`` link.

    Every ``<li>`` whose class contains ``width-full`` yields an item
    with ``name`` and ``update_time``. Afterwards each link in
    ``div.pagination`` whose text is exactly ``Next`` yields a request
    handled by this same method.
    """
    for item in response.xpath('//li[contains(@class, "width-full")]'):
        yield ShiyanlougithubItem({
            'name': item.xpath(
                './/div[contains(@class, "mb-1")]/h3/a/text()'
            ).re_first(r'\s+(\S+)'),
            'update_time': item.xpath(
                './/div[contains(@class, "f6")]/relative-time/@datetime'
            ).extract_first(),
        })
    nextpg = response.xpath('//div[@class="pagination"]/a')
    for page in nextpg:
        if page.xpath('.//text()').extract_first() == 'Next':
            href = page.xpath('.//@href').extract_first()
            # Resolve against the page URL: scrapy.Request rejects
            # relative URLs, and absolute ones pass through unchanged.
            yield scrapy.Request(url=response.urljoin(href),
                                 callback=self.parse)
def parse(self, response):
    """Emit an item per ``<li>`` under ``div#user-repositories-list``.

    ``name`` is whatever follows the leading spaces in the ``h3 a`` text;
    ``update_time`` is the ``datetime`` attribute of ``relative-time``.
    """
    for entry in response.css('div#user-repositories-list li'):
        yield ShiyanlougithubItem(
            name=entry.css('h3 a::text').re_first(' +(.*)'),
            update_time=entry.css(
                'relative-time::attr(datetime)').extract_first(),
        )
def parse(self, response):
    """Emit a name/update-time item for every ``li.col-12`` element."""
    for entry in response.css('li.col-12'):
        raw_name = entry.css('div.d-inline-block a::text').extract_first()
        updated = entry.css(
            'div.f6 relative-time::attr(datetime)').extract_first()
        fields = {
            'name': raw_name.strip(),
            'update_time': updated,
        }
        yield ShiyanlougithubItem(fields)