def parse_tags(self, response):
    """Fill in the ``keywords`` field of the item carried by the response.

    The partially populated item is taken from ``response.meta['item']`` and
    wrapped in an ``MbedLibLoader`` bound to this response.

    :param response: response whose ``meta`` holds the item under ``'item'``.
    :returns: the item produced by the loader's ``load_item()``.
    """
    loader = MbedLibLoader(item=response.meta['item'], response=response)
    # TODO: formulate xpath to extract all tags (keywords)
    loader.add_xpath('keywords', '/html/body/div[4]/div[1]/div/a[2]/@href')
    return loader.load_item()
def parse_dependencies(self, response):
    """Record dependency links and schedule the follow-up requests.

    The item passed in ``response.meta['item']`` gets its ``dependencies``
    field populated from this response. For every dependency URL that is not
    yet in ``self.seen_urls`` a request handled by ``self.parse`` is yielded.
    Finally a request for ``<libpage>dependents`` is yielded, handled by
    ``self.parse_examples`` and carrying the item and ``libpage`` in its meta.

    :param response: response whose ``meta`` holds ``'item'`` and ``'libpage'``.
    :returns: generator of ``scrapy.Request`` objects.
    """
    loader = MbedLibLoader(item=response.meta['item'], response=response)
    loader.add_xpath(
        'dependencies',
        './/*[@id="mbed-content"]//div/div[2]/div[2]/div[1]/b/a/@href')
    item = loader.load_item()

    # TODO: generate requests for all dependents; ideally emit them before
    # proceeding with examples
    if 'dependencies' in item:
        for url in item['dependencies']:
            if not url:
                # An empty href used to raise IndexError on url[0] and abort
                # the whole callback; there is nothing to request, so skip it.
                continue
            if url.startswith('/'):
                # Site-relative link: prefix it with the site root.
                url = 'http://developer.mbed.org' + url
            if url not in self.seen_urls:
                yield scrapy.Request(url, callback=self.parse)

    libpage = response.meta['libpage']
    request = scrapy.Request(libpage + "dependents",
                             callback=self.parse_examples)
    request.meta['libpage'] = libpage
    request.meta['item'] = item
    yield request
def parse_project(self, response):
    """Start building an ``MbedLibItem`` from a project page.

    The response URL is appended to ``self.seen_urls``, the item fields are
    collected through an ``MbedLibLoader``, and the result is handed on to
    ``self.parse_dependencies`` via a request for ``<response.url>dependencies``.

    :param response: the project page response.
    :returns: a ``scrapy.Request`` whose meta carries ``'libpage'`` (the
        response URL) and ``'item'`` (the loaded item).
    """
    self.seen_urls.append(response.url)

    loader = MbedLibLoader(item=MbedLibItem(), response=response)

    # Fields read straight from the page, registered in this order.
    page_fields = (
        ('repo_type',
         '/html/body/div[4]/div[2]/div[2]/table/tr[1]/td/text()[2]'),
        ('owner', '/html/body/div[4]/div[1]/div/a[1]/text()[2]'),
        ('ownerurl', '/html/body/div[4]/div[1]/div/a[1]/@href'),
        ('name', '/html/body/div[4]/div[1]/div/a[2]/text()[2]'),
        ('repository', '/html/body/div[4]/div[1]/div/a[2]/@href'),
        ('description', './/*[@id="mbed-content"]/p[1]/text()'),
    )
    for field_name, xpath in page_fields:
        loader.add_xpath(field_name, xpath)

    loader.add_value('frameworks', 'mbed')
    loader.add_value('platforms', mbed_platforms())
    loader.add_xpath('components', '/html/body/div[4]/div[2]/div[3]//a/@href')

    item = loader.load_item()
    return scrapy.Request(
        response.url + "dependencies",
        callback=self.parse_dependencies,
        meta={'libpage': response.url, 'item': item},
    )
def parse_project(self, response):
    """Scrape a project page and chain to its dependencies page.

    Appends ``response.url`` to ``self.seen_urls``, loads a fresh
    ``MbedLibItem`` through ``MbedLibLoader`` and returns a request for
    ``<response.url>dependencies`` to be handled by ``self.parse_dependencies``.

    :param response: the project page response.
    :returns: a ``scrapy.Request`` with ``'libpage'`` and ``'item'`` in meta.
    """
    page_url = response.url
    self.seen_urls.append(page_url)

    lib_loader = MbedLibLoader(item=MbedLibItem(), response=response)

    # Header area: repository type, owner and repository links.
    lib_loader.add_xpath(
        'repo_type',
        '/html/body/div[4]/div[2]/div[2]/table/tr[1]/td/text()[2]')
    lib_loader.add_xpath(
        'owner', '/html/body/div[4]/div[1]/div/a[1]/text()[2]')
    lib_loader.add_xpath(
        'ownerurl', '/html/body/div[4]/div[1]/div/a[1]/@href')
    lib_loader.add_xpath(
        'name', '/html/body/div[4]/div[1]/div/a[2]/text()[2]')
    lib_loader.add_xpath(
        'repository', '/html/body/div[4]/div[1]/div/a[2]/@href')

    # First paragraph of the main content block.
    lib_loader.add_xpath(
        'description', './/*[@id="mbed-content"]/p[1]/text()')

    # Values that do not come from the page.
    lib_loader.add_value('frameworks', 'mbed')
    lib_loader.add_value('platforms', mbed_platforms())

    lib_loader.add_xpath(
        'components', '/html/body/div[4]/div[2]/div[3]//a/@href')

    loaded_item = lib_loader.load_item()
    follow_up_meta = {'libpage': page_url, 'item': loaded_item}
    return scrapy.Request(page_url + "dependencies",
                          callback=self.parse_dependencies,
                          meta=follow_up_meta)
def parse_examples(self, response):
    """Fill in the ``examples`` field of the item carried by the response.

    The item is read from ``response.meta['item']`` and wrapped in an
    ``MbedLibLoader`` bound to this response.

    :param response: response whose ``meta`` holds the item under ``'item'``.
    :returns: the item produced by the loader's ``load_item()``.
    """
    loader = MbedLibLoader(item=response.meta['item'], response=response)
    loader.add_xpath(
        'examples',
        './/*[@id="mbed-content"]//div/div[2]/div[2]/div[1]/b/a/@href')
    return loader.load_item()
def parse(self, response):
    """Start building an ``MbedLibItem`` from a library page.

    The response URL is appended to ``self.seen_urls``, the item fields are
    collected through an ``MbedLibLoader``, and a request for
    ``<response.url>dependencies`` is returned so that
    ``self.parse_dependencies`` can continue filling in the item.

    :param response: the library page response.
    :returns: a ``scrapy.Request`` whose meta carries ``'libpage'`` (the
        response URL) and ``'item'`` (the loaded item).
    """
    self.seen_urls.append(response.url)

    loader = MbedLibLoader(item=MbedLibItem(), response=response)

    # Fields read straight from the page, registered in this order.
    page_fields = (
        ('repo_type',
         '/html/body/div[4]/div[2]/div[2]/table/tr[1]/td/text()[2]'),
        ('owner', '/html/body/div[4]/div[1]/div/a[1]/text()[2]'),
        ('name', '/html/body/div[4]/div[1]/div/a[2]/text()[2]'),
        ('repository', '/html/body/div[4]/div[1]/div/a[2]/@href'),
        # may need some cleaning up \n
        ('description', './/*[@id="mbed-content"]/p[1]/text()'),
    )
    for field_name, xpath in page_fields:
        loader.add_xpath(field_name, xpath)

    loader.add_value('frameworks', 'mbed')
    loader.add_value(
        'platforms',
        ['freescalekinetis', 'nordicnrf51', 'nxplpc', 'ststm32', 'teensy'])
    loader.add_xpath('components', '/html/body/div[4]/div[2]/div[3]//a/@href')

    item = loader.load_item()
    return scrapy.Request(
        response.url + "dependencies",
        callback=self.parse_dependencies,
        meta={'libpage': response.url, 'item': item},
    )