# Shared imports for the spider callbacks collected below.
import hashlib
import re
from urllib.parse import urljoin

import html2text
import requests
from scrapy import Request, Selector


def parse(self, response):
    # Parses each scraped section into its own Meilisearch document.
    hxs = Selector(response)
    if response.url in self.seen:
        self.log('already seen %s' % response.url)
        return
    self.log('parsing %s' % response.url)
    self.seen.add(response.url)
    # Follow every on-site link that is not a binary asset.
    for url in hxs.xpath('//a/@href').extract():
        url = urljoin(response.url, url)
        if (url not in self.seen
                and not re.search(r'\.(pdf|zip|png|gif|jpeg)$', url)
                and url.startswith(self.start_urls[0])):
            self.log("yielding request " + url)
            yield Request(url, callback=self.parse)
    item = DocsItem()
    item['site'] = self.name
    # Page titles look like "Title - Section"; fall back to a default
    # when the page has no <title> element.
    title = hxs.xpath('//title/text()').get()
    if title is not None:
        parts = title.split(" - ")
        item['title'] = parts[0]
        item['section'] = parts[1] if len(parts) >= 2 else parts[0]
    else:
        item['title'] = "Platform.sh Community"
        item['section'] = "Platform.sh Community"
    item['url'] = response.url
    # Every document in Meilisearch needs a unique documentId.
    item['documentId'] = hashlib.sha1(str(response.url).encode('utf-8')).hexdigest()
    # Strip HTML tags from the crawlable content to get plain text.
    item['text'] = re.sub(r'<.*?>', '', ' '.join(hxs.css('.crawler').extract()))
    item['rank'] = self.rank
    item['subsections'] = item['section']
    yield item
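
# The spiders in this file all fill in a DocsItem. The project's real
# definition lives elsewhere (typically items.py); this is only a minimal
# sketch, assuming scrapy.Item fields matching the names used here:
import scrapy


class DocsItem(scrapy.Item):
    site = scrapy.Field()         # name of the docs site being crawled
    title = scrapy.Field()        # page or endpoint title
    section = scrapy.Field()      # top-level docs section label
    subsections = scrapy.Field()  # currently mirrors 'section'
    url = scrapy.Field()          # canonical URL of the page
    documentId = scrapy.Field()   # sha1 of the URL; unique Meilisearch key
    text = scrapy.Field()         # plain-text body used for search
    rank = scrapy.Field()         # per-spider ranking weight
    # Fields used only by parse_data() below:
    length = scrapy.Field()       # response size in bytes
    description = scrapy.Field()  # first <h1> of the main section
    texts = scrapy.Field()        # full Markdown rendering of the page
    data = scrapy.Field()         # 150-character plain-text preview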
def parse(self, response):
    hxs = Selector(response)
    if response.url in self.seen:
        self.log('already seen %s' % response.url)
        return
    self.log('parsing %s' % response.url)
    self.seen.add(response.url)
    # Follow every on-site link that is not a binary asset.
    for url in hxs.xpath('//a/@href').extract():
        url = urljoin(response.url, url)
        if (url not in self.seen
                and not re.search(r'\.(pdf|zip)$', url)
                and url.startswith(self.start_urls[0])):
            self.log("yielding request " + url)
            yield Request(url, callback=self.parse)
    item = DocsItem()
    item['site'] = self.name
    # Derive the section from the URL path: for a URL like
    # https://example.com/blog/post, split("/")[3] is "blog" and [4] is "post".
    path_parts = response.url.split("/")
    if len(path_parts) >= 5:
        item['section'] = self.get_section(path_parts[3], path_parts[4])
    else:
        item['section'] = self.default_section
    # Demo pages carry their title in an <h1>; everywhere else the
    # <title> element is used, minus the site-name suffix.
    if item['section'] == "Demos":
        item['title'] = hxs.xpath('.//h1/text()').get()
    else:
        item['title'] = hxs.xpath('//title/text()').get().replace('| Platform.sh', '')
    item['url'] = response.url
    item['documentId'] = hashlib.sha1(str(response.url).encode('utf-8')).hexdigest()
    # Index all visible text except the footer.
    item['text'] = " ".join(response.xpath(
        './/div[contains(@class,"container-fluid") and not(contains(@class,"footer"))]//text()').getall())
    item['rank'] = self.rank
    item['subsections'] = item['section']
    yield item
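
# parse() above delegates to a get_section() helper and a default_section
# attribute defined elsewhere on the spider class. A hypothetical sketch of
# what such a helper could look like; the mapping below is invented for
# illustration and is not the project's actual code:
def get_section(self, first_segment, second_segment):
    # first_segment and second_segment are url.split("/")[3] and [4],
    # i.e. the first two path segments of the page URL.
    sections = {'blog': 'Blog', 'marketplace': 'Demos'}
    return sections.get(first_segment,
                        sections.get(second_segment, self.default_section))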
def parse_data(self, response):
    item = DocsItem()
    item['length'] = str(len(requests.get(url=response.url).content))
    item['url'] = response.url
    item['description'] = response.xpath(
        '//section[@class="section"]/h1/text()').extract()[0]
    item['title'] = response.xpath('//title/text()').extract()[0]
    # Convert the main section's HTML to Markdown, then strip the Markdown
    # syntax back out so only plain text remains.
    section_html = response.xpath('//section[@class="section"]').extract()[0]
    item['texts'] = html2text.html2text(section_html)
    item['data'] = item['texts']
    # Drop everything up to and including the "**此内容…**" ("this
    # content…") translation notice, if present.
    if re.search(r'\*\*此内容.*?\*\*', item['data']):
        item['data'] = re.split(r'\*\*此内容.*?\*\*\s', item['data'])[1]
    # Remove the "预计阅读时间: … 分钟" ("estimated reading time: …
    # minutes") line.
    if re.search(r'预计阅读时间: .* 分钟.*', item['data']):
        item['data'] = re.sub(r'.* 分钟.*\s', '', item['data'])
    # Strip leftover Markdown syntax: blockquote markers, headings, bold
    # and italic markers, links, and newlines. re.sub() is a no-op when a
    # pattern does not match, so no existence checks are needed.
    for pattern in (r'>\s', r'>', r'<\s', r'<',
                    r'^# \w+.*', r'\s# \w+.*', r'##\s', r'#',
                    r'\*\*\s', r'\*\*', r'\*\s', r'\*',
                    r'\[.*\]\(.*\)\s', r'\[.*\]\(.*\)', r'\n'):
        item['data'] = re.sub(pattern, '', item['data'])
    # Keep only a short preview of the cleaned text.
    item['data'] = item['data'][:150] + "..."
    return item
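
# A standalone illustration of the html2text-then-regex cleanup that
# parse_data() applies; the sample HTML is made up for demonstration:
sample = '<section><h2>Install</h2><p>Run <b>pip</b> first.</p></section>'
markdown = html2text.html2text(sample)  # '## Install\n\nRun **pip** first.\n\n'
plain = markdown
for pattern in (r'##\s', r'\*\*\s', r'\*\*', r'\n'):
    plain = re.sub(pattern, '', plain)
print(plain)  # prints 'InstallRun pipfirst.'; the '\s' in the marker
              # patterns also eats the space that followed each marker.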
def parse(self, response):
    # Each API endpoint is documented in its own <div> inside the
    # api-content container; index each one as a separate document.
    sections = response.xpath('.//div[contains(@class,"api-content")]/div')
    for section in sections:
        title = section.xpath('.//h2/text()').get()
        url = section.xpath('.//h2/a/@href').get()
        text = section.xpath('.//p/text()').getall()
        # getall() returns a (possibly empty) list, never None, so test
        # the list's truthiness rather than comparing it to None.
        if title is not None and url is not None and text:
            endpoint_section = self.get_section(url)
            if endpoint_section is not None:
                item = DocsItem()
                item['site'] = self.name
                item['title'] = title
                item['url'] = self.start_urls[0] + url
                item['section'] = endpoint_section
                item['documentId'] = hashlib.sha1(
                    str(item['url']).encode('utf-8')).hexdigest()
                item['text'] = " ".join(text)
                item['rank'] = self.rank
                item['subsections'] = endpoint_section
                yield item
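
# The items yielded above ultimately end up in Meilisearch. A minimal
# sketch of an item pipeline that could perform the indexing, assuming a
# local Meilisearch instance and an index named "docs" (both assumptions;
# the project's actual pipeline and settings may differ):
import meilisearch


class MeilisearchPipeline:
    def open_spider(self, spider):
        self.client = meilisearch.Client('http://127.0.0.1:7700', 'masterKey')
        self.index = self.client.index('docs')

    def process_item(self, item, spider):
        # Because documentId is the sha1 of the URL, re-crawling a page
        # updates its existing document instead of creating a duplicate.
        self.index.add_documents([dict(item)], primary_key='documentId')
        return item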