Example #1
	def getLastLikeOrDefault(self, likes: List[str], default: str) -> str:
		import scrape
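		# return the most recent cached url matching any of the given
		# LIKE patterns, falling back to the provided default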
		for like in likes:
			u = scrape.getLastUrlLike(like)
			if u is not None:
				return u
		return default
	def softScrape(self, chapter: FicChapter) -> str:
		fic = chapter.getFic()

		curl = self.constructUrl(fic.localId, chapter.chapterId, None)
		#util.logMessage(f'FictionPressAdapter.scrape: {curl}')
		url = scrape.getLastUrlLike(curl)
		delay: float = 5
		if url is None:
			url = curl

		raw = skitter.softScrape(url)['raw']
		if raw is None:
			raise Exception('unable to scrape? FIXME')
		# str() would turn a missing payload into the literal 'None',
		# so check the payload before converting
		data = str(raw)

		if (
			'chapter not found.' in data.lower()
			and "id='storytext'" not in data.lower()
		):
			ts = scrape.getMostRecentScrapeTime(url)
			if ts is None:
				raise Exception('no most recent scrape time? FIXME')
			# if we last scraped more than half an hour ago rescrape
			if int(time.time()) - ts > (60 * 30):
				url = self.constructUrl(fic.localId, chapter.chapterId, None)
				data = self.scrape(url)['raw']
		if data is None:
			raise Exception('unable to scrape? FIXME')

		if (
			'chapter not found.' in data.lower()
			and "id='storytext'" not in data.lower()
		):
			raise Exception('unable to find chapter content {}'.format(url))

		return data
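All of these examples lean on the same cache-first lookup: scrape.getLastUrlLike returns the most recently scraped URL matching a SQL LIKE pattern (a plain URL simply matches itself), or None on a cache miss. The scrape module itself is not shown here; a minimal sketch of that lookup, assuming the cache sits in a SQLite table (the table and column names below are made up), could look like this:

import sqlite3
from typing import Optional

def getLastUrlLike(like: str, dbPath: str = 'scrape.db') -> Optional[str]:
	# hypothetical schema: requests(url text, created integer, ...)
	conn = sqlite3.connect(dbPath)
	try:
		cur = conn.execute(
			'select url from requests where url like ?'
			' order by created desc limit 1', (like,))
		row = cur.fetchone()
		return None if row is None else str(row[0])
	finally:
		conn.close()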
Example #3
	def softScrapeUrl(self, origUrl: str) -> Optional[str]:
		url = origUrl
		lurl = scrape.getLastUrlLike(url)
		if lurl is not None:
			url = lurl

		data = skitter.softScrape(url, fallback=True)
		if 'raw' in data:
			return str(data['raw'])
		return None
Example #4
	def softScrape(self, url: str) -> ScrapeMeta:
		url = canonizeUrl(url)
		# check if we already have it in our db, return it if we do
		tmpUrl = getLastUrlLike(url)
		if tmpUrl is not None:
			res = getMostRecentScrapeWithMeta(url)
			assert res is not None
			return res

		# otherwise call upstream .softCrawl
		apiUrl = urllib.parse.urljoin(self.baseUrl, 'v0/softCrawl')
		res = self._makeRequest(apiUrl, {'q': url})
		if res is None:
			raise Exception(f'SkitterClient.softScrape: failed to softCrawl: {url}')
		saveWebRequest(res['fetched'], res['url'], res['status'], res['raw'])
		return res
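For context, a client like this is normally driven fire-and-forget: the first call for a URL goes upstream via v0/softCrawl and is saved locally, and later calls for the same URL return the saved row. The constructor arguments below are hypothetical, since the snippet only shows the one method:

client = SkitterClient(baseUrl='https://skitter.example/')  # hypothetical ctor
meta = client.softScrape('https://www.fictionpress.com/s/123/1/')
print(meta['status'], meta['fetched'])  # ScrapeMeta carries status and fetch time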
Example #5
	def staleScrape(self, url: str) -> Optional[ScrapeMeta]:
		url = canonizeUrl(url)
		# check if we already have it in our db, return it if we do
		tmpUrl = getLastUrlLike(url)
		if tmpUrl is not None:
			res = getMostRecentScrapeWithMeta(url)
			assert res is not None
			return res

		# check if it's in .cache
		res = self.cache(url)
		if res is not None:
			saveWebRequest(res['fetched'], res['url'], res['status'], res['raw'])
			return res

		return None
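staleScrape differs from the softScrape above only in its miss behavior: it consults the local db and the .cache endpoint, then returns None instead of going upstream. A hypothetical wrapper chaining the two could look like:

def scrapeOrFallback(client: SkitterClient, url: str) -> ScrapeMeta:
	# prefer anything we already have locally or in .cache...
	meta = client.staleScrape(url)
	if meta is not None:
		return meta
	# ...and only go upstream (v0/softCrawl) on a full miss
	return client.softScrape(url)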
Example #6
	def softScrape(self, chapter: FicChapter) -> str:
		if chapter.url is None:
			chapter.url = self.buildUrl(chapter)  # type: ignore
			chapter.localChapterId = str(chapter.chapterId)
			chapter.upsert()
		fic = chapter.getFic()

		# TODO should we be passing '%' instead of chapter.fic.title ?
		#url = scrape.getLastUrlLikeOrDefault(
		#		(self.constructUrl(fic.localId, chapter.chapterId, None),
		#		self.constructUrl(fic.localId, chapter.chapterId, fic.title)))
		curl = self.constructUrl(fic.localId, chapter.chapterId, None)
		#util.logMessage(f'FFNAdapter.scrape: {curl}')
		url = scrape.getLastUrlLike(curl)
		if url is None:
			url = curl

		raw = skitter.softScrape(url)['raw']
		if raw is None:
			raise Exception('unable to scrape? FIXME')
		# str() would turn a missing payload into the literal 'None',
		# so check the payload before converting
		data = str(raw)

		if (
			'chapter not found.' in data.lower()
			and "id='storytext'" not in data.lower()
		):
			ts = scrape.getMostRecentScrapeTime(url)
			if ts is None:
				raise Exception('no most recent scrape time? FIXME')
			# if we last scraped more than half an hour ago rescrape
			if int(time.time()) - ts > (60 * 30):
				url = self.constructUrl(fic.localId, chapter.chapterId, None)
				data = self.scrape(url)['raw']
		if data is None:
			raise Exception('unable to scrape? FIXME')

		if (
			'chapter not found.' in data.lower()
			and "id='storytext'" not in data.lower()
		):
			raise Exception('unable to find chapter content {}'.format(url))

		return data
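Examples #1 and #6 run the same 'chapter not found.' probe twice, once before the half-hour rescrape and once after. If you maintain code like this, the probe factors out cleanly; a sketch (the helper name is made up):

def chapterLooksMissing(data: str) -> bool:
	# ffn/fictionpress error pages contain 'Chapter not found.' and
	# lack the id='storytext' content container
	lowered = data.lower()
	return 'chapter not found.' in lowered and "id='storytext'" not in lowered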