def scrapeLike(self, url: str, delay: Optional[int] = None) -> str:
    url = scrape.canonizeUrl(url)
    if delay is None:
        delay = self.defaultDelay
    prefix = self.baseUrl + 'threads/'
    if not url.startswith(prefix):
        data = scrape.softScrape(url, delay, mustyThreshold=self.mustyThreshold)
        if data is None:
            raise Exception('unable to soft scrape? FIXME')
        return data

    ulike = url[len(prefix):]
    parts = ulike.split('/')
    parts[0] = parts[0].split('.')[-1]
    canon = prefix + '/'.join(parts)
    parts[0] = '%.' + parts[0]
    ulike = prefix + '/'.join(parts)

    # FIXME canon may find an older url than ulike :/
    canonRes = scrape.getMostRecentScrapeWithMeta(canon)
    if (canonRes is not None
            and int(time.time()) - self.mustyThreshold < canonRes['fetched']):
        return cast(str, canonRes['raw'])

    data = scrape.softScrape(url, delay, ulike, mustyThreshold=self.mustyThreshold)
    if data is None:
        raise Exception('unable to soft scrape? FIXME')
    return data
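# Worked example of the thread-id rewriting above (the URL is hypothetical; only
# the string manipulation is taken from the code):
#   url    = prefix + 'some-story.12345/'
#   parts  = ['some-story.12345', '']     parts[0].split('.')[-1] -> '12345'
#   canon  = prefix + '12345/'            exact canonical form, used for the freshness check
#   ulike  = prefix + '%.12345/'          SQL LIKE pattern matching any slug variant
# A recent enough scrape of canon is returned directly; otherwise softScrape is
# passed ulike so any previously stored slug variant of the thread can satisfy it.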
def canonizeUrl(self, url: str) -> str:
    url = scrape.canonizeUrl(url)
    prefixMap = [
        ('http://', 'https://'),
        ('https://www.', 'https://'),
    ]
    for pm in prefixMap:
        if url.startswith(pm[0]):
            url = pm[1] + url[len(pm[0]):]
    return url
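# Example of the prefix rewriting above (URL is illustrative). Both mappings can
# apply in sequence, because each iteration tests the already-rewritten URL:
#   'http://www.example.com/threads/1'
#     -> 'https://www.example.com/threads/1'   (http:// -> https://)
#     -> 'https://example.com/threads/1'       (https://www. -> https://)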
def scrape(self, url: str) -> ScrapeMeta:
    url = canonizeUrl(url)
    # TODO staleOnly?
    if self.staleOnly:
        util.logMessage('staleScrape|{}'.format(url), 'scrape.log')
        #r = getMostRecentScrapeWithMeta(url, beforeId = _staleBefore)
        #if r is None or 'raw' not in r:
        #    raise Exception('failed to stale scrape url: {}'.format(url))
        #return { 'url': url, 'fetched': ts, 'raw': r['raw'] }
    res = self.crawl(url)
    saveWebRequest(res['fetched'], res['url'], res['status'], res['raw'])
    return res
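# ScrapeMeta is consumed here as a mapping with at least these keys (inferred from
# how it is used in these methods, not from its definition):
#   res['url']      canonical URL that was fetched
#   res['fetched']  fetch timestamp
#   res['status']   HTTP status code
#   res['raw']      raw response body
# which is exactly the tuple persisted by saveWebRequest(...) above.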
def softScrape(self, url: str) -> ScrapeMeta:
    url = canonizeUrl(url)
    # check if we already have it in our db, return it if we do
    tmpUrl = getLastUrlLike(url)
    if tmpUrl is not None:
        res = getMostRecentScrapeWithMeta(url)
        assert (res is not None)
        return res
    # otherwise call upstream .softCrawl
    apiUrl = urllib.parse.urljoin(self.baseUrl, 'v0/softCrawl')
    res = self._makeRequest(apiUrl, {'q': url})
    if res is None:
        raise Exception(f'SkitterClient.softScrape: failed to softCrawl: {url}')
    saveWebRequest(res['fetched'], res['url'], res['status'], res['raw'])
    return res
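# Cache-first flow of softScrape (sketch; getLastUrlLike is assumed to look up
# previously saved requests by URL pattern):
#   1. URL already in the local db  -> return the most recent stored scrape,
#      with no upstream request at all
#   2. otherwise                    -> hit {baseUrl}/v0/softCrawl with {'q': url},
#      persist the result via saveWebRequest, and return it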
def staleScrape(self, url: str) -> Optional[ScrapeMeta]:
    url = canonizeUrl(url)
    # check if we already have it in our db, return it if we do
    tmpUrl = getLastUrlLike(url)
    if tmpUrl is not None:
        res = getMostRecentScrapeWithMeta(url)
        assert (res is not None)
        return res
    # check if it's in .cache
    res = self.cache(url)
    if res is not None:
        saveWebRequest(res['fetched'], res['url'], res['status'], res['raw'])
        return res
    return None
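# Unlike softScrape above, staleScrape never triggers a fresh crawl: it only
# consults the local db and the upstream .cache endpoint, and returns None when
# neither has the URL.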
#!/usr/bin/env python
import scrape
import sys
import time
import urllib.parse
from typing import Optional
from bs4 import BeautifulSoup  # type: ignore

archive = sys.argv[1]
url = 'http://{}.adult-fanfiction.org/search.php'.format(archive)
url += '?auth=&title=&summary=&tags=%2BCOMPLETE+-MM&cats=0&search=Search'
url += '&page={}'
url = scrape.canonizeUrl(url)

def fetch(url: str, pageNo: int, delay: int, force: bool = False) -> Optional[str]:
    url = url.format(pageNo)
    print(url)
    mostRecent = scrape.getMostRecentScrape(url)
    if mostRecent is not None and not force:
        print('url has already been scraped: {}'.format(url))
        return None
    res = scrape.scrape(url)
    print(res['fetched'])
    print(len(res['raw']))
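# Invocation sketch (script name, archive value, page range, and delay are all
# illustrative): the first CLI argument selects the archive subdomain, then fetch
# walks the paginated search results, e.g.:
#   python this_script.py anime
#   for pageNo in range(1, 10):
#       fetch(url, pageNo, delay=30)
#       time.sleep(30)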
            return r
    raise Exception(f'skitter.softScrape: unable to softScrape: {url}')


if __name__ == '__main__':
    import sys
    from skitter_client import SkitterClient
    from scrape import canonizeUrl, saveWebRequest

    skitter_primary: SkitterClient = priv.skitterClients[0]
    skitter_secondary: SkitterClient = priv.skitterClients[-1]

    if sys.argv[1] == 'recache':
        for line in sys.stdin.readlines():
            line = line.strip()
            url = canonizeUrl(line)
            print(url)
            # we want the newest version of non-1 chapters, otherwise the oldest
            # (so we skip now-deleted info requests for chap 1)
            res = skitter_secondary.cache(url, rev=url.endswith('/1'))
            if res is not None:
                saveWebRequest(res['fetched'], res['url'], res['status'], res['raw'])
            else:
                print(' FAILED')
    elif sys.argv[1] == 'rescrape':
        print('rescrape')
        for line in sys.stdin.readlines():
            line = line.strip()
            url = canonizeUrl(line)
            print(url)
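# Invocation sketch (the file name is an assumption; input is one URL per line):
#   python skitter.py recache  < urls.txt
#   python skitter.py rescrape < urls.txt
# recache re-fetches each canonized URL from the secondary SkitterClient's
# upstream cache and persists the result locally via saveWebRequest.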