Example #1
    def scrapeLike(self, url: str, delay: int = None) -> str:
        url = scrape.canonizeUrl(url)
        if delay is None:
            delay = self.defaultDelay
        prefix = self.baseUrl + 'threads/'
        if not url.startswith(prefix):
            data = scrape.softScrape(url,
                                     delay,
                                     mustyThreshold=self.mustyThreshold)
            if data is None:
                raise Exception('unable to soft scrape? FIXME')
            return data

        # reduce the first path segment ('<slug>.<id>') to the bare thread id,
        # then build 'canon' (id only) and 'ulike' (a '%' wildcard pattern that
        # matches any slug for that id)
        ulike = url[len(prefix):]
        parts = ulike.split('/')
        parts[0] = parts[0].split('.')[-1]
        canon = prefix + '/'.join(parts)
        parts[0] = '%.' + parts[0]
        ulike = prefix + '/'.join(parts)

        # FIXME canon may find an older url than ulike :/

        canonRes = scrape.getMostRecentScrapeWithMeta(canon)
        if (canonRes is not None and
                int(time.time()) - self.mustyThreshold < canonRes['fetched']):
            return cast(str, canonRes['raw'])

        data = scrape.softScrape(url,
                                 delay,
                                 ulike,
                                 mustyThreshold=self.mustyThreshold)
        if data is None:
            raise Exception('unable to soft scrape? FIXME')
        return data
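
The slug/id juggling above is easier to follow in isolation. Below is a minimal standalone trace; the prefix and URL are made-up examples, assuming XenForo-style '.../threads/<slug>.<id>/...' thread URLs, and 'ulike' ends up as what looks like a wildcard pattern for LIKE-style lookups such as getLastUrlLike.

prefix = 'https://example.com/threads/'
url = prefix + 'some-story.12345/page-2'

parts = url[len(prefix):].split('/')
parts[0] = parts[0].split('.')[-1]   # keep only the numeric thread id
canon = prefix + '/'.join(parts)     # .../threads/12345/page-2
parts[0] = '%.' + parts[0]           # wildcard in place of the title slug
ulike = prefix + '/'.join(parts)     # .../threads/%.12345/page-2

print(canon)
print(ulike)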
Example #2
 def canonizeUrl(self, url: str) -> str:
     url = scrape.canonizeUrl(url)
     prefixMap = [
         ('http://', 'https://'),
         ('https://www.', 'https://'),
     ]
     for pm in prefixMap:
         if url.startswith(pm[0]):
             url = pm[1] + url[len(pm[0]):]
     return url
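
For reference, a minimal standalone sketch of the same normalization (hypothetical URLs; the scrape.canonizeUrl call is left out): the two passes rewrite http:// to https:// and then drop a leading www.

def _canonize(url: str) -> str:
    # same prefix map as above, applied in order
    for old, new in [('http://', 'https://'), ('https://www.', 'https://')]:
        if url.startswith(old):
            url = new + url[len(old):]
    return url

assert _canonize('http://www.example.com/a') == 'https://example.com/a'
assert _canonize('https://forum.example.com/a') == 'https://forum.example.com/a'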
Example #3
    def scrape(self, url: str) -> ScrapeMeta:
        url = canonizeUrl(url)
        # TODO staleOnly?
        if self.staleOnly:
            util.logMessage('staleScrape|{}'.format(url), 'scrape.log')

            #r = getMostRecentScrapeWithMeta(url, beforeId = _staleBefore)
            #if r is None or 'raw' not in r:
            #	raise Exception('failed to stale scrape url: {}'.format(url))
            #return { 'url': url, 'fetched': ts, 'raw': r['raw'] }

        res = self.crawl(url)
        saveWebRequest(res['fetched'], res['url'], res['status'], res['raw'])
        return res
Example #4
	def softScrape(self, url: str) -> ScrapeMeta:
		url = canonizeUrl(url)
		# check if we already have it in our db, return it if we do
		tmpUrl = getLastUrlLike(url)
		if tmpUrl is not None:
			res = getMostRecentScrapeWithMeta(url)
			assert (res is not None)
			return res

		# otherwise call upstream .softCrawl
		apiUrl = urllib.parse.urljoin(self.baseUrl, 'v0/softCrawl')
		res = self._makeRequest(apiUrl, {'q': url})
		if res is None:
			raise Exception(f'SkitterClient.crawl: failed to crawl: {url}')
		saveWebRequest(res['fetched'], res['url'], res['status'], res['raw'])
		return res
Example #5
    def staleScrape(self, url: str) -> Optional[ScrapeMeta]:
        url = canonizeUrl(url)
        # check if we already have it in our db, return it if we do
        tmpUrl = getLastUrlLike(url)
        if tmpUrl is not None:
            res = getMostRecentScrapeWithMeta(url)
            assert (res is not None)
            return res

        # check if it's in .cache
        res = self.cache(url)
        if res is not None:
            saveWebRequest(res['fetched'], res['url'], res['status'],
                           res['raw'])
            return res

        return None
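
One possible way to layer this with softScrape from the previous example (a hypothetical helper, not part of the original class): prefer whatever the db or cache already has, and only hit the upstream service when both come up empty.

def fetchPreferStale(client, url):
    # 'client' is assumed to be a SkitterClient exposing both methods
    res = client.staleScrape(url)
    if res is not None:
        return res
    return client.softScrape(url)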
Example #6
#!/usr/bin/env python
import scrape
import sys
import time
import urllib.parse
from typing import Optional
from bs4 import BeautifulSoup  # type: ignore

archive = sys.argv[1]
url = 'http://{}.adult-fanfiction.org/search.php'.format(archive)
url += '?auth=&title=&summary=&tags=%2BCOMPLETE+-MM&cats=0&search=Search'
url += '&page={}'
url = scrape.canonizeUrl(url)


def fetch(url: str,
          pageNo: int,
          delay: int,
          force: bool = False) -> Optional[str]:
    url = url.format(pageNo)
    print(url)

    mostRecent = scrape.getMostRecentScrape(url)
    if mostRecent is not None and not force:
        print('url has already been scraped: {}'.format(url))
        return None

    res = scrape.scrape(url)
    print(res['fetched'])
    print(len(res['raw']))
    # hand the fetched page back to the caller, matching the Optional[str] signature
    return res['raw']
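
The listing cuts the script off here; a driver along the following lines (the page count and delay are assumptions, not from the original) could walk the search pages, relying on fetch to skip pages that were already scraped unless force=True.

if __name__ == '__main__':
    for pageNo in range(1, 11):
        fetch(url, pageNo, delay=30)
        time.sleep(30)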
Example #7
            return r
    raise Exception(f'skitter.softScrape: unable to softScrape: {url}')


if __name__ == '__main__':
    import sys
    import priv
    from skitter_client import SkitterClient
    from scrape import canonizeUrl, saveWebRequest

    skitter_primary: SkitterClient = priv.skitterClients[0]
    skitter_secondary: SkitterClient = priv.skitterClients[-1]

    if sys.argv[1] == 'recache':
        for line in sys.stdin.readlines():
            line = line.strip()
            url = canonizeUrl(line)
            print(url)
            # we want the newest version of non-1 chapters, otherwise the oldest
            # (so we skip now-deleted info requests for chap 1)
            res = skitter_secondary.cache(url, rev=url.endswith('/1'))
            if res is not None:
                saveWebRequest(res['fetched'], res['url'], res['status'],
                               res['raw'])
            else:
                print('  FAILED')
    elif sys.argv[1] == 'rescrape':
        print('rescrape')
        for line in sys.stdin.readlines():
            line = line.strip()
            url = canonizeUrl(line)
            print(url)