def scrape(source: WebSource, workerId: int, url: str, triesLeft: int = 3
		) -> Web:
	if triesLeft <= 0:
		raise Exception('scrape: exceeded retry count')
	created = int(time.time() * 1000)
	r = make_req(workerId, url)
	w = Web(created_=created, url_=url, status_=r.status, sourceId_=source.id)
	w.response = r.response
	w.requestHeaders = None  # str(r.request.headers).encode('utf-8')
	w.responseHeaders = None  # str(r.headers).encode('utf-8')
	if w.status == 200:
		dec = enc.decode(w.response, url)
		if dec is not None and dec[0] is not None:
			w.encoding = Encoding.lookup(oil.open(), dec[0]).id
		if dec is not None and dec[1] is not None:
			title = ''
			try:
				title = extractTitle(dec[1]).strip()
			except Exception:
				pass
			# a 200 can still be a Cloudflare interstitial; detect by title
			if title == 'Just a moment...' \
					or title == 'Attention Required! | Cloudflare':
				plog(f'scrape: got 200 status CF page, retrying: {triesLeft - 1}')
				time.sleep(9 + random.random() * 2)
				return scrape(source, workerId, url, triesLeft - 1)
	w.save(oil.open())
	return w
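# A minimal sketch of scrape()'s retry behavior, with a made-up worker id
# and url; `source` is assumed to be a previously looked-up WebSource:
#
#   w = scrape(source, 0, 'https://example.com/some/page')
#   # A 200 that is actually a Cloudflare "Just a moment..." interstitial
#   # triggers a ~9-11s sleep and a retry, up to 3 tries total, after which
#   # scrape() raises.
#   print(w.status, 0 if w.response is None else len(w.response))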
def work(workerId: int, stripeCount: int, stripe: int, blockSize: int) -> int:
	node = get_node_name(workerId)
	while True:
		if maybe_restart_vpn(node):
			plog('work: restarted vpn; restarting fr')
			restart_worker_fr(node)
		if not get_fr_ok(workerId):
			plog('work: fr is not ok; attempting to restart fr')
			restart_worker_fr(node)
			if not get_fr_ok(workerId):
				plog('work: fr still not ok; aborting')
				return 2
		if 'skitter' not in get_sessions_list(workerId):
			global defaultUserAgent
			plog('work: skitter not in sessions list; trying to create it...')
			create_session(workerId, 'skitter', defaultUserAgent)
			if 'skitter' not in get_sessions_list(workerId):
				plog('work: skitter session does not exist; aborting')
				return 3
		remoteIP = get_worker_ip_fr(workerId)
		plog(f'work: {remoteIP=}')
		source = WebSource.lookup(oil.open(), f'{NODE_NAME}_{node}', remoteIP)
		plog(f'work: source: {source.__dict__}')
		if source.isLocal() or source.source is None \
				or source.source.startswith(publicIp):
			plog('work: source is local; aborting')
			return 1
		WebQueue.resetWorker(oil.open(), workerId)
		workBlock(workerId, stripeCount, stripe, blockSize, source)
def v0_crawl() -> ResponseReturnValue:
	apiKey = get_request_value('apiKey', '')
	if apiKey not in API_KEYS:
		return make_response({'err': -401, 'msg': 'unauthorized'}, 401)
	q = get_request_value('q', None)
	print(f'v0_crawl: {q=}')
	if q is None or len(q.strip()) < 1:
		return page_not_found(NotFound())
	if not q.startswith('http://') and not q.startswith('https://'):
		return page_not_found(NotFound())
	ts = int(time.time()) - 1
	db = oil.open()
	scraper = RemoteWebScraper(db)
	scraper.scrape(q)
	latest = Web.latest(db, q, status=200)
	if latest is None or latest.created is None:
		print(f'v0_crawl: {q=}: error: no latest entry')
		return make_response({'err': -500, 'msg': 'internal server error'}, 500)
	lts = int(latest.created // 1000)
	if lts < ts:
		print(f'v0_crawl: {q=}: error getting fresh crawl: {ts} >= {lts}')
		return make_response({'err': -500, 'msg': 'internal server error'}, 500)
	return make_response_web(latest)
def v0_cache() -> ResponseReturnValue:
	apiKey = get_request_value('apiKey', '')
	if apiKey not in API_KEYS:
		return make_response({'err': -401, 'msg': 'unauthorized'}, 401)
	q = get_request_value('q', None)
	u = get_request_value('u', None)
	print(f'v0_cache: {q=}, {u=}')
	if (q is None or len(q.strip()) < 1) \
			and (u is None or len(u.strip()) < 1):
		print('v0_cache: q and u are empty')
		return page_not_found(NotFound())
	db = oil.open()
	if u:
		latest = Web.latest(db, ulike=u, status=200)
	else:
		latest = Web.latest(db, q, status=200)
	if latest is None or latest.response is None or latest.created is None:
		print(f'v0_cache: {q=}, {u=}: not found')
	else:
		print(f'v0_cache: {q=}, {u=}: found: len: {len(latest.response)},'
				+ f' url: {latest.url}, id: {latest.id}, created: {latest.created}')
	return make_response_web(latest)
def save(sourceId: int, authorId: int, reason: int) -> None:
	with oil.open() as db, db.cursor() as curs:
		curs.execute('''
			insert into authorBlacklist(sourceId, authorId, reason)
			values(%s, %s, %s)
			on conflict(sourceId, authorId, reason)
			do update set updated = current_timestamp
		''', (sourceId, authorId, reason))
def check(sourceId: int, authorId: int, reason: Optional[int] = None) -> bool:
	with oil.open() as db, db.cursor() as curs:
		curs.execute('''
			select sourceId, authorId from authorBlacklist
			where sourceId = %s and authorId = %s
				and ((%s is null or reason = %s) or sourceId = 19)
		''', (sourceId, authorId, reason, reason))
		return len(curs.fetchall()) > 0
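# A minimal usage sketch of the blacklist round-trip above, with made-up
# ids and reason code; it assumes save()/check() are importable as shown
# (in context they may live on an AuthorBlacklist class):
#
#   save(sourceId=2, authorId=12345, reason=1)
#   assert check(sourceId=2, authorId=12345, reason=1)
#   assert check(sourceId=2, authorId=12345)  # reason=None matches any row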
def save(urlId: str, reason: int) -> None:
	with oil.open() as db, db.cursor() as curs:
		curs.execute('''
			insert into ficBlacklist(urlId, reason)
			values(%s, %s)
			on conflict(urlId, reason)
			do update set updated = current_timestamp
		''', (urlId, reason))
def select(urlId: Optional[str] = None) -> List['FicBlacklist']:
	with oil.open() as db, db.cursor() as curs:
		curs.execute('''
			select urlId, created, updated, reason from ficBlacklist
			where %s is null or urlId = %s
		''', (urlId, urlId))
		return [FicBlacklist(*r) for r in curs.fetchall()]
def select(urlId: Optional[str] = None) -> List['FicInfo']:
	with oil.open() as db, db.cursor() as curs:
		curs.execute(f'''
			select {FicInfo.selectList()} from ficInfo {FicInfo.tableAlias}
			where %s is null or id = %s
		''', (urlId, urlId))
		return [FicInfo(*r) for r in curs.fetchall()]
def select(isAutomated: bool, route: str, description: str
		) -> Optional['RequestSource']:
	with oil.open() as db, db.cursor() as curs:
		curs.execute('''
			select rs.id, rs.created, rs.isAutomated, rs.route, rs.description
			from requestSource rs
			where rs.isAutomated = %s and route = %s and description = %s
		''', (isAutomated, route, description))
		r = curs.fetchone()
		return None if r is None else RequestSource(*r)
def select(sourceId: Optional[int] = None, authorId: Optional[int] = None
		) -> List['AuthorBlacklist']:
	with oil.open() as db, db.cursor() as curs:
		curs.execute('''
			select sourceId, authorId, created, updated, reason
			from authorBlacklist
			where (%s is null or sourceId = %s)
				and (%s is null or authorId = %s)
		''', (sourceId, sourceId, authorId, authorId))
		return [AuthorBlacklist(*r) for r in curs.fetchall()]
def select(id_: int) -> Optional['RequestSource']:
	with oil.open() as db, db.cursor() as curs:
		curs.execute('''
			select rs.id, rs.created, rs.isAutomated, rs.route, rs.description
			from requestSource rs
			where rs.id = %s
		''', (id_,))
		r = curs.fetchone()
		return None if r is None else RequestSource(*r)
def insert(source: RequestSource, etype: str, query: str, infoRequestMs: int,
		urlId: Optional[str], ficInfo: Optional[str], exportMs: Optional[int],
		exportFileName: Optional[str], exportFileHash: Optional[str],
		url: Optional[str]) -> None:
	with oil.open() as db, db.cursor() as curs:
		curs.execute('''
			insert into requestLog(sourceId, etype, query, infoRequestMs,
				urlId, ficInfo, exportMs, exportFileName, exportFileHash, url)
			values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
		''', (source.id, etype, query, infoRequestMs, urlId, ficInfo,
			exportMs, exportFileName, exportFileHash, url))
def workBlock(workerId: int, stripeCount: int, stripe: int, cnt: int,
		source: WebSource) -> None:
	defaultTimeout = 600.0
	idleTimeout = 0.25
	timeout = defaultTimeout
	while cnt > 0 and timeout > 0:
		wq = WebQueue.next(oil.open(), workerId, 0,
				stripeCount=stripeCount, stripe=stripe)
		if wq is None:
			timeout -= idleTimeout
			time.sleep(idleTimeout)
			continue
		assert wq.url is not None
		plog(f'workBlock: {cnt}: {wq.url}')
		cnt -= 1
		timeout = defaultTimeout
		w = scrape(source, workerId, wq.url)
		wq.dequeue(oil.open())
		rlen = -1 if w.response is None else len(w.response)
		plog(f'workBlock: status {w.status}, {rlen} bytes')
		time.sleep(9 + random.random() * 2)
def lookup(urlId: str, version: int, etype: str, inputHash: str
		) -> Optional['ExportLog']:
	with oil.open() as db, db.cursor() as curs:
		curs.execute('''
			select * from exportLog e
			where e.urlId = %s and e.version = %s
				and e.etype = %s and e.inputHash = %s
		''', (urlId, version, etype, inputHash))
		r = curs.fetchone()
		return ExportLog(*r[:ExportLog.fieldCount]) if r is not None else None
def mostRecentByUrlId(etype: str, urlId: str) -> Optional['RequestLog']:
	with oil.open() as db, db.cursor() as curs:
		curs.execute('''
			select r.id, r.created, r.sourceId, r.etype, r.query,
				r.infoRequestMs, r.urlId, r.ficInfo, r.exportMs,
				r.exportFileName, r.exportFileHash, r.url
			from requestLog r
			where r.etype = %s and r.urlId = %s
			order by r.created desc
			limit 1
		''', (etype, urlId))
		r = curs.fetchone()
		return None if r is None else RequestLog(*r)
def v0_cache() -> ResponseReturnValue:
	remoteAddr = get_remote_addr()
	apiKey = get_request_value('apiKey', None)
	db = oil.open()
	limiter = get_limiter(db, remoteAddr, apiKey)
	retryAfterResponse = limiter.retryAfterResponse(db, .1)
	if retryAfterResponse is not None:
		return retryAfterResponse
	res = v0_cache_internal()
	if res is not None:
		return res
	return make_response({'err': -404, 'msg': 'not found'}, 404)
def v0_soft_crawl() -> ResponseReturnValue:
	remoteAddr = get_remote_addr()
	apiKey = get_request_value('apiKey', None)
	db = oil.open()
	limiter = get_limiter(db, remoteAddr, apiKey)
	retryAfterResponse = limiter.retryAfterResponse(db, .1)
	if retryAfterResponse is not None:
		return retryAfterResponse
	# serve from cache when possible, otherwise fall through to a live crawl
	res = v0_cache_internal()
	if res is not None:
		return res
	return v0_crawl()
def mostRecentByUrlId(urlId: str) -> Optional['RequestLog']:
	with oil.open() as db, db.cursor() as curs:
		curs.execute('''
			select r.id, r.created, r.sourceId, r.etype, r.query,
				r.infoRequestMs, r.urlId, r.ficInfo, r.exportMs,
				r.exportFileName, r.exportFileHash, r.url
			from requestLog r
			where urlId = %s
			order by created desc
			limit 1
		''', (urlId,))
		r = curs.fetchone()
		return None if r is None else RequestLog(*r)
def upsert(isAutomated: bool, route: str, description: str
		) -> 'RequestSource':
	existing = RequestSource.select(isAutomated, route, description)
	if existing is not None:
		return existing
	with oil.open() as db, db.cursor() as curs:
		curs.execute('''
			insert into requestSource(isAutomated, route, description)
			values(%s, %s, %s)
			on conflict(isAutomated, route, description) do nothing
		''', (isAutomated, route, description))
	src = RequestSource.select(isAutomated, route, description)
	if src is None:
		raise Exception('RequestSource.upsert: failed to upsert')
	return src
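# The select/insert/select in upsert() is race-safe: concurrent callers may
# both attempt the insert, but "on conflict ... do nothing" makes the loser
# a no-op and the trailing select returns whichever row won. A hedged usage
# sketch with made-up route/description values:
#
#   src = upsert(False, '/v0/epub', 'manual epub export')
#   assert upsert(False, '/v0/epub', 'manual epub export').id == src.id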
def upsert(self) -> 'ExportLog':
	with oil.open() as db, db.cursor() as curs:
		curs.execute('''
			insert into exportLog(urlId, version, etype, inputHash, exportHash)
			values(%s, %s, %s, %s, %s)
			on conflict(urlId, version, etype, inputHash)
			do update set exportHash = EXCLUDED.exportHash
				where exportLog.created < EXCLUDED.created
		''', (self.urlId, self.version, self.etype, self.inputHash,
			self.exportHash))
	l = ExportLog.lookup(self.urlId, self.version, self.etype, self.inputHash)
	assert l is not None
	self.exportHash = l.exportHash
	self.created = l.created
	return self
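# A hedged sketch of the upsert flow above, with made-up hashes; it assumes
# an ExportLog constructor accepting these fields by name:
#
#   log = ExportLog(urlId='abc123', version=1, etype='epub',
#           inputHash='deadbeef', exportHash='cafef00d')
#   log = log.upsert()  # exportHash/created now reflect the winning db row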
def check(urlId: str, reason: Optional[int] = None) -> bool:
	with oil.open() as db, db.cursor() as curs:
		curs.execute('''
			select 1 from ficInfo fi
			left join ficBlacklist fb on fb.urlId = fi.id
				and (%s is null or fb.reason = %s)
			left join authorBlacklist ab on ab.sourceId = fi.sourceId
				and ab.authorId = fi.authorId
				and (%s is null or ab.reason = %s)
			where fi.id = %s
				and ((fb.reason is not null or ab.reason is not null)
					or fi.sourceId = 19)
		''', (reason, reason, reason, reason, urlId))
		return len(curs.fetchall()) > 0
def fetchAfter(after: int) -> List['RequestLog']:
	with oil.open() as db, db.cursor() as curs:
		# skip rows whose exportFileHash already appeared on an earlier row
		curs.execute('''
			select r.id, r.created, r.sourceId, r.etype, r.query,
				r.infoRequestMs, r.urlId, r.ficInfo, r.exportMs,
				r.exportFileName, r.exportFileHash, r.url
			from requestLog r
			where id > %s
				and (r.exportFileHash is null
					or not exists (
						select 1 from requestLog r2
						where r2.exportFileHash = r.exportFileHash
							and r2.id < r.id
					))
		''', (after,))
		return [RequestLog(*r) for r in curs.fetchall()]
def v0_status() -> ResponseReturnValue:
	remoteAddr = get_remote_addr()
	apiKey = get_request_value('apiKey', None)
	db = oil.open()
	limiter = get_limiter(db, remoteAddr, apiKey)
	retryAfterResponse = limiter.retryAfterResponse(db, .1)
	if retryAfterResponse is not None:
		return retryAfterResponse
	limiter = limiter.refresh(db)
	return make_response({
		'err': 0,
		'status': 'ok',
		'pid': os.getpid(),
		'tident': threading.get_ident(),
		'burst': int(math.floor(limiter.burst())),
		'flow': limiter.flow,
		'anon': limiter.isAnon(),
	})
def main(args: List[str]) -> int:
	global logFileName
	stripeCount = 1
	stripe = 0
	workerId = int(args[1])
	logFileName = f'skitter_worker_{workerId}.log'
	global publicIp
	publicIp = get_public_ip()
	plog(f'main: public ip: {publicIp}')
	global blockSize
	run = True
	while run:
		doTryReset = False
		try:
			work(workerId, stripeCount, stripe, blockSize)
		except KeyboardInterrupt:
			run = False
		except Exception as e:
			plog(f'work: exception:\n{e}\n{traceback.format_exc()}')
			doTryReset = True
			try:
				WebQueue.resetWorker(oil.open(), workerId)
			except KeyboardInterrupt:
				run = False
			except Exception:
				pass
		if doTryReset:
			plog('main: sleeping then resetting worker')
			time.sleep(30 + random.random() * 30)
			try_reset(workerId)
		if run:
			plog('main: sleeping then restarting work loop')
			time.sleep(60 + random.random() * 90)
	return 0
def save(ficInfo: Dict[str, str]) -> None:
	with oil.open() as db, db.cursor() as curs:
		fi = FicInfo.parse(ficInfo)
		curs.execute('''
			insert into ficInfo(
				id, title, author, chapters, words, description,
				ficCreated, ficUpdated, status, source, extraMeta,
				sourceId, authorId, contentHash)
			values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
			on conflict(id) do update set
				updated = current_timestamp,
				title = EXCLUDED.title,
				author = EXCLUDED.author,
				chapters = EXCLUDED.chapters,
				words = EXCLUDED.words,
				description = EXCLUDED.description,
				ficCreated = EXCLUDED.ficCreated,
				ficUpdated = EXCLUDED.ficUpdated,
				status = EXCLUDED.status,
				source = EXCLUDED.source,
				extraMeta = EXCLUDED.extraMeta,
				sourceId = EXCLUDED.sourceId,
				authorId = EXCLUDED.authorId,
				contentHash = EXCLUDED.contentHash
		''', (fi.id, fi.title, fi.author, fi.chapters, fi.words,
			fi.description, fi.ficCreated, fi.ficUpdated, fi.status,
			fi.source, fi.extraMeta, fi.sourceId, fi.authorId,
			fi.contentHash))
	for w in Web.fetchIdRange_g(db, wid_s, wid_e, ulike=ffnLike, status=200):
		if w.response is None or len(w.response) < 1:
			continue
		assert w.url is not None and w.created is not None
		dec = enc.decode(w.response, w.url)
		if dec is None:
			continue
		html = dec[1]
		ts = int(w.created // 1000)
		# prepend a comment recording the crawl timestamp and source url
		html = f"<!--\t{ts}\t{w.url}\t-->\n" + html
		data = html.encode('utf-8')
		s = io.BytesIO(data)
		ti = tarfile.TarInfo(name=f"./{w.id}.html")
		ti.mtime = ts
		ti.size = len(data)
		xzf.addfile(tarinfo=ti, fileobj=s)
	return 0


if __name__ == '__main__':
	with oil.open() as db:
		res = main(db)
	sys.exit(res)
def v0_crawl_internal(prefixMunges: List[Tuple[str, str]],
		validPrefixes: List[str]) -> ResponseReturnValue:
	remoteAddr = get_remote_addr()
	apiKey = get_request_value('apiKey', None)
	q = get_request_value('q', None)
	if q is not None and len(q) > 4096:
		q = q[:4096]
	print(f'v0_crawl_internal: {q=}')
	if DISABLE_CRAWLING:
		print('v0_crawl_internal: temporarily disabled, 503')
		retryAfter = 300
		res = make_response({
			'err': -503,
			'msg': 'service unavailable',
			'retryAfter': retryAfter,
		}, 503)
		res.headers['Retry-After'] = retryAfter
		return res
	db = oil.open()
	limiter = get_limiter(db, remoteAddr, apiKey)
	WeaverRequestLog.log(db, limiter.id, q)
	if limiter.isAnon():
		glimiter = WeaverLimiter.select(db, 'global_anon')
		if glimiter is None:
			return make_response({'err': -5, 'msg': 'no global limiter'}, 500)
		retryAfterResponse = glimiter.retryAfterResponse(db, 1)
		if retryAfterResponse is not None:
			return retryAfterResponse
	retryAfterResponse = limiter.retryAfterResponse(db, 1)
	if retryAfterResponse is not None:
		return retryAfterResponse
	if q is None or len(q.strip()) < 1:
		return make_response({'err': -1, 'msg': 'missing q param'}, 400)
	for munge in prefixMunges:
		if q.startswith(munge[0]):
			q = munge[1] + q[len(munge[0]):]
	validPrefix = False
	for pre in validPrefixes:
		if q.startswith(pre):
			validPrefix = True
			break
	if not validPrefix:
		return make_response({
			'err': -2,
			'msg': 'url is invalid',
			'arg': q,
		}, 400)
	try:
		global defaultRequestTimeout, defaultUserAgent
		global skitterBaseUrl, skitterApiKey, skitterUser
		cookies: Dict[str, str] = {}
		headers = {'User-Agent': defaultUserAgent}
		url = urllib.parse.urljoin(skitterBaseUrl, 'v0/crawl')
		r = requests.get(url, headers=headers, cookies=cookies,
				params={'q': q}, data={'apiKey': skitterApiKey},
				auth=skitterUser, timeout=defaultRequestTimeout)
		if r.status_code != 200:
			return make_response({
				'err': -3,
				'msg': 'skitter error',
				'arg': q,
			}, 500)
		fres = make_response(r.content)
		for rh in r.headers:
			if rh.startswith('X-Weaver'):
				fres.headers[rh] = r.headers[rh]
		return fres
	except Exception as e:
		print(f'v0_crawl_internal: exception {q}: {e}\n{traceback.format_exc()}')
	return make_response({'err': -4, 'msg': 'no return', 'arg': q}, 500)
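# A hedged sketch of the munge/validate step in v0_crawl_internal(), with
# made-up prefix lists; a matching prefix is rewritten before the
# validPrefixes check so the munged url passes validation:
#
#   prefixMunges = [('https://m.fanfiction.net/', 'https://www.fanfiction.net/')]
#   validPrefixes = ['https://www.fanfiction.net/']
#   # q = 'https://m.fanfiction.net/s/1' -> 'https://www.fanfiction.net/s/1'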
#!/usr/bin/env python
import sys
import traceback
import urllib.parse
from typing import Set, Dict, Optional, Any

from bs4 import BeautifulSoup  # type: ignore

from oil import oil
from weaver import RemoteWebScraper, WebQueue
import weaver.enc as enc
from minerva import FFNLanguage, FFNCategory, FFNGenre, FFNFandom, FFNCharacter

db = oil.open()
scraper = RemoteWebScraper(db)
#scraper.baseDelay = 30
scraper.requestTimeout = 300
scraper.mustyThreshold = 60 * 60 * 24 * 30 * 1


def stripAfter(s: str, needle: str) -> str:
	idx = s.find(needle)
	if idx < 0:
		return s
	return s[:idx]


baseUrl = 'https://www.fanfiction.net'
baseCrossoverUrl = 'https://www.fanfiction.net/crossovers'


def getCategories(scraper: RemoteWebScraper) -> Set[str]:
	categoryBlacklist = [
def maxId() -> int:
	with oil.open() as db, db.cursor() as curs:
		curs.execute('select max(id) from requestLog')
		r = curs.fetchone()
		# max(id) is null when the table is empty
		return -1 if r is None or r[0] is None else r[0]