Example #1
def scrape(source: WebSource, workerId: int, url: str, triesLeft: int = 3
		) -> Web:
	if triesLeft <= 0:
		raise Exception('scrape: exceeded retry count')

	created = int(time.time() * 1000)
	r = make_req(workerId, url)

	w = Web(created_=created, url_=url, status_=r.status, sourceId_=source.id)
	w.response = r.response
	w.requestHeaders = None # str(r.request.headers).encode('utf-8')
	w.responseHeaders = None # str(r.headers).encode('utf-8')

	if w.status == 200:
		dec = enc.decode(w.response, url)
		if dec is not None and dec[0] is not None:
			w.encoding = Encoding.lookup(oil.open(), dec[0]).id

		if dec is not None and dec[1] is not None:
			title = ''
			try:
				title = extractTitle(dec[1]).strip()
			except Exception:
				pass
			# a 200 response can still be a Cloudflare interstitial; detect
			# it by page title and retry after a short randomized delay
			if title == 'Just a moment...' \
					or title == 'Attention Required! | Cloudflare':
				plog(f'scrape: got 200 status CF page, retrying: {triesLeft - 1}')
				time.sleep(9 + random.random() * 2)
				return scrape(source, workerId, url, triesLeft - 1)

	w.save(oil.open())
	return w
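
A hedged usage sketch for scrape above; the node name, IP, and URL are hypothetical placeholders, and oil.open() is assumed to be configured:

# hypothetical lookup arguments; WebSource.lookup signature as in Example #2
source = WebSource.lookup(oil.open(), 'example_node', '203.0.113.7')
w = scrape(source, workerId=1, url='https://example.com/')
# scrape() raises once retries are exhausted, so w is always a saved Web row
print(w.status, -1 if w.response is None else len(w.response))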
Example #2
def work(workerId: int, stripeCount: int, stripe: int, blockSize: int) -> int:
	node = get_node_name(workerId)
	while True:
		if maybe_restart_vpn(node):
			plog('work: restarted vpn; restarting fr')
			restart_worker_fr(node)

		if not get_fr_ok(workerId):
			plog('work: fr is not ok; attempting to restart fr')
			restart_worker_fr(node)
			if not get_fr_ok(workerId):
				plog('work: fr still not ok; aborting')
				return 2

		if 'skitter' not in get_sessions_list(workerId):
			plog('work: skitter not in sessions list; trying to create it...')
			create_session(workerId, 'skitter', defaultUserAgent)

		if 'skitter' not in get_sessions_list(workerId):
			plog('work: skitter session does not exist; aborting')
			return 3

		remoteIP = get_worker_ip_fr(workerId)
		plog(f'work: {remoteIP=}')
		source = WebSource.lookup(oil.open(), f'{NODE_NAME}_{node}', remoteIP)
		plog(f'work: source: {source.__dict__}')
		if source.isLocal() or source.source is None \
				or source.source.startswith(publicIp):
			plog('work: source is local; aborting')
			return 1

		WebQueue.resetWorker(oil.open(), workerId)
		workBlock(workerId, stripeCount, stripe, blockSize, source)
Example #3
def v0_crawl() -> ResponseReturnValue:
	apiKey = get_request_value('apiKey', '')
	if apiKey not in API_KEYS:
		return make_response({'err': -401, 'msg': 'unauthorized'}, 401)

	q = get_request_value('q', None)
	print(f'v0_crawl: {q=}')
	if q is None or len(q.strip()) < 1:
		return page_not_found(NotFound())

	if not q.startswith('http://') and not q.startswith('https://'):
		return page_not_found(NotFound())

	ts = int(time.time()) - 1
	db = oil.open()
	scraper = RemoteWebScraper(db)
	scraper.scrape(q)
	latest = Web.latest(db, q, status=200)
	if latest is None or latest.created is None:
		print(f'v0_crawl: {q=}: error: no latest entry')
		return make_response({'err': -500, 'msg': 'internal server error'}, 500)
	lts = int(latest.created // 1000)
	if lts < ts:
		print(f'v0_crawl: {q=}: error getting fresh crawl: {lts} < {ts}')
		return make_response({'err': -500, 'msg': 'internal server error'}, 500)
	return make_response_web(latest)
Example #4
def v0_cache() -> ResponseReturnValue:
	apiKey = get_request_value('apiKey', '')
	if apiKey not in API_KEYS:
		return make_response({'err': -401, 'msg': 'unauthorized'}, 401)

	q = get_request_value('q', None)
	u = get_request_value('u', None)
	print(f'v0_cache: {q=}, {u=}')
	if (q is None or len(q.strip()) < 1) \
			and (u is None or len(u.strip()) < 1):
		print('v0_cache: q and u are empty')
		return page_not_found(NotFound())

	db = oil.open()
	if u:
		latest = Web.latest(db, ulike=u, status=200)
	else:
		latest = Web.latest(db, q, status=200)

	if latest is None or latest.response is None or latest.created is None:
		print(f'v0_cache: {q=}, {u=}: not found')
	else:
		print(f'v0_cache: {q=}, {u=}: found: len: {len(latest.response)}, '
			f'url: {latest.url}, id: {latest.id}, created: {latest.created}')
	return make_response_web(latest)
Example #5
	def save(sourceId: int, authorId: int, reason: int) -> None:
		with oil.open() as db, db.cursor() as curs:
			curs.execute('''
				insert into authorBlacklist(sourceId, authorId, reason)
				values(%s, %s, %s)
				on conflict(sourceId, authorId, reason) do
				update set updated = current_timestamp
				''', (sourceId, authorId, reason))
Example #6
	def check(sourceId: int, authorId: int, reason: Optional[int] = None) -> bool:
		with oil.open() as db, db.cursor() as curs:
			# a null reason matches any stored reason; rows with sourceId 19
			# match regardless of the requested reason
			curs.execute('''
				select sourceId, authorId from authorBlacklist
				where sourceId = %s and authorId = %s
					and ((%s is null or reason = %s) or sourceId = 19)
			''', (sourceId, authorId, reason, reason))
			return len(curs.fetchall()) > 0
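
A hedged round-trip sketch for the save and check helpers above, assuming both are staticmethods on AuthorBlacklist (as Example #11 suggests); the ids and reason codes are hypothetical:

AuthorBlacklist.save(sourceId=2, authorId=12345, reason=1)
assert AuthorBlacklist.check(sourceId=2, authorId=12345)            # any reason
assert AuthorBlacklist.check(sourceId=2, authorId=12345, reason=1)  # exact reason
assert not AuthorBlacklist.check(sourceId=2, authorId=12345, reason=7)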
Example #7
	def save(urlId: str, reason: int) -> None:
		with oil.open() as db, db.cursor() as curs:
			curs.execute('''
				insert into ficBlacklist(urlId, reason)
				values(%s, %s)
				on conflict(urlId, reason) do
				update set updated = current_timestamp
				''', (urlId, reason))
Example #8
	def select(urlId: Optional[str] = None) -> List['FicBlacklist']:
		with oil.open() as db, db.cursor() as curs:
			curs.execute('''
				select urlId, created, updated, reason
				from ficBlacklist
				where %s is null or urlId = %s
			''', (urlId, urlId))
			return [FicBlacklist(*r) for r in curs.fetchall()]
Example #9
	def select(urlId: Optional[str] = None) -> List['FicInfo']:
		with oil.open() as db, db.cursor() as curs:
			curs.execute(f'''
				select {FicInfo.selectList()}
				from ficInfo {FicInfo.tableAlias}
				where %s is null or id = %s
			''', (urlId, urlId))
			return [FicInfo(*r) for r in curs.fetchall()]
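
A hedged note on the null-parameter idiom shared by these select helpers: binding None makes the `%s is null` branch true, which disables the filter entirely. The urlId below is hypothetical:

allInfos = FicInfo.select()       # no filter: every row
one = FicInfo.select('abc123')    # at most one row for this id
print(len(allInfos), len(one))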
Example #10
	def select(isAutomated: bool, route: str, description: str
			) -> Optional['RequestSource']:
		with oil.open() as db, db.cursor() as curs:
			curs.execute('''
				select rs.id, rs.created, rs.isAutomated, rs.route, rs.description
				from requestSource rs
				where rs.isAutomated = %s and rs.route = %s and rs.description = %s
			''', (isAutomated, route, description))
			r = curs.fetchone()
			return None if r is None else RequestSource(*r)
Example #11
	def select(sourceId: Optional[int] = None, authorId: Optional[int] = None
			) -> List['AuthorBlacklist']:
		with oil.open() as db, db.cursor() as curs:
			curs.execute('''
				select sourceId, authorId, created, updated, reason
				from authorBlacklist
				where (%s is null or sourceId = %s)
					and (%s is null or authorId = %s)
			''', (sourceId, sourceId, authorId, authorId))
			return [AuthorBlacklist(*r) for r in curs.fetchall()]
Example #12
    def select(id_: int) -> Optional['RequestSource']:
        with oil.open() as db, db.cursor() as curs:
            curs.execute(
                '''
				select rs.id, rs.created, rs.isAutomated, rs.route, rs.description
				from requestSource rs
				where rs.id = %s
			''', (id_, ))
            r = curs.fetchone()
            return None if r is None else RequestSource(*r)
Example #13
	def insert(source: RequestSource, etype: str, query: str, infoRequestMs: int,
			urlId: Optional[str], ficInfo: Optional[str], exportMs: Optional[int],
			exportFileName: Optional[str], exportFileHash: Optional[str],
			url: Optional[str]) -> None:
		with oil.open() as db, db.cursor() as curs:
			curs.execute('''
				insert into requestLog(sourceId, etype, query, infoRequestMs, urlId,
					ficInfo, exportMs, exportFileName, exportFileHash, url)
				values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
				''', (source.id, etype, query, infoRequestMs, urlId, ficInfo, exportMs,
					exportFileName, exportFileHash, url))
Example #14
def workBlock(workerId: int, stripeCount: int, stripe: int, cnt: int,
		source: WebSource) -> None:
	defaultTimeout = 600.0
	idleTimeout = 0.25
	timeout = defaultTimeout
	while cnt > 0 and timeout > 0:
		wq = WebQueue.next(oil.open(), workerId, 0,
				stripeCount=stripeCount, stripe=stripe)
		if wq is None:
			timeout -= idleTimeout
			time.sleep(idleTimeout)
			continue
		assert wq.url is not None
		plog(f'workBlock: {cnt}: {wq.url}')
		cnt -= 1
		timeout = defaultTimeout
		w = scrape(source, workerId, wq.url)
		wq.dequeue(oil.open())
		rlen = -1 if w.response is None else len(w.response)
		plog(f'workBlock:   status {w.status}, {rlen} bytes')

		time.sleep(9 + random.random() * 2)
Example #15
	def lookup(urlId: str, version: int, etype: str, inputHash: str
			) -> Optional['ExportLog']:
		with oil.open() as db, db.cursor() as curs:
			curs.execute('''
				select *
				from exportLog e
				where e.urlId = %s
					and e.version = %s
					and e.etype = %s
					and e.inputHash = %s
				''', (urlId, version, etype, inputHash))
			r = curs.fetchone()
			return ExportLog(*r[:ExportLog.fieldCount]) if r is not None else None
Example #16
	def mostRecentByUrlId(etype: str, urlId: str) -> Optional['RequestLog']:
		with oil.open() as db, db.cursor() as curs:
			curs.execute('''
				select r.id, r.created, r.sourceId, r.etype, r.query, r.infoRequestMs,
					r.urlId, r.ficInfo, r.exportMs, r.exportFileName, r.exportFileHash,
					r.url
				from requestLog r
				where r.etype = %s and r.urlId = %s
				order by r.created desc
				limit 1
				''', (etype, urlId))
			r = curs.fetchone()
			return None if r is None else RequestLog(*r)
Example #17
def v0_cache() -> ResponseReturnValue:
    remoteAddr = get_remote_addr()
    apiKey = get_request_value('apiKey', None)

    db = oil.open()
    limiter = get_limiter(db, remoteAddr, apiKey)
    retryAfterResponse = limiter.retryAfterResponse(db, .1)
    if retryAfterResponse is not None:
        return retryAfterResponse

    res = v0_cache_internal()
    if res is not None:
        return res
    return make_response({'err': -404, 'msg': 'not found'}, 404)
Example #18
def v0_soft_crawl() -> ResponseReturnValue:
    remoteAddr = get_remote_addr()
    apiKey = get_request_value('apiKey', None)

    db = oil.open()
    limiter = get_limiter(db, remoteAddr, apiKey)
    retryAfterResponse = limiter.retryAfterResponse(db, .1)
    if retryAfterResponse is not None:
        return retryAfterResponse

    res = v0_cache_internal()
    if res is not None:
        return res
    return v0_crawl()
Example #19
    def mostRecentByUrlId(urlId: str) -> Optional['RequestLog']:
        with oil.open() as db, db.cursor() as curs:
            curs.execute(
                '''
			select r.id, r.created, r.sourceId, r.etype, r.query, r.infoRequestMs,
				r.urlId, r.ficInfo, r.exportMs, r.exportFileName, r.exportFileHash,
				r.url
			from requestLog r
			where urlId = %s
			order by created desc limit 1''', (urlId, ))
            r = curs.fetchone()
            if r is None:
                return None
            return RequestLog(*r)
Example #20
	def upsert(isAutomated: bool, route: str, description: str
			) -> 'RequestSource':
		existing = RequestSource.select(isAutomated, route, description)
		if existing is not None:
			return existing
		with oil.open() as db, db.cursor() as curs:
			curs.execute('''
				insert into requestSource(isAutomated, route, description)
				values (%s, %s, %s)
				on conflict(isAutomated, route, description) do nothing
			''', (isAutomated, route, description))
		src = RequestSource.select(isAutomated, route, description)
		if src is None:
			raise Exception('RequestSource.upsert: failed to upsert')
		return src
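
A hedged get-or-create sketch for the upsert above; the route and description strings are hypothetical:

src = RequestSource.upsert(isAutomated=False, route='v0/epub', description='web ui')
# a second call with the same triple returns the existing row
assert RequestSource.upsert(False, 'v0/epub', 'web ui').id == src.id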
Example #21
	def upsert(self) -> 'ExportLog':
		with oil.open() as db, db.cursor() as curs:
			curs.execute('''
				insert into exportLog(urlId, version, etype, inputHash, exportHash)
				values(%s, %s, %s, %s, %s)
				on conflict(urlId, version, etype, inputHash) do
				update set exportHash = EXCLUDED.exportHash
				where exportLog.created < EXCLUDED.created
				''', (self.urlId, self.version, self.etype, self.inputHash,
					self.exportHash))
		l = ExportLog.lookup(self.urlId, self.version, self.etype, self.inputHash)
		assert l is not None
		self.exportHash = l.exportHash
		self.created = l.created
		return self
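
A hedged usage sketch for ExportLog.upsert above, assuming the constructor accepts these fields by keyword; the urlId and hashes are hypothetical:

log = ExportLog(urlId='abc123', version=1, etype='epub',
		inputHash='in0000', exportHash='out0000')
log = log.upsert()
# created/exportHash now reflect the stored row; exportHash is only replaced
# when the existing row is older than the incoming one
print(log.created, log.exportHash)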
Example #22
	def check(urlId: str, reason: Optional[int] = None) -> bool:
		with oil.open() as db, db.cursor() as curs:
			# blacklisted if the fic or its author has a matching blacklist
			# row (any reason when the bound reason is null), or if the fic
			# belongs to sourceId 19
			curs.execute('''
				select 1
				from ficInfo fi
				left join ficBlacklist fb
					on fb.urlId = fi.id
					and (%s is null or fb.reason = %s)
				left join authorBlacklist ab
					on ab.sourceId = fi.sourceId
					and ab.authorId = fi.authorId
					and (%s is null or ab.reason = %s)
				where fi.id = %s
					and ((fb.reason is not null or ab.reason is not null)
						or fi.sourceId = 19)
			''', (reason, reason, reason, reason, urlId))
			return len(curs.fetchall()) > 0
Example #23
    def fetchAfter(after: int) -> List['RequestLog']:
        with oil.open() as db, db.cursor() as curs:
            # rows after the given id, skipping any whose exportFileHash
            # already appeared on an earlier row
            curs.execute(
                '''
				select r.id, r.created, r.sourceId, r.etype, r.query, r.infoRequestMs,
					r.urlId, r.ficInfo, r.exportMs, r.exportFileName, r.exportFileHash,
					r.url
				from requestLog r
				where id > %s and (r.exportFileHash is null or not exists (
					select 1
					from requestLog r2
					where r2.exportFileHash = r.exportFileHash
						and r2.id < r.id
				))
				''', (after, ))
            return [RequestLog(*r) for r in curs.fetchall()]
Example #24
def v0_status() -> ResponseReturnValue:
    remoteAddr = get_remote_addr()
    apiKey = get_request_value('apiKey', None)

    db = oil.open()
    limiter = get_limiter(db, remoteAddr, apiKey)
    retryAfterResponse = limiter.retryAfterResponse(db, .1)
    if retryAfterResponse is not None:
        return retryAfterResponse

    limiter = limiter.refresh(db)
    return make_response({
        'err': 0,
        'status': 'ok',
        'pid': os.getpid(),
        'tident': threading.get_ident(),
        'burst': int(math.floor(limiter.burst())),
        'flow': limiter.flow,
        'anon': limiter.isAnon()
    })
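
A hedged client-side sketch for the status endpoint above; the base URL and apiKey are hypothetical placeholders:

import requests

r = requests.get('https://weaver.example/v0/status',
                 params={'apiKey': 'hypothetical-key'}, timeout=30)
print(r.status_code, r.json())  # includes burst, flow, and anon fields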
Example #25
def main(args: List[str]) -> int:
	global logFileName
	stripeCount = 1
	stripe = 0
	workerId = int(args[1])  # args is expected to be sys.argv
	logFileName = f'skitter_worker_{workerId}.log'

	global publicIp
	publicIp = get_public_ip()
	plog(f'main: public ip: {publicIp}')

	global blockSize

	run = True
	while run:
		doTryReset = False
		try:
			work(workerId, stripeCount, stripe, blockSize)
		except KeyboardInterrupt:
			run = False
		except Exception as e:
			plog(f'work: exception:\n{e}\n{traceback.format_exc()}')
			doTryReset = True

		try:
			WebQueue.resetWorker(oil.open(), workerId)
		except KeyboardInterrupt:
			run = False
		except Exception:
			pass

		if doTryReset:
			plog('main: sleeping then resetting worker')
			time.sleep(30 + random.random() * 30)
			try_reset(workerId)

		if run:
			plog('main: sleeping then restarting work loop')
			time.sleep(60 + random.random() * 90)

	return 0
Example #26
	def save(ficInfo: Dict[str, str]) -> None:
		with oil.open() as db, db.cursor() as curs:
			fi = FicInfo.parse(ficInfo)
			curs.execute('''
				insert into ficInfo(
					id, title, author, chapters, words, description, ficCreated,
					ficUpdated, status, source, extraMeta, sourceId, authorId,
					contentHash)
				values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
				on conflict(id) do
				update set updated = current_timestamp,
					title = EXCLUDED.title, author = EXCLUDED.author,
					chapters = EXCLUDED.chapters, words = EXCLUDED.words,
					description = EXCLUDED.description,
					ficCreated = EXCLUDED.ficCreated, ficUpdated = EXCLUDED.ficUpdated,
					status = EXCLUDED.status, source = EXCLUDED.source,
					extraMeta = EXCLUDED.extraMeta, sourceId = EXCLUDED.sourceId,
					authorId = EXCLUDED.authorId, contentHash = EXCLUDED.contentHash
				''', (fi.id, fi.title, fi.author, fi.chapters, fi.words,
					fi.description, fi.ficCreated, fi.ficUpdated, fi.status, fi.source,
					fi.extraMeta, fi.sourceId, fi.authorId, fi.contentHash))
Example #27
        for w in Web.fetchIdRange_g(db,
                                    wid_s,
                                    wid_e,
                                    ulike=ffnLike,
                                    status=200):
            if w.response is None or len(w.response) < 1:
                continue
            assert w.url is not None and w.created is not None

            dec = enc.decode(w.response, w.url)
            if dec is None:
                continue
            html = dec[1]

            # prepend a crawl timestamp/url comment to the archived page
            ts = int(w.created // 1000)
            html = f"<!--\t{ts}\t{w.url}\t-->\n" + html

            # encode once; reuse for both the tar entry payload and size
            data = html.encode('utf-8')
            s = io.BytesIO(data)
            ti = tarfile.TarInfo(name=f"./{w.id}.html")
            ti.mtime = ts
            ti.size = len(data)
            xzf.addfile(tarinfo=ti, fileobj=s)

    return 0


if __name__ == '__main__':
    with oil.open() as db:
        res = main(db)
    sys.exit(res)
Example #28
def v0_crawl_internal(prefixMunges: List[Tuple[str, str]],
                      validPrefixes: List[str]) -> ResponseReturnValue:
    remoteAddr = get_remote_addr()
    apiKey = get_request_value('apiKey', None)

    q = get_request_value('q', None)
    if q is not None and len(q) > 4096:
        q = q[:4096]
    print(f'v0_crawl_internal: {q=}')

    if DISABLE_CRAWLING:
        print('v0_crawl_internal: temporarily disabled, 503')
        retryAfter = 300
        res = make_response(
            {
                'err': -503,
                'msg': 'service unavailable',
                'retryAfter': retryAfter
            }, 503)
        res.headers['Retry-After'] = retryAfter
        return res

    db = oil.open()
    limiter = get_limiter(db, remoteAddr, apiKey)
    WeaverRequestLog.log(db, limiter.id, q)

    if limiter.isAnon():
        glimiter = WeaverLimiter.select(db, 'global_anon')
        if glimiter is None:
            return make_response({'err': -5, 'msg': 'no global limiter'}, 500)
        retryAfterResponse = glimiter.retryAfterResponse(db, 1)
        if retryAfterResponse is not None:
            return retryAfterResponse

    retryAfterResponse = limiter.retryAfterResponse(db, 1)
    if retryAfterResponse is not None:
        return retryAfterResponse

    if q is None or len(q.strip()) < 1:
        return make_response({'err': -1, 'msg': 'missing q param'}, 400)

    for munge in prefixMunges:
        if q.startswith(munge[0]):
            q = munge[1] + q[len(munge[0]):]

    validPrefix = False
    for pre in validPrefixes:
        if q.startswith(pre):
            validPrefix = True
            break
    if not validPrefix:
        return make_response({
            'err': -2,
            'msg': 'url is invalid',
            'arg': q
        }, 400)

    try:
        global defaultRequestTimeout, defaultUserAgent
        global skitterBaseUrl, skitterApiKey, skitterUser

        cookies: Dict[str, str] = {}
        headers = {'User-Agent': defaultUserAgent}
        url = urllib.parse.urljoin(skitterBaseUrl, 'v0/crawl')

        r = requests.get(url,
                         headers=headers,
                         cookies=cookies,
                         params={'q': q},
                         data={'apiKey': skitterApiKey},
                         auth=skitterUser,
                         timeout=defaultRequestTimeout)
        if r.status_code != 200:
            return make_response({
                'err': -3,
                'msg': 'skitter error',
                'arg': q
            }, 500)

        fres = make_response(r.content)
        for rh in r.headers:
            if rh.startswith('X-Weaver'):
                fres.headers[rh] = r.headers[rh]
        return fres
    except Exception as e:
        print(
            f'v0_crawl_internal: exception {q}: {e}\n{traceback.format_exc()}')

    return make_response({'err': -4, 'msg': 'no return', 'arg': q}, 500)
Example #29
#!/usr/bin/env python
import sys
import traceback
from typing import Set, Dict, Optional, Any
from bs4 import BeautifulSoup  # type: ignore
import urllib.parse
from oil import oil
from weaver import RemoteWebScraper, WebQueue
import weaver.enc as enc
from minerva import FFNLanguage, FFNCategory, FFNGenre, FFNFandom, FFNCharacter

db = oil.open()
scraper = RemoteWebScraper(db)
#scraper.baseDelay = 30
scraper.requestTimeout = 300
scraper.mustyThreshold = 60 * 60 * 24 * 30  # ~one month in seconds


def stripAfter(s: str, needle: str) -> str:
    idx = s.find(needle)
    if idx < 0:
        return s
    return s[:idx]


baseUrl = 'https://www.fanfiction.net'
baseCrossoverUrl = 'https://www.fanfiction.net/crossovers'


def getCategories(scraper: RemoteWebScraper) -> Set[str]:
    categoryBlacklist = [
Example #30
	def maxId() -> int:
		with oil.open() as db, db.cursor() as curs:
			curs.execute('select max(id) from requestLog')
			r = curs.fetchone()
			# max(id) is null when the table is empty
			return -1 if r is None or r[0] is None else r[0]
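
A hedged polling sketch combining maxId above with fetchAfter from Example #23, assuming both are staticmethods on RequestLog; the sleep interval is arbitrary:

import time

after = RequestLog.maxId()
time.sleep(60)
for log in RequestLog.fetchAfter(after):
	print(log.id, log.etype, log.urlId)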