Example #1
def getLastPage(db: 'psycopg2.connection', scraper: WebScraper, url: str
		) -> int:
	w = scraper.softScrape(url)
	dec = enc.decode(w.response, url)
	if dec is None:
		raise Exception(f"uhoh {w.url}")
	html = dec[1]
	soup = BeautifulSoup(html, 'html5lib')
	lcWrap = soup.find('div', { 'class': 'lc-wrapper' })
	if lcWrap is None:
		return 1
	maxSeen = 1
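	# pagination links share the page url's own path prefix, e.g. '/community/<stub>/<id>/';
	# only anchors under that prefix are treated as page links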
	stub = '/'.join([''] + url.split('/')[3:6] + [''])
	for a in lcWrap.findAll('a'):
		if a is None or a.getText() is None:
			continue
		href = a.get('href')
		if href is None:
			continue
		if href.startswith(stub):
			maxSeen = max(maxSeen, int(href.split('/')[-2]))
		if a.getText().strip() != 'Last':
			continue
		return int(href.split('/')[-2])
	return maxSeen
def prescrapeCommunity(db: 'psycopg2.connection', scraper: WebScraper,
                       comm: FFNCommunity) -> None:
    assert (comm.id is not None)
    deathCode = FFNCommunity.isDead(db, comm.id)
    if deathCode != 0:
        plog(
            f"skipping community {comm.id} {comm.stub}, already dead: {deathCode}"
        )
        return

    plog(f"prescraping community {comm.id} {comm.stub}")
    # grab the first page to get counts
    url = comm.getUrl()
    w = scraper.softScrape(url)
    dec = enc.decode(w.response, url)
    if dec is None:
        plog("  {comm.id} has unknown encoding")
        return
    html = dec[1]
    if len(html) < 1:
        plog(f"  {comm.id} is freshly dead: 1")
        FFNCommunity.bury(db, comm.id, 1, w.created)
        return

    pages = getPageCount(comm, html)
    if pages > 1:
        plog(f"  total pages: {pages}")
    for page in range(1, pages + 1):
        if pages > 1:
            plog(f"    grabbing page {page}/{pages}")
        scraper.softScrape(comm.getUrl(page))
Example #3
def lookupAbbreviatedFandoms() -> None:
    global fandomStubMap
    ks = sorted(fandomStubMap.keys(), reverse=True)
    cnt = 0
    for k in ks:
        if not fandomStubMap[k].endswith('...'):
            continue
        #print(f"{k}: {fandomStubMap[k]}")
        purl = f"{baseUrl}/{k}"
        #print(purl)
        cnt += 1

        w = scraper.softScrape(purl)
        assert (w.url is not None)
        dec = enc.decode(w.response, w.url)
        assert (dec is not None)
        html = dec[1]
        soup = BeautifulSoup(html, 'html5lib')
        title = soup.find('title').getText().strip()
        sufs = [' | FanFiction', 'FanFiction Archive']
        hasSuf = True
        while hasSuf:
            hasSuf = False
            for suf in sufs:
                if title.endswith(suf):
                    hasSuf = True
                    title = title[:-len(suf)].strip()
                    break
        #print(f"{k} => {title}")
        fandomStubMap[k] = title
def scrape(source: WebSource, workerId: int, url: str, triesLeft: int = 3
		) -> Web:
	if triesLeft <= 0:
		raise Exception('scrape: exceeded retry count')

	created = int(time.time() * 1000)
	r = make_req(workerId, url)

	w = Web(created_=created, url_=url, status_=r.status, sourceId_=source.id)
	w.response = r.response
	w.requestHeaders = None # str(r.request.headers).encode('utf-8')
	w.responseHeaders = None # str(r.headers).encode('utf-8')

	if w.status == 200:
		dec = enc.decode(w.response, url)
		if dec is not None and dec[0] is not None:
			w.encoding = Encoding.lookup(oil.open(), dec[0]).id

		if dec is not None and dec[1] is not None:
			title = ''
			try:
				title = extractTitle(dec[1]).strip()
			except:
				pass
			if title == 'Just a moment...' \
					or title == 'Attention Required! | Cloudflare':
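				# Cloudflare challenge pages can come back with a 200 status;
				# back off roughly ten seconds and retry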
				plog(f'scrape: got 200 status CF page, retrying: {triesLeft - 1}')
				time.sleep(9 + random.random() * 2)
				return scrape(source, workerId, url, triesLeft - 1)

	w.save(oil.open())
	return w
Example #5
def prescrape(scraper: RemoteWebScraper, url: str) -> None:
	print(url)
	w = scraper.softScrape(url)
	assert(w.url is not None)
	dec = enc.decode(w.response, w.url)
	if dec is None:
		raise Exception("unknown encoding")
	html = dec[1]
	print(f'  len: {len(html)}')
	print(html)
def main(db: 'psycopg2.connection') -> None:
    if len(sys.argv) != 2:
        raise Exception("expected wid")

    wid = int(sys.argv[1])

    some = Web.fetchIdRange(db, wid, wid + 1)
    if len(some) != 1:
        raise Exception("TODO")
    w = some[0]
    assert (w.url is not None and w.created is not None)

    if not w.url.startswith('https://www.fanfiction.net/s/'):
        raise Exception("not a ffn url")

    fid = int(w.url.split('/')[4])
    print(f"fid: {fid}")

    response = w.response
    if response is None and w.wbaseId is not None:
        wbase = WebBase.lookup(db, w.wbaseId)
        if wbase is None:
            raise Exception("has null web_base")
        response = wbase.response

    if response is None or len(response) < 1:
        print("response is null")
        return

    dec = enc.decode(response, w.url)
    if dec is None:
        raise Exception("unknown encoding")
    html = dec[1]

    code = extractFFNDeathCode(html)
    if code != 0:
        plog(f"  dead: {code}")
        c = FFNFic.bury(db, fid, code, w.created, True)
        print(c)
        #print(html)
    else:
        plog(f"  {fid} healthy?")
        print(html)
        try:
            ffnParser = FFNParser()
            ts = int(w.created / 1000)
            fic = ffnParser.get(db, fid, ts, BeautifulSoup(html, 'html5lib'))
            plog(f"{fic.__dict__}")
        except:
            plog(f"{w.url} is broken")
            #with open(f"./edump_{fid}_{cid}.html", 'w') as f:
            #	f.write(html)
            raise
def prescrapeUid(db: 'psycopg2.connection', scraper: WebScraper,
                 uid: int) -> None:
    plog(f"prescraping uid {uid}")
    url = getUrl(uid)
    w = scraper.softScrape(url)
    dec = enc.decode(w.response, url)
    if dec is None:
        plog("  {uid} has unknown encoding")
        return
    html = dec[1]
    code = extractFFNUserDeathCode(html)
    if code != 0:
        plog(f"  {uid} is freshly dead: {code}")
        FFNUser.bury(db, uid, code, w.created)
def dumpRequest(w: Web, f: IO) -> None:
    assert (w.url is not None and w.created is not None)
    #plog(f"{w.url} {len(w.response)}")
    url = w.url
    ts = int(w.created / 1000)

    dec = enc.decode(w.response, w.url)
    if dec is None:
        raise Exception("unknown encoding")
    html = dec[1]

    # try to abbreviate down to just the meta info
    realStartIdx = html.find('id=pre_story_links')
    if realStartIdx > -1:
        realStartIdx = html.rfind('<div', 0, realStartIdx)
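    # distinct negative sentinels so the endIdx > startIdx > realStartIdx check
    # below only passes when every find() above succeeded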
    startIdx = -2
    if realStartIdx > -1:
        startIdx = html.find('<div id=profile_top', realStartIdx)
    endIdx = -3
    if startIdx > -1:
        endIdx = html.find("class='lc-wrapper'", startIdx)
        if endIdx < 0:
            endIdx = html.find("id='storytextp'", startIdx)
        if endIdx > startIdx and startIdx > realStartIdx:
            html = html[realStartIdx:endIdx] + '>'

    soup = BeautifulSoup(html, 'html5lib')
    profile_top = soup.find(id='profile_top')
    if profile_top is None:
        return

    for t in ['script']:
        for e in soup.findAll(t):
            e.decompose()

    fid = url[len(urlPrefix):].split('/')[0]
    cid = url[len(urlPrefix):].split('/')[1]

    profile_top['id'] = f"profile_top_{fid}_{cid}"
    profile_top['data-fetched'] = ts
    profile_top_str = str(profile_top)

    f.write(f"<!-- start wid {w.id} -->\n".encode('utf-8'))
    #f.write(profile_top_str.encode('utf-8'))
    f.write(soup.find('body').encode_contents())
    f.write(f"<!-- {w.id} end wid -->\n".encode('utf-8'))

    return
def prescrape(scraper: WebScraper, url: str) -> None:
    print(f"url: {url}")
    w = scraper.softScrape(url)
    responseSize = len(w.response) if w.response is not None else 0
    print(f"\tresponse size: {responseSize}B")
    print(f"\trequest headers: {w.requestHeaders!r}")
    print(f"\tresponse headers: {w.responseHeaders!r}")

    dec = enc.decode(w.response, url)
    if dec is None:
        print("\tunknown encoding")
        return
    print(f"\tencoding: {dec[0]}")
    html = dec[1]
    soup = BeautifulSoup(html, 'html5lib')
    print(f"\tdecoded size: {len(html)}B")
Example #10
    def scrape(self, url: str) -> Web:
        if self.staleOnly:
            logMessage(f'staleScrape|{url}', 'scrape.log')
            wl = Web.latest(self.db, url)
            if wl is None:
                raise Exception(f'failed to stale scrape url: {url}')
            return wl

        logMessage(f'scrape|{url}', 'scrape.log')
        created = int(time.time() * 1000)
        w = Web(created_=created, url_=url, sourceId_=self.source.id)

        try:
            import requests
            global defaultRequestTimeout
            r = requests.get(url,
                             headers=self.headers,
                             cookies=self.cookies,
                             timeout=defaultRequestTimeout)
            w.status = r.status_code
            w.response = r.content
            w.requestHeaders = str(r.request.headers).encode('utf-8')
            w.responseHeaders = str(r.headers).encode('utf-8')
            w.finalUrl = r.url
        except:
            logMessage(f'scrape|exception|{url}', 'scrape.log')
            raise

        fuzz = getFuzz()
        # subtract out request time from fuzz
        fuzz -= (int(time.time() * 1000) - created) / 1000
        # TODO: delay *before* scrape based on domain
        time.sleep(max(fuzz, .1) + getFuzz(0.01, 0.1))

        self.last_ts = created

        if w.status != 200:
            w.save(self.db)
            raise Exception(f'failed to download url {w.url}: {w.status}')

        dec = enc.decode(w.response, url)
        if dec is not None and dec[0] is not None:
            w.encoding = Encoding.lookup(self.db, dec[0]).id

        w.save(self.db)
        return w
def prescrapeFid(db: 'psycopg2.connection', scraper: RemoteWebScraper,
                 fid: int, cid: int) -> None:
    plog(f"prescraping fid {fid} cid {cid}")
    code = FFNFic.isDead(db, fid)
    if code != 0:
        plog(f"  {fid} is dead: {code}")
        return
    url = getUrl(fid, cid)
    w = scraper.softScrape(url)
    dec = enc.decode(w.response, url)
    if dec is None:
        plog("  {fid}/{cid} has unknown encoding")
        return
    html = dec[1]
    code = extractFFNDeathCode(html)
    if code != 0:
        plog(f"  {fid} is freshly dead: {code}")
        FFNFic.bury(db, fid, code)
def processCategory(db: 'psycopg2.connection', scraper: RemoteWebScraper,
                    category: FFNCategory) -> None:
    assert (category.id is not None)
    url = category.getCrossoverUrl()
    print(url)
    w = scraper.softScrape(url)
    dec = enc.decode(w.response, url)
    if dec is None:
        plog("  {category.id} has unknown encoding")
        return
    html = dec[1]
    if len(html) < 1:
        plog(f"  {category.id} is freshly dead: 1")
        return

    baseUrl = 'https://www.fanfiction.net'
    soup = BeautifulSoup(html, 'html5lib')
    for a in soup.findAll('a'):
        if not a.has_attr('href'):
            continue
        href = str(a.get('href'))
        if not href.startswith('/crossovers/'):
            continue
        parts = href.split('/')
        if len(parts) != 5:
            continue

        stub = parts[2]
        fandomId = int(parts[3])
        name = str(a.text)

        if len(stub) > 254:
            continue  # TODO oh god why
            # https://www.fanfiction.net/anime/Do-You-Love-Your-Mom-and-Her-Two-Hit-Multi-Target-Attacks%3F-%E9%80%9A%E5%B8%B8%E6%94%BB%E6%92%83%E3%81%8C%E5%85%A8%E4%BD%93%E6%94%BB%E6%92%83%E3%81%A7%E4%BA%8C%E5%9B%9E%E6%94%BB%E6%92%83%E3%81%AE%E3%81%8A%E6%AF%8D%E3%81%95%E3%82%93%E3%81%AF%E5%A5%BD%E3%81%8D%E3%81%A7%E3%81%99%E3%81%8B%EF%BC%9F/?&srt=1&r=10

        print(baseUrl + href)
        print(f"{fandomId} {stub} => {name}")

        ffnFandom = FFNFandom.lookup(db, category.id, stub, remoteId=fandomId)
        print(f"{ffnFandom.remoteId} {ffnFandom.stub} => {ffnFandom.name}")

        ffnFandom.markHasCrossovers(db)
        print(ffnFandom.getAllCrossoversUrl())
Example #13
def refreshMeta(db: 'psycopg2.connection', scraper: RemoteWebScraper,
                fid: int) -> int:
    plog(f"  refreshing fid {fid} meta")

    fic = FFNFic.lookup(db, fid)
    if fic is not None and fic.chapterCount is not None:
        plog(f"    old chapterCount: {fic.chapterCount}")

    url = getUrl(fid, 1)
    w = scraper.scrape(url)

    assert (w.url is not None and w.created is not None)

    response = w.response
    if response is None and w.wbaseId is not None:
        wbase = WebBase.lookup(db, w.wbaseId)
        if wbase is None:
            raise Exception("has null web_base")
        response = wbase.response

    if response is None or len(response) < 1:
        raise Exception(f'refreshMeta: unable to find response for {fid}')

    dec = enc.decode(response, w.url)
    if dec is None:
        raise Exception("unknown encoding")
    html = dec[1]

    code = extractFFNDeathCode(html)
    if code != 0:
        plog(f"  dead: {code}")
        c = FFNFic.bury(db, fid, code, w.created, True)
        return code

    ffnParser = FFNParser()
    ts = int(w.created / 1000)
    pfic = ffnParser.get(db, fid, ts, BeautifulSoup(html, 'html5lib'))

    return 0
Example #14
def testLid(db: 'psycopg2.connection', lid: int) -> None:
    url = f'https://www.fanfiction.net/s/{lid}/1'
    scraper = RemoteWebScraper(db)
    w = scraper.softScrape(url)
    assert (w.created is not None)

    dec = enc.decode(w.response, url)
    if dec is None:
        plog("  {url} has unknown encoding")
        sys.exit(1)
    html = dec[1]
    code = extractFFNDeathCode(html)
    if code != 0:
        plog(f"  {url} is freshly dead: {code}")
        return

    soup = BeautifulSoup(html, 'html5lib')
    parser = minerva.ffn.parser.FFNParser()
    fic = parser.get(db, lid, w.created // 1000, soup)
    print(fic.__dict__)
def main(db: 'psycopg2.connection') -> int:
    if len(sys.argv) != 3:
        raise Exception("expected wid range")

    wid_s = int(sys.argv[1])
    wid_e = int(sys.argv[2])

    some = Web.fetchIdRange(db, wid_s, wid_e)
    for w in some:
        if w.response is None or len(w.response) < 1:
            continue
        assert (w.url is not None)

        dec = enc.decode(w.response, w.url)
        if dec is None:
            continue
        html = dec[1]
        with open(f"./{w.id}.html", "w") as f:
            f.write(html)

    return 0
Example #16
def getCategories(scraper: RemoteWebScraper) -> Set[str]:
    categoryBlacklist = [
        'support', 'cookies', 'privacy', 'tos', 'betareaders', 'forums',
        'communities', 'j', '', 'u', 's', 'crossovers'
    ]
    categories: Set[str] = set()

    root = scraper.softScrape(baseUrl)
    assert (root.url is not None)

    dec = enc.decode(root.response, root.url)
    assert (dec is not None)
    html = dec[1]
    soup = BeautifulSoup(html, 'html5lib')
    #print(len(html))

    for a in soup.findAll('a'):
        href = urllib.parse.urljoin(baseUrl, a.get('href'))
        href = stripAfter(href, '#')
        href = stripAfter(href, '?')
        if not href.startswith(baseUrl):
            continue

        end = href[len(baseUrl):]
        if end.find('/') < 0:
            continue

        category = end.split('/')[1]
        if category in categoryBlacklist:
            continue

        ffnCategory = FFNCategory.lookup(db, category, a.getText().strip())
        #print(f"{category}: {ffnCategory.id} {ffnCategory.name}")

        categories |= {category}

        #print(category)
        #print(f"{a.get('href')} {href}")

    return categories
def handleContent(db: 'psycopg2.connection', c: FFNFicContent
		) -> Optional[Dict[str, Any]]:
	#if fid % stripeCount != stripe: return
	id_ = f'{c.fid}/{c.cid}'
	dec = enc.decode(c.content, id_)
	if dec is None:
		raise Exception("unknown encoding")
	html = dec[1]

	try:
		# try to grab just the story content
		md = htmlToMd(html)

		return { '_id': id_, 'fid': c.fid, 'cid': c.cid, 'content': md, }
		#res = es.index(index="ffn", id=id_, body=doc)
		#print(res['result'])
	except:
		plog(f"{c.wid} is broken")
		with open(f"./edump/edump_wid_{c.wid}.html", 'w') as f:
			f.write(html)
		plog(traceback.format_exc())
	return None
Example #18
def handleStoryPage(db: 'psycopg2.connection', w: Web, stripeCount: int,
                    stripe: int) -> None:
    assert (w.url is not None and w.created is not None and w.id is not None)
    global storyUrlPrefix
    if not w.url.startswith(storyUrlPrefix):
        return

    url = w.url
    ts = int(w.created / 1000)

    fid = int(url[len(storyUrlPrefix):].split('/')[0])
    cid = int(url[len(storyUrlPrefix):].split('/')[1])

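    # work striping: only handle fids assigned to this worker's stripe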
    if fid % stripeCount != stripe:
        return

    dec = enc.decode(w.response, w.url)
    if dec is None:
        raise Exception("unknown encoding")
    html = dec[1]

    deathCode = extractFFNDeathCode(html)
    if deathCode != 0:
        #print(f"  {fid} is dead: {deathCode}")
        return

    #plog(f"{w.url} {len(w.response)}: {fid}/{cid}")

    try:
        # try to grab just the story content
        content = extractContent(html)
        FFNFicContent.upsert(db, fid, cid, w.id, content, stripe)
        #plog(f"{w.url} has content len: {len(content)}")
    except:
        plog(f"{w.url} is broken")
        with open(f"./edump_{fid}_{cid}.html", 'w') as f:
            f.write(html)
        plog(traceback.format_exc())
        raise
Example #19
def prescrape(scraper: WebScraper, wq: WebQueue) -> Optional[Web]:
    assert (wq.url is not None)
    print(f"url: {wq.url}")
    w = scraper.softScrape(wq.url)
    assert (w.created is not None)
    #print(f"  {w.created} {wq.musty}")
    if wq.musty is not None and w.created < wq.musty:
        print(f"  musty, rescraping")
        w = scraper.scrape(wq.url)
    assert (w.url is not None and w.response is not None)
    print(f"\tresponse size: {len(w.response)}B")
    #print(f"\trequest headers: {w.requestHeaders}")
    #print(f"\tresponse headers: {w.responseHeaders}")

    dec = enc.decode(w.response, w.url)
    if dec is None:
        print("\tunknown encoding")
        return None
    print(f"\tencoding: {dec[0]}")
    html = dec[1]
    print(f"\tdecoded size: {len(html)}B")
    return w
Example #20
def main(db: 'psycopg2.connection') -> int:
    if len(sys.argv) != 2:
        raise Exception("expected wid")

    wid = int(sys.argv[1])

    some = Web.fetchIdRange(db, wid, wid + 1)
    if len(some) != 1:
        raise Exception("TODO")

    w = some[0]
    if w.response is None or len(w.response) < 1:
        return 0
    assert (w.url is not None)

    dec = enc.decode(w.response, w.url)
    if dec is None:
        raise Exception("unknown encoding")
    html = dec[1]
    print(html)

    return 0
def main(db: 'psycopg2.connection') -> int:
    if len(sys.argv) != 3:
        raise Exception("expected wid range")

    wid_s = int(sys.argv[1])
    wid_e = int(sys.argv[2])

    maxId = Web.maxId(db)

    if wid_s > maxId:
        return 0
    # note whether the requested range runs past the newest id before clamping,
    # so the export can be flagged as partial
    partial = wid_e > maxId
    wid_e = min(wid_e, maxId)

    wid_s_s = str(wid_s).zfill(10)
    wid_e_s = str(wid_e).zfill(10)

    xzfname = f"data_{wid_s_s}_{wid_e_s}.tar.xz"
    if partial:
        xzfname = f"data_{wid_s_s}_{wid_e_s}_partial.tar.xz"

    mfname = f"./manifest_{wid_s_s}_{wid_e_s}.tsv"
    if partial:
        mfname = f"./manifest_{wid_s_s}_{wid_e_s}_partial.tsv"

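    # url filter (a LIKE-style pattern) so only fanfiction.net responses are exported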
    ffnLike = 'https://www.fanfiction.net/%'

    with tarfile.open(xzfname, 'w:xz') as xzf:
        # compute manifest
        manifest_s = 'id\ttimestamp\turl\tlength\tmd5\n'
        for w in Web.fetchIdRange_g(db,
                                    wid_s,
                                    wid_e,
                                    ulike=ffnLike,
                                    status=200):
            if w.response is None or len(w.response) < 1:
                continue
            assert (w.url is not None and w.created is not None)

            dec = enc.decode(w.response, w.url)
            if dec is None:
                continue
            html = dec[1]

            ts = int(w.created / 1000)
            html = f"<!--\t{ts}\t{w.url}\t-->\n" + html

            h = hashlib.md5(html.encode('utf-8')).hexdigest()
            l = len(html.encode('utf-8'))
            manifest_s += f"{w.id}\t{ts}\t{w.url}\t{l}\t{h}\n"

        # write raw manifest
        with open(mfname, "w") as mf:
            mf.write(manifest_s)

        # save manifest to txz
        s = io.BytesIO(manifest_s.encode('utf-8'))
        ti = tarfile.TarInfo(name=mfname)
        ti.size = len(manifest_s.encode('utf-8'))
        xzf.addfile(tarinfo=ti, fileobj=s)

        # save individual requests to txz
        for w in Web.fetchIdRange_g(db,
                                    wid_s,
                                    wid_e,
                                    ulike=ffnLike,
                                    status=200):
            if w.response is None or len(w.response) < 1:
                continue
            assert (w.url is not None and w.created is not None)

            dec = enc.decode(w.response, w.url)
            if dec is None:
                continue
            html = dec[1]

            ts = int(w.created / 1000)
            html = f"<!--\t{ts}\t{w.url}\t-->\n" + html

            s = io.BytesIO(html.encode('utf-8'))
            ti = tarfile.TarInfo(name=f"./{w.id}.html")
            ti.mtime = int(w.created // 1000)
            ti.size = len(html.encode('utf-8'))
            xzf.addfile(tarinfo=ti, fileobj=s)

    return 0
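
# A minimal sketch of reading the manifest back out of an archive produced by
# the export above; the archive name below is hypothetical.
import tarfile

def readManifest(path: str) -> None:
    with tarfile.open(path, 'r:xz') as xzf:
        member = next(m for m in xzf.getmembers() if m.name.endswith('.tsv'))
        fobj = xzf.extractfile(member)
        assert fobj is not None
        manifest = fobj.read().decode('utf-8')
        # skip the 'id\ttimestamp\turl\tlength\tmd5' header row
        for line in manifest.splitlines()[1:]:
            wid, ts, url, length, md5 = line.split('\t')
            print(wid, url, length)

#readManifest('data_0000000001_0000001000_partial.tar.xz')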
Example #22
def prescrapeFandom(db: 'psycopg2.connection',
                    scraper: RemoteWebScraper,
                    fandom: FFNFandom,
                    scrollDate: int,
                    recentThreshold: int,
                    crossover: bool = False) -> None:
    # TODO fandom graveyard? look at community scraper
    assert (fandom.id is not None)

    plog(
        f"prescraping fandom {fandom.id} {fandom.stub}, crossover: {crossover}"
    )
    lastCompleted = FFNFandomDeltaResult.lastCompleted(db,
                                                       fandom.id,
                                                       crossover=crossover)
    if lastCompleted is not None and lastCompleted > recentThreshold:
        plog(f"  completed recently: {lastCompleted} > {recentThreshold}")
        return

    deltaResult = FFNFandomDeltaResult.create(db,
                                              fandom.id,
                                              crossover=crossover)

    page = 1
    pages = 1
    fanMinTs = None
    fanMaxTs = None
    while page <= pages:
        if pages > 1:
            plog(f"  grabbing page {page}/{pages}")
        url = fandom.getUrl(db, page) if not crossover \
          else fandom.getAllCrossoversUrl(page)
        w = scraper.softScrape(url)
        dec = enc.decode(w.response, url)
        if dec is None:
            plog("  {fandom.id} has unknown encoding")
            return
        html = dec[1]
        if len(html) < 1:
            plog(f"  {fandom.id} is freshly dead: 1")
            #minerva.buryCommunity(comm.id, 1, w.created)
            return

        soup = BeautifulSoup(html, 'html5lib')

        pages = getPageCount(db, fandom, soup, crossover)
        page += 1

        ficTs = getFicTimestamps(soup)
        if len(ficTs) == 0:
            break

        minTs = getMinFicTs(ficTs)
        maxTs = getMaxFicTs(ficTs)

        if fanMinTs is None:
            fanMinTs = minTs
        if fanMaxTs is None:
            fanMaxTs = maxTs
        if minTs is not None:
            assert (fanMinTs is not None)
            fanMinTs = min(fanMinTs, minTs)
        if maxTs is not None:
            assert (fanMaxTs is not None)
            fanMaxTs = max(fanMaxTs, maxTs)

        deltaResult.update(db, page - 1, pages, fanMinTs, fanMaxTs)

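        # assuming the listing is sorted newest-first (FFN's default), once the
        # newest fic on a page is at or before scrollDate, later pages hold nothing newer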
        if maxTs is not None and maxTs <= scrollDate:
            break

    deltaResult.finish(db, page - 1, pages, fanMinTs, fanMaxTs)
Example #23
categories = getCategories(scraper)
#print(categories)

fandomNameMap: Dict[str, Optional[int]] = {}
fandomIdMap: Dict[int, str] = {}
fandomStubMap: Dict[str, str] = {}

for category in categories:
    url = f"{baseUrl}/{category}/"
    w = scraper.softScrape(url)
    assert (w.url is not None)

    dec = enc.decode(w.response, w.url)
    assert (dec is not None)
    html = dec[1]
    soup = BeautifulSoup(html, 'html5lib')
    for a in soup.findAll('a'):
        href = urllib.parse.urljoin(w.url, a.get('href'))
        href = stripAfter(href, '#')
        href = stripAfter(href, '?')
        if not href.startswith(url):
            continue
        if href == url:
            continue
        fandomName = stripAfter(href[len(url):], '/')
        fandomName = urllib.parse.unquote(fandomName)
        fandomName = f"{category}/{fandomName}"