Ejemplo n.º 1
0
def main(db: 'psycopg2.connection') -> None:
    """Re-examine one stored web row for a fanfiction.net story page.

    Reads a single integer web id (wid) from ``sys.argv[1]``, loads the
    matching row through ``Web.fetchIdRange``, decodes its stored response
    and then either buries the fic (non-zero death code) or feeds the page
    to ``FFNParser`` and logs the parsed fic.

    Args:
        db: Database connection passed through to the lookup, bury and
            parse helpers.

    Raises:
        Exception: If the argument count is wrong, the wid does not resolve
            to exactly one row, the url is not a fanfiction.net story url,
            the referenced web_base row is missing, or the response
            encoding is unknown. Parser failures are logged and re-raised.
    """
    if len(sys.argv) != 2:
        raise Exception("expected wid")

    wid = int(sys.argv[1])

    # Exactly one row is expected back for the requested id.
    some = Web.fetchIdRange(db, wid, wid + 1)
    if len(some) != 1:
        raise Exception(
            f"expected exactly one web row for wid {wid}, found {len(some)}")
    w = some[0]
    assert (w.url is not None and w.created is not None)

    if not w.url.startswith('https://www.fanfiction.net/s/'):
        raise Exception("not a ffn url")

    # 'https://www.fanfiction.net/s/<fid>/...' -> the fid is the 5th
    # '/'-separated component.
    fid = int(w.url.split('/')[4])
    print(f"fid: {fid}")

    # The row may carry no response of its own; fall back to the response
    # stored on the web_base row it references.
    response = w.response
    if response is None and w.wbaseId is not None:
        wbase = WebBase.lookup(db, w.wbaseId)
        if wbase is None:
            raise Exception("has null web_base")
        response = wbase.response

    if response is None or len(response) < 1:
        print("response is null")
        return

    dec = enc.decode(response, w.url)
    if dec is None:
        raise Exception("unknown encoding")
    html = dec[1]

    code = extractFFNDeathCode(html)
    if code != 0:
        plog(f"  dead: {code}")
        c = FFNFic.bury(db, fid, code, w.created, True)
        print(c)
    else:
        plog(f"  {fid} healthy?")
        print(html)
        try:
            ffnParser = FFNParser()
            # w.created is divided by 1000 before being handed to the parser
            # (presumably milliseconds -> seconds; confirm against Web).
            ts = int(w.created / 1000)
            fic = ffnParser.get(db, fid, ts, BeautifulSoup(html, 'html5lib'))
            plog(f"{fic.__dict__}")
        except Exception:
            # Log which page failed, then let the original error propagate.
            plog(f"{w.url} is broken")
            raise
Ejemplo n.º 2
0
def prescrapeFid(db: 'psycopg2.connection', scraper: RemoteWebScraper,
                 fid: int, cid: int) -> None:
    """Scrape one chapter page of a fic and bury the fic if the page is dead.

    Nothing is scraped when ``FFNFic.isDead`` already reports a non-zero
    code for the fic. Otherwise the chapter url is soft-scraped, decoded and
    checked with ``extractFFNDeathCode``; a non-zero code there is recorded
    through ``FFNFic.bury``.

    Args:
        db: Database connection passed to the ``FFNFic`` helpers.
        scraper: Scraper used to fetch the chapter page.
        fid: Fic id.
        cid: Chapter id.
    """
    plog(f"prescraping fid {fid} cid {cid}")
    code = FFNFic.isDead(db, fid)
    if code != 0:
        plog(f"  {fid} is dead: {code}")
        return
    url = getUrl(fid, cid)
    w = scraper.softScrape(url)
    dec = enc.decode(w.response, url)
    if dec is None:
        # Must be an f-string so fid/cid are interpolated into the log line.
        plog(f"  {fid}/{cid} has unknown encoding")
        return
    html = dec[1]
    code = extractFFNDeathCode(html)
    if code != 0:
        plog(f"  {fid} is freshly dead: {code}")
        FFNFic.bury(db, fid, code)
Ejemplo n.º 3
0
def refreshMeta(db: 'psycopg2.connection', scraper: RemoteWebScraper,
                fid: int) -> int:
    """Re-scrape chapter 1 of a fic and re-run the metadata parser on it.

    Args:
        db: Database connection passed to the lookup, bury and parse helpers.
        scraper: Scraper used to fetch the first chapter page.
        fid: Fic id.

    Returns:
        The non-zero death code if the page shows the fic is dead (the fic
        is buried in that case), otherwise 0 after the parser has run.

    Raises:
        Exception: If the referenced web_base row is missing, no response
            body can be found, or the response encoding is unknown. Parser
            errors propagate unchanged.
    """
    plog(f"  refreshing fid {fid} meta")

    fic = FFNFic.lookup(db, fid)
    if fic is not None and fic.chapterCount is not None:
        plog(f"    old chapterCount: {fic.chapterCount}")

    url = getUrl(fid, 1)
    w = scraper.scrape(url)

    assert (w.url is not None and w.created is not None)

    # The row may carry no response of its own; fall back to the response
    # stored on the web_base row it references.
    response = w.response
    if response is None and w.wbaseId is not None:
        wbase = WebBase.lookup(db, w.wbaseId)
        if wbase is None:
            raise Exception("has null web_base")
        response = wbase.response

    if response is None or len(response) < 1:
        raise Exception(f'refreshMeta: unable to find response for {fid}')

    dec = enc.decode(response, w.url)
    if dec is None:
        raise Exception("unknown encoding")
    html = dec[1]

    code = extractFFNDeathCode(html)
    if code != 0:
        plog(f"  dead: {code}")
        FFNFic.bury(db, fid, code, w.created, True)
        return code

    # The parse result is not used here; the call is kept for whatever the
    # parser does with db (presumably persisting the refreshed metadata --
    # confirm against FFNParser.get).
    ffnParser = FFNParser()
    ts = int(w.created / 1000)
    ffnParser.get(db, fid, ts, BeautifulSoup(html, 'html5lib'))

    return 0
Ejemplo n.º 4
0
def testLid(db: 'psycopg2.connection', lid: int) -> None:
    """Scrape chapter 1 of one fanfiction.net story, parse it and print the result.

    Exits the process with status 1 when the response encoding is unknown,
    and returns without parsing when the page carries a non-zero death code.

    Args:
        db: Database connection passed to the scraper and the parser.
        lid: Story id used to build the fanfiction.net url.
    """
    url = f'https://www.fanfiction.net/s/{lid}/1'
    scraper = RemoteWebScraper(db)
    w = scraper.softScrape(url)
    assert (w.created is not None)

    dec = enc.decode(w.response, url)
    if dec is None:
        # Must be an f-string so the url is interpolated into the log line.
        plog(f"  {url} has unknown encoding")
        sys.exit(1)
    html = dec[1]
    code = extractFFNDeathCode(html)
    if code != 0:
        plog(f"  {url} is freshly dead: {code}")
        return

    soup = BeautifulSoup(html, 'html5lib')
    parser = minerva.ffn.parser.FFNParser()
    # w.created is integer-divided by 1000 before being handed to the parser
    # (presumably milliseconds -> seconds; confirm against Web).
    fic = parser.get(db, lid, w.created // 1000, soup)
    print(fic.__dict__)
Ejemplo n.º 5
0
def handleStoryPage(db: 'psycopg2.connection', w: Web, stripeCount: int,
                    stripe: int) -> None:
    """Extract and store the chapter content of one scraped story page.

    The page is ignored when its url does not start with ``storyUrlPrefix``,
    when its fic id is outside the requested stripe
    (``fid % stripeCount != stripe``), or when its html carries a non-zero
    death code.

    Args:
        db: Database connection passed to ``FFNFicContent.upsert``.
        w: Scraped page; ``url``, ``created`` and ``id`` must be set.
        stripeCount: Modulus used to partition fic ids.
        stripe: Remainder selecting which fic ids this call handles.

    Raises:
        Exception: If the response encoding is unknown. Any error raised
            while extracting or upserting the content is logged, the html
            is dumped to ``./edump_{fid}_{cid}.html`` and the error is
            re-raised.
    """
    assert (w.url is not None and w.created is not None and w.id is not None)
    if not w.url.startswith(storyUrlPrefix):
        return

    # The url tail after the prefix is split on '/' and its first two
    # components are read as fic id and chapter id.
    parts = w.url[len(storyUrlPrefix):].split('/')
    fid = int(parts[0])
    cid = int(parts[1])

    if fid % stripeCount != stripe:
        return

    dec = enc.decode(w.response, w.url)
    if dec is None:
        raise Exception("unknown encoding")
    html = dec[1]

    deathCode = extractFFNDeathCode(html)
    if deathCode != 0:
        return

    try:
        # try to grab just the story content
        content = extractContent(html)
        FFNFicContent.upsert(db, fid, cid, w.id, content, stripe)
    except Exception:
        plog(f"{w.url} is broken")
        # Explicit utf-8 so a non-ASCII page cannot make the dump itself
        # fail and hide the error being reported.
        with open(f"./edump_{fid}_{cid}.html", 'w', encoding='utf-8') as f:
            f.write(html)
        plog(traceback.format_exc())
        raise