Beispiel #1
0
def scanRedirects(ctx):
    """Store the redirect of the current page, if it has a resolvable one.

    Reads the current page's ID and redirect title from ``ctx.extractor``.
    When a redirect title exists, its spaces are replaced by underscores and
    the result is looked up through ``dbreader.pageIDForPageTitle``. If that
    lookup yields an ID, the (page ID, redirect ID) pair is written with
    ``dbwriter.storeRedirectForPage`` using ``doCommit=True``.

    Nothing is written when the page has no redirect title or when the
    lookup returns ``None``. The function always returns ``None``.
    """
    source = ctx.extractor
    conn = ctx.dbconn
    current_id = source.pageIDForCurrentPage()
    target_title = source.redirectTitleForCurrentPage()

    # No redirect title on this page: nothing to store.
    if target_title is None:
        return

    normalized_title = target_title.replace(" ", "_")
    target_id = dbreader.pageIDForPageTitle(conn, normalized_title)

    # The lookup found no page ID for the redirect title: skip the write.
    if target_id is None:
        return

    dbwriter.storeRedirectForPage(conn, current_id, target_id, doCommit=True)
Beispiel #2
0
def scanLinks(ctx):
    """Store the outgoing internal links of the current page.

    Pages for which ``ctx.extractor`` reports a redirect title are skipped
    entirely. For any other page, every title returned by
    ``canonicalLinksForCurrentPage()`` has its spaces replaced by underscores
    and is looked up through ``dbreader.pageIDForPageTitle`` with
    ``doCache=True``. Lookups that return ``None`` are dropped, duplicates
    are removed, and the remaining IDs are passed as a list to
    ``dbwriter.storeInternalLinksForPage`` together with the page's own ID.

    The function always returns ``None``.
    """
    source = ctx.extractor
    conn = ctx.dbconn

    # A page with a redirect title is not processed here.
    if source.redirectTitleForCurrentPage() is not None:
        return

    current_id = source.pageIDForCurrentPage()

    # 1. Look up the page ID of every outgoing link, in link order.
    looked_up = [
        dbreader.pageIDForPageTitle(conn, title.replace(" ", "_"), doCache=True)
        for title in source.canonicalLinksForCurrentPage()
    ]
    # Drop failed lookups (None) and collapse duplicate targets.
    unique_ids = list({found for found in looked_up if found is not None})

    # 2. Write the outgoing links for this page to the outgoing links table.
    dbwriter.storeInternalLinksForPage(conn, current_id, unique_ids)