def scanRedirects(ctx):
    """Store the redirect of the current page when it resolves to a page ID.

    Reads the current page's ID and redirect title from ``ctx.extractor``.
    When a redirect title is present, its spaces are replaced with
    underscores and the result is looked up through
    ``dbreader.pageIDForPageTitle``. If that lookup yields an ID, the pair
    (current page ID, redirect target ID) is written with
    ``dbwriter.storeRedirectForPage`` using ``doCommit=True``.

    Nothing is written when the page has no redirect title or when the
    title lookup returns ``None``.
    """
    source = ctx.extractor
    connection = ctx.dbconn

    # The page ID is read before the redirect title, whether or not a
    # redirect turns out to exist.
    current_id = source.pageIDForCurrentPage()
    target_title = source.redirectTitleForCurrentPage()
    if target_title is None:
        return

    # Titles are looked up in their underscore form.
    normalized_title = target_title.replace(" ", "_")
    target_id = dbreader.pageIDForPageTitle(connection, normalized_title)
    if target_id is None:
        # The redirect target is not a known page; skip it.
        return

    dbwriter.storeRedirectForPage(
        connection, current_id, target_id, doCommit=True
    )
def scanLinks(ctx):
    """Store the outgoing internal links of the current non-redirect page.

    Pages for which ``ctx.extractor`` reports a redirect title are skipped
    entirely. For any other page, every canonical link title has its spaces
    replaced with underscores and is resolved to a page ID through
    ``dbreader.pageIDForPageTitle`` (with ``doCache=True``). Titles whose
    lookup returns ``None`` are dropped, duplicates are collapsed, and the
    remaining IDs are handed to ``dbwriter.storeInternalLinksForPage``.

    The store call is made even when no link resolves, in which case it
    receives an empty list.
    """
    source = ctx.extractor
    connection = ctx.dbconn

    # Redirect pages are not scanned for links.
    if source.redirectTitleForCurrentPage() is not None:
        return

    current_id = source.pageIDForCurrentPage()

    # Step 1: resolve each outgoing link title to a page ID, keeping only
    # the titles that resolve and collapsing duplicate targets.
    resolved_ids = set()
    for title in source.canonicalLinksForCurrentPage():
        target_id = dbreader.pageIDForPageTitle(
            connection, title.replace(" ", "_"), doCache=True
        )
        if target_id is not None:
            resolved_ids.add(target_id)

    # Step 2: record the de-duplicated outgoing links for this page.
    dbwriter.storeInternalLinksForPage(
        connection, current_id, list(resolved_ids)
    )