Ejemplo n.º 1
0
    def get_rctitles(self, since):
        """
        Collect page titles mentioned by relevant recent changes.

        The ``recentchanges`` list is queried for ``new`` and ``log`` entries
        starting at ``since`` (oldest first). A title is kept when the entry is
        a page creation, or a ``protect`` log entry whose ``pageid`` is 0.

        :param since: start of the requested timespan; passed as ``rcstart``
            and compared with the oldest available recent change
        :returns: a set of titles
        :raises ShortRecentChangesError: if the oldest recent change is newer
            than ``since``
        """
        # Entries of the recentchanges table are purged periodically, see
        # http://www.mediawiki.org/wiki/Manual:$wgRCMaxAge (90 days by
        # default). A request reaching further back cannot be answered
        # completely, so it is reported instead of returning partial data.
        # NOTE(review): if oldest_rc_timestamp can return None, this
        # comparison raises TypeError -- confirm against the helper.
        oldest_available = selects.oldest_rc_timestamp(self.db)
        if oldest_available > since:
            raise ShortRecentChangesError()

        query = {
            "list": "recentchanges",
            "rctype": {"new", "log"},
            "rcprop": {"ids", "title", "loginfo"},
            "rcdir": "newer",
            "rcstart": since,
        }

        def is_relevant(entry):
            kind = entry["type"]
            if kind == "new":
                return True
            # note that pageid in recentchanges corresponds to log_page
            return (kind == "log"
                    and entry["logtype"] == "protect"
                    and entry["pageid"] == 0)

        return {
            entry["title"]
            for entry in self.db.query(query)
            if is_relevant(entry)
        }
Ejemplo n.º 2
0
    def get_rctitles(self, since):
        """
        Return the titles touched by page creations and by ``protect`` log
        entries with ``pageid`` 0, as listed in recent changes.

        :param since: start of the requested timespan; passed as ``rcstart``
            and compared with the oldest available recent change
        :returns: a set of titles
        :raises ShortRecentChangesError: if the oldest recent change is newer
            than ``since``
        """
        # The recentchanges table only keeps a limited history (see
        # http://www.mediawiki.org/wiki/Manual:$wgRCMaxAge , 90 days by
        # default). When the caller asks for more than is available, the
        # missing changes must be signalled rather than silently ignored.
        # NOTE(review): a None result from oldest_rc_timestamp would make
        # this comparison raise TypeError -- confirm it cannot happen here.
        if selects.oldest_rc_timestamp(self.db) > since:
            raise ShortRecentChangesError()

        changes = self.db.query({
            "list": "recentchanges",
            "rctype": {"new", "log"},
            "rcprop": {"ids", "title", "loginfo"},
            "rcdir": "newer",
            "rcstart": since,
        })

        titles = set()
        for entry in changes:
            entry_type = entry["type"]
            if entry_type == "new":
                titles.add(entry["title"])
                continue
            if entry_type != "log":
                continue
            # note that pageid in recentchanges corresponds to log_page
            if entry["logtype"] == "protect" and entry["pageid"] == 0:
                titles.add(entry["title"])
        return titles
Ejemplo n.º 3
0
    def get_rcusers(self, since):
        """
        Find users whose properties may have changed since the last update.

        Every performer of an edit, page creation or log entry is included.
        For ``newusers`` and ``rights`` log entries the user named in the
        entry's title (the part after the first colon) is included as well.

        :param datetime.datetime since: timestamp of the last update
        :returns: a set of user names
        :raises ShortRecentChangesError: if the oldest recent change is newer
            than ``since``
        """
        # The recentchanges table only keeps a limited history (see
        # http://www.mediawiki.org/wiki/Manual:$wgRCMaxAge , 90 days by
        # default). When the caller asks for more than is available, the
        # missing changes must be signalled rather than silently ignored.
        # NOTE(review): a None result from oldest_rc_timestamp would make
        # this comparison raise TypeError -- confirm it cannot happen here.
        if selects.oldest_rc_timestamp(self.db) > since:
            raise ShortRecentChangesError()

        query = {
            "list": "recentchanges",
            "rctype": {"edit", "new", "log"},
            "rcprop": {"user", "title", "loginfo"},
            "rcdir": "newer",
            "rcstart": since,
        }
        # log types handled here whose target is a user other than the
        # performer (e.g. when user A creates an account for user B, the
        # recent change lists only user A as the performer)
        target_logtypes = ("newusers", "rights")

        users = set()
        for entry in self.db.query(query):
            # the performer of the edit, newpage or log entry
            users.add(entry["user"])

            if entry["type"] != "log":
                continue
            if entry["logtype"] not in target_logtypes:
                continue
            # the target user name follows the namespace prefix in the title
            users.add(entry["title"].split(":", maxsplit=1)[1])

        return users
Ejemplo n.º 4
0
    def gen_update(self, since):
        """
        Generate the SQL statements that bring the stored pages up to date.

        Statements are yielded in this order: early deletions, page renames
        (in the order given by :py:meth:`get_logpages`), per-page deletes and
        inserts built from fresh API data, an optional full
        :py:meth:`gen_insert` pass, and finally the cleanup of recent changes
        whose pages were deleted.

        :param since: timestamp of the last update; passed to
            :py:meth:`get_logpages` and :py:meth:`get_rcpages` and compared
            with the oldest timestamp in the recentchanges table
        :returns: a generator. Items yielded directly by this method are
            ``(statement, parameters)`` tuples, except the final cleanup
            statement which is yielded bare; items produced by the helper
            generators are passed through unchanged.
        """
        # Items in the recentchanges table are periodically purged according to
        # http://www.mediawiki.org/wiki/Manual:$wgRCMaxAge
        # By default the max age is 90 days, so recentchanges may not cover
        # the whole requested timespan (or may be empty altogether).
        rc_oldest = selects.oldest_rc_timestamp(self.db)
        rc_too_short = rc_oldest is None or rc_oldest > since

        # some log events such as suppress/delete are not recorded in the
        # recentchanges table, fetching from logging is bulletproof
        delete_early, moved, pages = self.get_logpages(since)
        if not rc_too_short:
            # recentchanges covers the timespan: merge in its pages as well
            pages |= self.get_rcpages(since)[2]

        # Position of every page ID in ``pages``; used below to restore the
        # ordering of API results with a constant-time lookup per page.
        position = {
            pageid: index for index, pageid in enumerate(pages.keys())
        }

        # Always delete beforehand, otherwise inserts might violate the
        # page_namespace_title unique constraint (for example when an automatic
        # or manual move-over-redirect has been made).
        for pageid in delete_early:
            # move tags first
            yield self.sql["move", "tagged_revision"], {"b_rev_page": pageid}
            # move relevant revisions from the revision table into archive
            yield self.sql["move", "revision"], {"b_rev_page": pageid}
            # deleted page - this will cause cascade deletion in
            # page_props and page_restrictions tables
            yield self.sql["delete", "page"], {"b_page_id": pageid}

        # Update all moved page titles beforehand, exactly in the order they
        # happened on the wiki, otherwise inserts might violate the
        # page_namespace_title unique constraint (for example when a page has
        # been moved multiple times since the last sync).
        for pageid, move_params in moved:
            title = self.db.Title(move_params["target_title"])
            yield self.sql["update", "page_name"], {
                "b_page_id": pageid,
                "b_new_namespace": move_params["target_ns"],
                "b_new_title": title.dbtitle(move_params["target_ns"]),
            }

        if pages:
            for chunk in ws.utils.iter_chunks(pages,
                                              self.api.max_ids_per_query):
                query_params = {
                    "action": "query",
                    "pageids": "|".join(str(pageid) for pageid in chunk),
                    "prop": "info|pageprops",
                    "inprop": "protection",
                }
                fetched = list(
                    self.api.call_api(query_params)["pages"].values())

                # ordering of SQL inserts is important for moved pages, but
                # MediaWiki does not return ordered results for the pageids=
                # parameter
                fetched.sort(key=lambda page: position[page["pageid"]])

                for page in fetched:
                    # deletes first, otherwise edit + move over redirect would fail
                    yield from self.gen_deletes_from_page(page)
                    yield from self.gen_inserts_from_page(page)

        # get_logpages does not include normal edits, so we need to go through
        # list=allpages again
        if rc_too_short:
            yield from self.gen_insert()

        # delete recent changes whose pages were deleted
        yield self.sql["delete", "deleted_recentchanges"]
Ejemplo n.º 5
0
    def gen_update(self, since):
        """
        Generate the SQL statements that bring the stored pages up to date.

        Statements are yielded in this order: early deletions, page renames
        (in the order given by the page source), per-page deletes and inserts
        built from fresh API data, an optional full :py:meth:`gen_insert`
        pass, and finally the cleanup of recent changes whose pages were
        deleted.

        :param since: timestamp of the last update; passed to
            :py:meth:`get_logpages` or :py:meth:`get_rcpages` and compared
            with the oldest timestamp in the recentchanges table
        :returns: a generator. Items yielded directly by this method are
            ``(statement, parameters)`` tuples, except the final cleanup
            statement which is yielded bare; items produced by the helper
            generators are passed through unchanged.
        """
        # Items in the recentchanges table are periodically purged according to
        # http://www.mediawiki.org/wiki/Manual:$wgRCMaxAge
        # By default the max age is 90 days: if a larger timespan is requested
        # here, we need to look into the logging table instead of recentchanges.
        rc_oldest = selects.oldest_rc_timestamp(self.db)
        rc_too_short = rc_oldest is None or rc_oldest > since
        if rc_too_short:
            delete_early, moved, pages = self.get_logpages(since)
        else:
            delete_early, moved, pages = self.get_rcpages(since)

        # Position of every page ID in ``pages``; used below to restore the
        # ordering of API results with a constant-time lookup per page.
        position = {
            pageid: index for index, pageid in enumerate(pages.keys())
        }

        # Always delete beforehand, otherwise inserts might violate the
        # page_namespace_title unique constraint (for example when an automatic
        # or manual move-over-redirect has been made).
        for pageid in delete_early:
            # move tags first
            yield self.sql["move", "tagged_revision"], {"b_rev_page": pageid}
            # move relevant revisions from the revision table into archive
            yield self.sql["move", "revision"], {"b_rev_page": pageid}
            # deleted page - this will cause cascade deletion in
            # page_props and page_restrictions tables
            yield self.sql["delete", "page"], {"b_page_id": pageid}

        # Update all moved page titles beforehand, exactly in the order they
        # happened on the wiki, otherwise inserts might violate the
        # page_namespace_title unique constraint (for example when a page has
        # been moved multiple times since the last sync).
        for pageid, move_params in moved:
            title = self.db.Title(move_params["target_title"])
            yield self.sql["update", "page_name"], {
                "b_page_id": pageid,
                "b_new_namespace": move_params["target_ns"],
                "b_new_title": title.dbtitle(move_params["target_ns"]),
            }

        if pages:
            for chunk in ws.utils.iter_chunks(pages, self.api.max_ids_per_query):
                query_params = {
                    "action": "query",
                    "pageids": "|".join(str(pageid) for pageid in chunk),
                    "prop": "info|pageprops",
                    "inprop": "protection",
                }
                fetched = list(self.api.call_api(query_params)["pages"].values())

                # ordering of SQL inserts is important for moved pages, but MediaWiki does
                # not return ordered results for the pageids= parameter
                fetched.sort(key=lambda page: position[page["pageid"]])

                for page in fetched:
                    # deletes first, otherwise edit + move over redirect would fail
                    yield from self.gen_deletes_from_page(page)
                    yield from self.gen_inserts_from_page(page)

        # get_logpages does not include normal edits, so we need to go through list=allpages again
        if rc_too_short:
            yield from self.gen_insert()

        # delete recent changes whose pages were deleted
        yield self.sql["delete", "deleted_recentchanges"]
Ejemplo n.º 6
0
    def gen_update(self, since):
        """
        Generate the SQL statements that bring the stored users up to date.

        Recent changes since ``since`` are scanned to collect the users that
        may have changed and the user renames that happened. Rename statements
        are yielded first, then the inserts and deletes for every collected
        user (built from fresh ``list=users`` API data), and finally a
        statement deleting expired group memberships.

        :param since: timestamp of the last update; passed as ``rcstart`` and
            compared with the oldest timestamp in the recentchanges table
        :returns: a generator of statements; rename items are
            ``(statement, parameters)`` tuples, the final item is a bare
            delete statement, and items produced by the helper generators are
            passed through unchanged
        :raises ShortRecentChangesError: if the oldest recent change is newer
            than ``since`` (raised when the generator is first advanced)
        """
        # Items in the recentchanges table are periodically purged according to
        # http://www.mediawiki.org/wiki/Manual:$wgRCMaxAge
        # By default the max age is 90 days: if a larger timespan is requested
        # here, it's very important to warn that the changes are not available
        # NOTE(review): a None result from oldest_rc_timestamp would make
        # this comparison raise TypeError -- confirm it cannot happen here.
        if selects.oldest_rc_timestamp(self.db) > since:
            raise ShortRecentChangesError()

        # users whose properties may have changed since the last update
        rcusers = set()
        # mapping of renamed users
        # (we rely on dict keeping the insertion order, this is a Python 3.7
        # feature: https://stackoverflow.com/a/39980744 )
        renamed_users = {}

        rc_params = {
            "list": "recentchanges",
            "rctype": {"edit", "new", "log"},
            "rcprop": {"user", "title", "loginfo"},
            "rcdir": "newer",
            "rcstart": since,
        }
        for change in self.db.query(rc_params):
            # add the performer of the edit, newpage or log entry
            rcusers.add(change["user"])

            # Only log entries can affect a user other than the performer.
            # Three log types are handled:
            #  - newusers (if user A creates account for user B, recent changes
            #    list only user A)
            #  - rights
            #  - renameuser
            if change["type"] != "log":
                continue
            logtype = change["logtype"]
            if logtype in ("newusers", "rights"):
                # the target user name follows the namespace prefix
                username = change["title"].split(":", maxsplit=1)[1]
                rcusers.add(username)
            elif logtype == "renameuser":
                olduser = change["logparams"]["olduser"]
                newuser = change["logparams"]["newuser"]
                renamed_users[olduser] = newuser

        # rename before handling rcusers, so that the data fetched below is
        # looked up under the current user names
        for olduser, newuser in renamed_users.items():
            yield self.sql["update", "user"], {
                "b_olduser": olduser,
                "user_name": newuser
            }
            rcusers.discard(olduser)
            rcusers.add(newuser)

        if rcusers:
            for chunk in ws.utils.iter_chunks(rcusers,
                                              self.api.max_ids_per_query):
                list_params = {
                    "list": "users",
                    "ususers": "|".join(chunk),
                    # "groups" is needed just to catch autoconfirmed
                    "usprop": "groups|groupmemberships|editcount|registration",
                }
                for user in self.api.list(list_params):
                    yield from self.gen_inserts_from_user(user)
                    yield from self.gen_deletes_from_user(user)

        # delete expired group memberships
        # (naive UTC timestamp, same value as the deprecated utcnow())
        now_utc = datetime.datetime.now(
            datetime.timezone.utc).replace(tzinfo=None)
        yield self.db.user_groups.delete().where(
            self.db.user_groups.c.ug_expiry < now_utc)