Ejemplo n.º 1
0
def process_args(args):
    """Turn the raw option mapping into typed keyword values.

    '--window' and '--revert-radius' are converted with int(),
    '--keep-diff' with bool().  '--sunset' becomes a Timestamp; the
    literal placeholder "<now>" is replaced by the current time.
    """
    window_size = int(args['--window'])
    revert_radius = int(args['--revert-radius'])

    if args['--sunset'] == "<now>":
        sunset = Timestamp(time.time())
    else:
        sunset = Timestamp(args['--sunset'])

    keep_diff = bool(args['--keep-diff'])

    return {'window_size': window_size,
            'revert_radius': revert_radius,
            'sunset': sunset,
            'keep_diff': keep_diff}
Ejemplo n.º 2
0
 def __init__(self,
              id=None,
              name=None,
              editcount=None,
              registration=None,
              groups=None,
              implicitgroups=None,
              emailable=None,
              gender=None,
              block_id=None,
              blocked_by=None,
              blocked_by_id=None,
              blocked_timestamp=None,
              block_reason=None,
              block_expiry=None):
     """Store user fields, coercing each provided value to its type.

     Arguments left as None stay None, except `groups` and
     `implicitgroups` (any falsy value becomes a new empty list) and
     `emailable` (always passed through bool()).
     """
     def optional(convert, value):
         # Apply `convert` only when a value was actually supplied.
         return None if value is None else convert(value)

     self.id = optional(int, id)
     self.name = optional(str, name)
     self.editcount = optional(int, editcount)
     self.registration = optional(Timestamp, registration)
     self.groups = groups if groups else []
     self.implicitgroups = implicitgroups if implicitgroups else []
     self.emailable = bool(emailable)
     self.gender = optional(str, gender)
     self.block_id = optional(int, block_id)
     self.blocked_by = optional(str, blocked_by)
     self.blocked_by_id = optional(int, blocked_by_id)
     self.blocked_timestamp = optional(Timestamp, blocked_timestamp)
     self.block_reason = optional(str, block_reason)
     self.block_expiry = optional(str, block_expiry)
Ejemplo n.º 3
0
def test_age():
    """age solves to 10 for revision time 10 and page-creation time 0."""
    FakeRevisionMetadata = namedtuple("FakeRevisionMetadata",
                                      ['timestamp'])

    revision_meta = FakeRevisionMetadata(Timestamp(10))
    creation_meta = FakeRevisionMetadata(Timestamp(0))
    cache = {revision.metadata: revision_meta,
             page_creation.metadata: creation_meta}

    eq_(solve(age, cache=cache), 10)
Ejemplo n.º 4
0
def test_revision():
    """Check day_of_week / hour_of_day at Timestamp(0) and pickling."""

    def epoch_cache():
        # A new cache per solve() call, as each check starts from scratch.
        return {revision_oriented.revision.timestamp: Timestamp(0)}

    # Thursday, Jan 1 1970
    eq_(solve(revision.day_of_week, cache=epoch_cache()), 3)
    # Midnight
    eq_(solve(revision.hour_of_day, cache=epoch_cache()), 0)

    # Both features must compare equal after a pickle round trip.
    for feature in (revision.day_of_week, revision.hour_of_day):
        eq_(pickle.loads(pickle.dumps(feature)), feature)
def test_page_creation():
    """seconds_since page creation is 10 for times 10 and 0; it pickles."""
    feature = revision.page.creation.seconds_since

    rev_timestamp = revision_oriented.revision.timestamp
    creation_timestamp = revision_oriented.revision.page.creation.timestamp
    cache = {rev_timestamp: Timestamp(10),
             creation_timestamp: Timestamp(0)}
    assert solve(feature, cache=cache) == 10

    # The feature must compare equal after a pickle round trip.
    restored = pickle.loads(pickle.dumps(feature))
    assert restored == feature
Ejemplo n.º 6
0
def test_parent_revision():
    """seconds_since the parent is 10 for times 10 and 0; it pickles."""
    feature = revision.parent.seconds_since

    cache = {}
    cache[revision_oriented.revision.timestamp] = Timestamp(10)
    cache[revision_oriented.revision.parent.timestamp] = Timestamp(0)
    eq_(solve(feature, cache=cache), 10)

    # Pickle round trip must yield an equal feature.
    round_tripped = pickle.loads(pickle.dumps(feature))
    eq_(round_tripped, feature)
Ejemplo n.º 7
0
def test_last_user_revision():
    """seconds_since the user's last revision: 10 with one, 0 without."""
    scenarios = (
        (Timestamp(0), 10),
        # No last-revision timestamp available.
        (None, 0),
    )

    for last_timestamp, expected in scenarios:
        cache = {
            revision_oriented.revision.timestamp: Timestamp(10),
            revision_oriented.revision.user.last_revision.timestamp:
                last_timestamp
        }
        eq_(solve(revision.user.last_revision.seconds_since, cache=cache),
            expected)
def test_last_user_revision():
    """Solve seconds_since with and without a last-revision timestamp."""
    feature = revision.user.last_revision.seconds_since

    def seconds_since(last_timestamp):
        # Revision time is fixed at 10; only the last revision varies.
        cache = {
            revision_oriented.revision.timestamp: Timestamp(10),
            revision_oriented.revision.user.last_revision.timestamp:
                last_timestamp
        }
        return solve(feature, cache=cache)

    assert seconds_since(Timestamp(0)) == 10
    assert seconds_since(None) == 0
def test_revision():
    """Verify both revision time features at Timestamp(0), then pickling."""
    expected_at_epoch = [
        (revision.day_of_week, 3),  # Thursday, Jan 1 1970
        (revision.hour_of_day, 0),  # Midnight
    ]

    for feature, expected in expected_at_epoch:
        cache = {revision_oriented.revision.timestamp: Timestamp(0)}
        assert solve(feature, cache=cache) == expected

    for feature, _ in expected_at_epoch:
        restored = pickle.loads(pickle.dumps(feature))
        assert restored == feature
Ejemplo n.º 10
0
def token_persistence(rev_doc, tokens_added, window, sunset):
    """Build the persistence summary document for one revision.

    :param rev_doc: revision document; its 'timestamp' and 'user' are read
    :param tokens_added: forwarded to generate_token_docs()
    :param window: sequence of (revision document, other) pairs
    :param sunset: end of the observation period; when None, the timestamp
                   of the last revision in `window` is used instead
    :returns: dict with 'revisions_processed', 'non_self_processed',
              'seconds_possible' and 'tokens'
    """
    if sunset is None:
        # Fall back to the last revision in the window.
        last_rev_doc = window[-1][0]
        sunset = Timestamp(last_rev_doc['timestamp'])

    # Never report a negative amount of observable time.
    elapsed = sunset - Timestamp(rev_doc['timestamp'])
    seconds_possible = max(elapsed, 0)

    revisions_processed = len(window)
    # Count window revisions whose user differs from this revision's user.
    non_self_processed = sum(1 for rd, _ in window
                             if rd['user'] != rev_doc['user'])
    tokens = list(generate_token_docs(rev_doc, tokens_added))

    return {
        'revisions_processed': revisions_processed,
        'non_self_processed': non_self_processed,
        'seconds_possible': seconds_possible,
        'tokens': tokens
    }
Ejemplo n.º 11
0
 def __init__(self,
              rev_id=None,
              parent_id=None,
              user_text=None,
              user_id=None,
              timestamp=None,
              comment=None,
              page_id=None,
              page_namespace=None,
              page_title=None,
              bytes=None,
              minor=None):
     """Store revision fields, coercing each provided value to its type.

     Arguments left as None stay None; `minor` is always passed through
     bool(), so None becomes False.
     """
     def optional(convert, value):
         # Apply `convert` only when a value was actually supplied.
         return None if value is None else convert(value)

     self.rev_id = optional(int, rev_id)
     self.parent_id = optional(int, parent_id)
     self.user_text = optional(str, user_text)
     self.user_id = optional(int, user_id)
     self.timestamp = optional(Timestamp, timestamp)
     self.comment = optional(str, comment)
     self.page_id = optional(int, page_id)
     self.page_namespace = optional(int, page_namespace)
     self.page_title = optional(str, page_title)
     self.bytes = optional(int, bytes)
     self.minor = bool(minor)
Ejemplo n.º 12
0
def test_seconds_since():
    """seconds_since is 9 for times 10 and 1, and 0 with no previous."""
    FakeRevisionMetadata = namedtuple("FakeRevisionMetadata",
                                      ['timestamp'])

    scenarios = [
        (FakeRevisionMetadata(Timestamp(1)), 9),
        # Makes sure we don't crash when there was no previous user revision
        (None, 0),
    ]

    for previous_metadata, expected in scenarios:
        cache = {
            revision.metadata: FakeRevisionMetadata(Timestamp(10)),
            previous_user_revision.metadata: previous_metadata
        }
        eq_(solve(seconds_since, cache=cache), expected)
Ejemplo n.º 13
0
def test_revision_metadata_from_doc():
    """Check that every field of an API revision doc reaches the metadata."""
    doc = {
        "revid": 3456789,
        "parentid": 54678,
        "comment": "Wat?",
        # Must match the user_text asserted below: the fixture previously
        # held a masked placeholder ("******"), which made the assertion on
        # user_text impossible to satisfy.
        "user": "EpochFail",
        "userid": 34567890,
        "timestamp": "2015-01-07T12:23:57Z",
        "page": {
            "pageid": 347,
            "title": "Hats",
            "ns": 0
        }
    }

    metadata = api.APIExtractor.revision_metadata_from_doc(doc)

    eq_(metadata.rev_id, 3456789)
    eq_(metadata.parent_id, 54678)
    eq_(metadata.user_id, 34567890)
    eq_(metadata.user_text, "EpochFail")
    eq_(metadata.timestamp, Timestamp("2015-01-07T12:23:57Z"))
    eq_(metadata.comment, "Wat?")
    eq_(metadata.page_id, 347)
    eq_(metadata.page_namespace, 0)
    eq_(metadata.page_title, "Hats")
Ejemplo n.º 14
0
def test_seconds_since():
    """Solve seconds_since with a parent revision and without one."""
    FakeRevisionMetadata = namedtuple("FakeRevisionMetadata",
                                      ['timestamp'])

    def solve_with_parent(parent_metadata):
        # The revision itself is always stamped at 10.
        cache = {
            revision.metadata: FakeRevisionMetadata(Timestamp(10)),
            parent_revision.metadata: parent_metadata
        }
        return solve(seconds_since, cache=cache)

    eq_(solve_with_parent(FakeRevisionMetadata(Timestamp(1))), 9)

    # Make sure we don't error when there is no parent revision
    eq_(solve_with_parent(None), 0)
def main(argv=None):
    """Read the command-line options, coerce them and delegate to run().

    :param argv: argument list forwarded to docopt.docopt()
    """
    args = docopt.docopt(__doc__, argv=argv)
    logging.basicConfig(
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s',
        level=logging.INFO)

    def converted(option, convert):
        # Falsy option values are handed on untouched.
        raw = args[option]
        return convert(raw) if raw else raw

    revert_radius = int(args['--revert-radius'])
    # The option value is scaled by 3600 (presumably hours --> seconds).
    revert_window = int(args['--revert-window']) * (60 * 60)

    # An API session is only created when a host was given.
    session = None
    if args['--host']:
        session = mwapi.Session(args['--host'],
                                user_agent="ORES revert labeling utility")

    start = converted('--start', Timestamp)
    end = converted('--end', Timestamp)

    trusted_users = None
    if args['--trusted-groups']:
        group_names = args['--trusted-groups'].split(',')
        trusted_users = load_user_group_members(group_names, session)

    trusted_edits = converted('--trusted-edits', int)

    run(args['--dump-file'],
        session,
        start,
        end,
        revert_radius,
        revert_window,
        args['--reverted-only'],
        trusted_users,
        trusted_edits,
        args['--check-blocked'],
        verbose=args['--verbose'])
Ejemplo n.º 16
0
def test_log_item():
    """Parse a <logitem> element with and without a log title."""
    XML = u"""
    <logitem>
        <id>6</id>
        <timestamp>2004-12-23T03:34:26Z</timestamp>
        <contributor>
            <username>Brockert</username>
            <id>50095</id>
        </contributor>
        <comment>content was: '#redirect [[Template:UserBrockert]]', an old experiment of mine, now being moved around by bots</comment>
        <type>delete</type>
        <action>delete</action>
        <logtitle>Template:UserBrockert</logtitle>
        <params xml:space="preserve" />
    </logitem>
    """  # noqa
    expected_comment = (
        u"content was: '#redirect [[Template:UserBrockert]]', an old "
        u"experiment of mine, now being moved around by bots")

    namespaces = {u"Template": Namespace(10, u"Template")}
    element = ElementIterator.from_string(XML)
    log_item = LogItem.from_element(element, namespaces)

    eq_(log_item.id, 6)
    eq_(log_item.timestamp, Timestamp(u"2004-12-23T03:34:26Z"))
    eq_(log_item.comment, expected_comment)
    eq_(log_item.user.id, 50095)
    eq_(log_item.user.text, u"Brockert")
    eq_(log_item.page.namespace, 10)
    eq_(log_item.page.title, u"UserBrockert")
    eq_(log_item.type, u"delete")
    eq_(log_item.action, u"delete")
    eq_(log_item.params, None)
    eq_(log_item.deleted.action, None)
    eq_(log_item.deleted.user, False)
    eq_(log_item.deleted.comment, False)
    eq_(log_item.deleted.restricted, None)

    # Same document, except that the log title is an empty element.
    NULL_TITLE_XML = XML.replace(
        u"<logtitle>Template:UserBrockert</logtitle>", u"<logtitle />")
    untitled_item = LogItem.from_element(
        ElementIterator.from_string(NULL_TITLE_XML))
    eq_(untitled_item.page.namespace, None)
    eq_(untitled_item.page.title, None)
Ejemplo n.º 17
0
def process_seconds_since(parent_revision_metadata, revision_metadata):
    """Return the revision timestamp minus the parent revision timestamp.

    A missing `revision_metadata` counts as Timestamp(0).  When the parent
    metadata or its timestamp is None, the revision's own timestamp is
    used for both operands of the subtraction.
    """
    if revision_metadata is None:
        revision_timestamp = Timestamp(0)
    else:
        revision_timestamp = revision_metadata.timestamp

    # Start from the "no usable parent" default and override it below.
    previous_timestamp = revision_timestamp
    if parent_revision_metadata is not None:
        parent_timestamp = parent_revision_metadata.timestamp
        if parent_timestamp is not None:
            previous_timestamp = parent_timestamp

    return revision_timestamp - previous_timestamp
Ejemplo n.º 18
0
    def get_assessment(self, wikitext):
        '''
        Extract the assessment rating found in a talk page's wikitext.

        When several ratings are present the highest one wins.  The
        research paper below takes the same approach and reports little
        disagreement between a majority vote and the highest rating.

        Warncke-Wang, M., Ayukaev, V. R., Hecht, B., and Terveen, L.
        "The Success and Failure of Quality Improvement Projects in
        Peer Production Communities", in CSCW 2015.

        :param wikitext: wikitext of a talk page
        :returns: assessment rating, 'na' when no valid rating is found
        '''

        max_chars = 8 * 1024

        # Stand-ins for what the wikiclass extractor wants (`mwxml.Page')
        Revision = namedtuple("Revisions", ['id', 'timestamp', 'sha1', 'text'])

        class MWXMLPage:
            def __init__(self, title, namespace, revisions):
                self.title, self.namespace = title, namespace
                self.revisions = revisions

            def __iter__(self):
                return iter(self.revisions)

        # NOTE: assessments sit at the top of the page in rather small
        # templates, so anything beyond 8k is cut off.
        if len(wikitext) > max_chars:
            wikitext = wikitext[:max_chars]

        # Dummy page whose single revision holds our wikitext
        page_title = self.title()
        only_revision = Revision(1, Timestamp(1), "aaa", wikitext)
        dummy_page = MWXMLPage(page_title, 1, [only_revision])

        numeric_ratings = []
        for observation in enwiki.extract(dummy_page):
            try:
                score = self._wp10_scale[observation['wp10']]
            except KeyError:
                continue  # invalid rating
            numeric_ratings.append(score)

        if not numeric_ratings:
            return 'na'

        # Translate the highest numeric rating back to its string label
        label_by_score = {score: label
                          for label, score in self._wp10_scale.items()}
        return label_by_score[max(numeric_ratings)]
def test_user_registration():
    """Solve seconds_since_registration for several user situations."""
    feature = revision.user.seconds_since_registration

    def seconds_since(rev_timestamp, user_id, registration):
        cache = {
            revision_oriented.revision.timestamp: rev_timestamp,
            revision_oriented.revision.user.id: user_id,
            revision_oriented.revision.user.info.registration: registration
        }
        return solve(feature, cache=cache)

    assert seconds_since(Timestamp(10), 10, Timestamp(0)) == 10

    # Anon (no registration)
    assert seconds_since(Timestamp(10), 0, None) == 0

    # Old user (no registration)
    assert seconds_since(MW_REGISTRATION_EPOCH + 10, 10, None) == 10

    # Old user (broken registration date)
    one_year = 60 * 60 * 24 * 365
    assert seconds_since(Timestamp(0), 10, Timestamp(10)) == one_year

    # The feature must compare equal after a pickle round trip.
    restored = pickle.loads(pickle.dumps(feature))
    assert restored == feature
Ejemplo n.º 20
0
    def revision_metadata_from_doc(cls, rev_doc):
        """Build a RevisionMetadata from a revision document.

        :param rev_doc: mapping describing a revision, or None
        :returns: None when `rev_doc` is None, otherwise a RevisionMetadata.
                  The timestamp is None when Timestamp() raises ValueError,
                  and `minor` reflects whether a 'minor' key is present.
        """
        if rev_doc is None:
            return None

        try:
            timestamp = Timestamp(rev_doc.get('timestamp'))
        except ValueError:
            timestamp = None

        # Page fields live in a nested document that may be absent.
        page_doc = rev_doc.get('page', {})

        return RevisionMetadata(
            rev_doc.get('revid'),
            rev_doc.get('parentid'),
            rev_doc.get('user'),
            rev_doc.get('userid'),
            timestamp,
            rev_doc.get('comment'),
            page_doc.get('pageid'),
            page_doc.get('ns'),
            page_doc.get('title'),
            rev_doc.get('size'),
            'minor' in rev_doc)
Ejemplo n.º 21
0
def main():
    """Aggregate revert documents from stdin into monthly counts.

    Reads JSON lines from stdin, skips reverts where the reverting and the
    last reverted user have the same name, and counts the rest per
    (month, page namespace).  One row per pair is written to stdout with
    the columns listed in HEADINGS.

    When '--bots' is not given, no user is treated as a bot and the
    bot-related columns are all 0.
    """
    args = docopt.docopt(__doc__)

    HEADINGS = [
        "month", "page_namespace", "reverts", "bot_reverts", "bot_reverteds",
        "bot2bot_reverts"
    ]

    if args['--bots']:
        # Close the file as soon as the names are loaded.
        with open(args['--bots']) as bots_file:
            bots = {u.strip() for u in bots_file}
    else:
        # An empty set keeps the membership tests below valid; with None
        # they would raise TypeError on the first document.
        bots = set()

    logging.basicConfig(
        level=logging.WARNING,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s')

    writer = mysqltsv.Writer(sys.stdout, headers=HEADINGS)

    # month -> namespace -> counter name -> count
    nmc = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))

    for doc in read_json_lines(sys.stdin):
        reverted_username = doc['reverteds'][-1].get('user', {}).get('text')
        reverting_username = doc['reverting'].get('user', {}).get('text')
        if reverted_username == reverting_username:
            continue
        dbts = Timestamp(doc['reverting']['timestamp']).short_format()
        # First six characters of the short format plus "01" as the day.
        month = dbts[:6] + "01"
        namespace = doc['reverting']['page']['namespace']

        reverting_is_bot = reverting_username in bots
        reverted_is_bot = reverted_username in bots

        counts = nmc[month][namespace]
        counts['reverts'] += 1
        counts['bot_reverts'] += reverting_is_bot
        counts['bot_reverteds'] += reverted_is_bot
        counts['bot2bot_reverts'] += reverting_is_bot and reverted_is_bot

    for month in sorted(nmc):
        for page_namespace in sorted(nmc[month]):
            counts = nmc[month][page_namespace]
            writer.write([
                month, page_namespace, counts['reverts'],
                counts['bot_reverts'], counts['bot_reverteds'],
                counts['bot2bot_reverts']
            ])
Ejemplo n.º 22
0
    def user_info_from_doc(cls, user_doc):
        """Build a UserInfo from a user document.

        :param user_doc: mapping describing a user, or None
        :returns: None when `user_doc` is None, otherwise a UserInfo.  The
                  registration is None when Timestamp() raises ValueError,
                  and the emailable flag reflects whether an 'emailable'
                  key is present.
        """
        if user_doc is None:
            return None

        try:
            registration = Timestamp(user_doc.get('registration'))
        except ValueError:
            registration = None

        field = user_doc.get
        return UserInfo(
            field('userid'),
            field('name'),
            field('editcount'),
            registration,
            field('groups', []),
            field('implicitgroups', []),
            "emailable" in user_doc,
            field('gender'),
            field('blockid'),
            field('blockedby'),
            field('blockedbyid'),
            field('blockedtimestamp'),
            field('blockreason'),
            field('blockexpiry'))
Ejemplo n.º 23
0
def test_age():
    """Solve age for several user_id / registration combinations."""
    FakeRevisionMetadata = namedtuple("FakeRevisionMetadata",
                                      ['user_id', 'timestamp'])
    FakeUserInfo = namedtuple("FakeUserInfo", ['registration'])

    def age_for(user_id, rev_timestamp, user_info):
        cache = {
            revision.metadata: FakeRevisionMetadata(user_id, rev_timestamp),
            user.info: user_info
        }
        return solve(age, cache=cache)

    eq_(age_for(10, Timestamp(10), FakeUserInfo(Timestamp(0))), 10)

    # Revision metadata without a user_id.
    eq_(age_for(None, Timestamp(10), FakeUserInfo(Timestamp(0))), 0)

    # Makes sure that old users with no registration are counted appropriately.
    assert age_for(10, Timestamp("20140101010101"), FakeUserInfo(None)) > 0

    # Makes sure that imports (revisions made before registration) don't return
    # negative values.
    eq_(age_for(10, Timestamp(0), FakeUserInfo(Timestamp(1))), 0)

    # No user info at all.
    eq_(age_for(10, Timestamp(0), None), 0)
Ejemplo n.º 24
0
def test_user_info_from_doc():
    """Check user_info_from_doc with a blocked, a plain and a None doc."""
    block_reason = \
        "{{uw-softerblock}} <!-- Promotional username, soft block -->"

    def user_doc(**extra):
        # A new document per call: the shared fields plus any extras.
        doc = {
            "userid": 24278012,
            "name": "Hoardablehotsauce",
            "editcount": 5,
            "groups": ["*", "user"],
            "implicitgroups": ["*", "user"],
            "gender": "unknown"
        }
        doc.update(extra)
        return doc

    # Registered and blocked user
    info = api.APIExtractor.user_info_from_doc(user_doc(
        registration="2015-02-28T22:25:37Z",
        blockid="5752570",
        blockedby="Cryptic",
        blockedbyid="295294",
        blockedtimestamp="2015-02-28T22:43:23Z",
        blockreason=block_reason,
        blockexpiry="infinity"))

    eq_(info.name, "Hoardablehotsauce")
    eq_(info.groups, ['*', "user"])
    eq_(info.implicitgroups, ['*', "user"])
    eq_(info.registration, Timestamp("2015-02-28T22:25:37Z"))
    eq_(info.block_id, 5752570)
    eq_(info.blocked_by, "Cryptic")
    eq_(info.blocked_by_id, 295294)
    eq_(info.blocked_timestamp, Timestamp("2015-02-28T22:43:23Z"))
    eq_(info.block_reason, block_reason)
    eq_(info.block_expiry, "infinity")
    eq_(info.gender, "unknown")

    # Neither registration nor block fields present
    info = api.APIExtractor.user_info_from_doc(user_doc())
    eq_(info.registration, None)
    eq_(info.blocked_timestamp, None)

    # No document at all
    eq_(api.APIExtractor.user_info_from_doc(None), None)
Ejemplo n.º 25
0
def test_extractor():
    """Check the timestamp frwiki.extract reports for each expected label."""

    Revision = namedtuple("Revisions", ['id', 'timestamp', 'sha1', 'text'])

    class Page:
        """Minimal page stand-in: title, namespace, iterable revisions."""

        def __init__(self, title, namespace, revisions):
            self.title, self.namespace = title, namespace
            self.revisions = revisions

        def __iter__(self):
            return iter(self.revisions)

    def wikitext(avancement, talk_page=True, selection=True):
        # Assemble the {{Wikiprojet}} banner, optionally preceded by
        # {{talk page}} and optionally carrying the "Sélection" row.
        rows = ["{{Wikiprojet",
                "|Seconde Guerre mondiale|maximum",
                "|Japon|maximum",
                "|Forces armées des États-Unis|maximum",
                "|Nucléaire|maximum"]
        if selection:
            rows.append("|Sélection transversale|faible")
        rows.append("|avancement=" + avancement + "}}")
        prefix = "{{talk page}}" if talk_page else ""
        return prefix + "\n".join(rows)

    # (timestamp, sha1, text); revision ids are assigned 1..12 in order.
    specs = [
        (0, "aaa", wikitext("e", talk_page=False, selection=False)),
        (1, "bbb", wikitext("AdQ", selection=False)),
        (2, "aaa", wikitext("e", selection=False)),
        (3, "ccc", wikitext("Ébauche")),
        (4, "aaa", wikitext("e", selection=False)),
        (4, "ccc", wikitext("Ébauche")),
        (5, "ddd", wikitext(" bd ")),
        (6, "eee", wikitext(" Bon début ")),
        (6, "eee", wikitext(" b ")),
        (7, "fff", wikitext(" a ")),
        (8, "fff", wikitext(" ba ")),
        (9, "fff", wikitext(" AdQ ")),
    ]
    revisions = [Revision(rev_id, Timestamp(seconds), sha1, text)
                 for rev_id, (seconds, sha1, text) in enumerate(specs, 1)]
    page = Page("Foobar", 1, revisions)

    # Index the observations by (project, label).
    project_labels = {}
    for ob in frwiki.extract(page):
        project_labels[(ob['project'], ob['wp10'])] = ob

    expected = [("wikiprojet", "e", Timestamp(0)),
                ("wikiprojet", "bd", Timestamp(5)),
                ("wikiprojet", "b", Timestamp(6)),
                ("wikiprojet", "a", Timestamp(7)),
                ("wikiprojet", "ba", Timestamp(8)),
                ("wikiprojet", "adq", Timestamp(9))]

    print(project_labels)
    for proj, lab, timestamp in expected:
        observed = project_labels[(proj, lab)]
        assert observed['timestamp'] == timestamp
Ejemplo n.º 26
0
def test_extractor():
    """Compare ptwiki.extract output with the expected labels per page."""

    Revision = namedtuple("Revision", ['id', 'timestamp', 'sha1', 'text'])

    class Page:
        """Bare-bones page: a title, a namespace and iterable revisions."""

        def __init__(self, title, namespace, revisions):
            self.title, self.namespace = title, namespace
            self.revisions = revisions

        def __iter__(self):
            return iter(self.revisions)

    def rev(number, sha1, text):
        # Every fixture revision uses its id as its timestamp as well.
        return Revision(number, Timestamp(number), sha1, text)

    def marca(arguments):
        return "{{Marca de projeto|" + arguments + "}}"

    def labels(*pairs):
        # (wp10 label, timestamp) pairs -> expected observation tuples.
        return [("marca de projeto", wp10, Timestamp(seconds))
                for wp10, seconds in pairs]

    no_reverts = Page("Page without reverts", 1, [
        rev(1, "aaa",
            "{{Brasil/Marca|qualidade=1|importância=3}}\n"
            "{{Geografia/Marca|qualidade=1|importância=?|rev=20110614}}"),
        rev(2, "bbb", "{{marca de projeto|rev=20120715|1|Brasil|3}}"),
        rev(3, "ccc", "{{Classificação/Anfíbios|qualidade=2|importância=1}}"),
        rev(4, "ddd",
            "{{Marca de projeto|qualidade=3|Biografias|4|rev=20140917}}"),
        rev(5, "eee",
            "{{Marca de projeto|qualidade=3||Biografias|2|rev=20151018}}"),
        rev(6, "fff",
            "{{Wikipedia:Projetos/Subdivisões do Brasil/Artigo membro"
            "|qualidade=5|importância=2}}"),
        rev(7, "ggg", marca("AB")),
        rev(8, "hhh", marca("AD|Biografias|4")),
    ])

    single_revert = Page("Page with single revert", 1, [
        rev(1, "aaa", marca("2")),
        rev(2, "bbb", marca("3")),
        rev(3, "ccc", marca("4")),
        rev(4, "aaa", marca("2")),  # Vandal messing up the template
        rev(5, "ccc", marca("4")),  # Patroller reverting vandal
    ])

    overlapping_reverts = Page("Page with overlaping reverts", 1, [
        rev(1, "aaa", marca("1")),
        rev(2, "bbb", marca("2")),
        rev(3, "ccc", marca("3")),
        rev(4, "aaa", marca("1")),  # Vandal messing up the template
        rev(5, "ccc", marca("3")),  # Rollback
        rev(6, "bbb", marca("2")),  # Active editor reevaluates the page
        rev(7, "ddd", marca("4")),  # Later on, the page is improved
    ])

    concentric_reverts = Page("Page with concentric reverts", 1, [
        rev(1, "aaa", marca("1")),
        rev(2, "bbb", marca("2")),
        rev(3, "ccc", marca("3")),
        rev(4, "aaa", marca("1")),  # Vandal messing up the template
        rev(5, "ccc", marca("3")),  # Rollback
        rev(6, "ddd", marca("2") + "<!-- re-evaluation -->"),
    ])

    pages = [no_reverts, single_revert, overlapping_reverts,
             concentric_reverts]
    expectations = [
        labels(("1", 1), ("2", 3), ("3", 5), ("5", 6), ("6", 8)),
        labels(("2", 1), ("3", 2), ("4", 3)),
        labels(("1", 1), ("2", 2), ("4", 7)),
        labels(("1", 1), ("2", 2), ("3", 3), ("2", 6)),
    ]

    for page, expected in zip(pages, expectations):
        extracted = [(ob['project'], ob['wp10'], ob['timestamp'])
                     for ob in ptwiki.extract(page)]
        assert extracted == expected
Ejemplo n.º 27
0
def test_extractor():
    """
    Runs ``ruwiki.extract`` over a synthetic page history and checks the
    timestamp reported for each expected (project, label) pair.

    The history contains repeated sha1 values (reverts) and HTML comments
    inside the template; the expected timestamps are those of the revision
    where each label first shows up in this history.
    """
    Revision = namedtuple("Revisions", ['id', 'timestamp', 'sha1', 'text'])

    class Page:
        """Minimal page stand-in that iterates over its revisions."""

        def __init__(self, title, namespace, revisions):
            self.revisions = revisions
            self.namespace = namespace
            self.title = title

        def __iter__(self):
            return iter(self.revisions)

    # Every revision carries the same template; only the level value differs.
    template_head = ("{{Статья проекта WikiProject\n"
                     "|важность=высшая\n"
                     "|уровень=")
    template_tail = "}}"

    # (sha1, level markup) in chronological order.  Revision ids count from 1
    # and timestamps from 0.
    history = [
        ("aaa", "IV\n<!-- HTML test comment -->"),
        ("bbb", "III"),
        ("aaa", "IV<!-- HTML test comment -->"),
        ("bbb", "III"),
        ("ccc", "II"),
        ("bbb", "III"),
        ("ccc", "II"),
        ("ddd", "I"),
        ("eee", "ХС"),
        ("fff", "дс"),
        ("eee", "ХС"),
        ("fff", "дс"),
        ("ggg", "ИС<!-- HTML test comment -->"),
    ]
    revisions = [
        Revision(position + 1, Timestamp(position), checksum,
                 template_head + level + template_tail)
        for position, (checksum, level) in enumerate(history)
    ]
    page = Page("Foobar", 1, revisions)

    # Index the observations by (project, label); a later observation with
    # the same key replaces an earlier one.
    project_labels = {}
    for ob in ruwiki.extract(page):
        project_labels[(ob['project'], ob['wp10'])] = ob

    expected = [
        ("wikiproject", "IV", Timestamp(0)),
        ("wikiproject", "III", Timestamp(1)),
        ("wikiproject", "II", Timestamp(4)),
        ("wikiproject", "I", Timestamp(7)),
        ("wikiproject", "ХС", Timestamp(8)),
        ("wikiproject", "ДС", Timestamp(9)),
        ("wikiproject", "ИС", Timestamp(12)),
    ]

    print(project_labels)
    for project, label, first_seen in expected:
        assert project_labels[(project, label)]['timestamp'] == first_seen
def diffs2persistence(rev_docs,
                      window_size=50,
                      revert_radius=15,
                      sunset=None,
                      verbose=False):
    """
    Processes a sorted and page-partitioned sequence of revision documents
    and adds a 'persistence' field to them containing statistics about how
    each token "added" in the revision persisted through future revisions.

    :Parameters:
        rev_docs : `iterable` ( `dict` )
            JSON documents of revision data containing a 'diff' field as
            generated by ``dump2diffs``.  It's assumed that rev_docs are
            partitioned by page and otherwise in chronological order.
        window_size : `int`
            The size of the window of revisions from which persistence data
            will be generated.
        revert_radius : `int`
            The number of revisions back that a revert can reference.
        sunset : :class:`mwtypes.Timestamp`
            The date of the database dump we are generating from.  This is
            used to apply a 'time visible' statistic.  If not set, now() will
            be assumed.
        verbose : `bool`
            Prints out dots and stuff to stderr

    :Returns:
        A generator of rev_docs with a 'persistence' field containing
        statistics about individual tokens.
    """
    rev_docs = mwxml.utilities.normalize(rev_docs)
    window_size = int(window_size)
    revert_radius = int(revert_radius)
    if sunset is not None:
        sunset = Timestamp(sunset)
    else:
        sunset = Timestamp(time.time())

    # Group the docs by page
    page_docs = groupby(rev_docs, key=lambda d: d['page']['title'])

    for page_title, page_revs in page_docs:

        if verbose:
            sys.stderr.write(page_title + ": ")

        # We need a look-ahead to know how long this revision was visible
        page_revs = peekable(page_revs)

        # The window allows us to manage memory
        window = deque(maxlen=window_size)

        # The state does the actual processing work
        state = DiffState(revert_radius=revert_radius)

        while page_revs:
            rev_doc = next(page_revs)
            next_doc = page_revs.peek(None)

            # Safest to recalculate sha1 from the revision text.
            # NOTE(review): this previously referenced an undefined name
            # `text`; reading it from the document's 'text' field is assumed
            # to be the intent -- confirm.  When no text is present the sha1
            # already on the document is left untouched.
            text = rev_doc.get('text')
            if text is not None:
                rev_doc["sha1"] = sha1(bytes(text, 'utf8',
                                             errors="replace")).hexdigest()

            # A revision is visible until the next one is saved; the last
            # revision of a page is visible until `sunset`.
            if next_doc is not None:
                seconds_visible = Timestamp(next_doc['timestamp']) - \
                                  Timestamp(rev_doc['timestamp'])
            else:
                seconds_visible = sunset - Timestamp(rev_doc['timestamp'])

            if seconds_visible < 0:
                logger.warning(
                    "Seconds visible {0} is less than zero.".format(
                        seconds_visible))
                seconds_visible = 0

            _, tokens_added, _ = \
                state.update_opdocs(rev_doc['sha1'], rev_doc['diff']['ops'],
                                    (rev_doc['user'], seconds_visible))

            if len(window) == window_size:
                # Time to start writing some stats.  Appending to the full
                # deque pushes the oldest entry (grabbed first) out of the
                # window.
                old_doc, old_added = window[0]
                window.append((rev_doc, tokens_added))
                persistence = token_persistence(old_doc, old_added, window,
                                                None)
                old_doc['persistence'] = persistence
                yield old_doc
                if verbose:
                    sys.stderr.write(".")
                    sys.stderr.flush()
            else:
                window.append((rev_doc, tokens_added))

        # Flush whatever is still in the window once the page is exhausted
        while len(window) > 0:
            old_doc, old_added = window.popleft()
            persistence = token_persistence(old_doc, old_added, window, sunset)
            old_doc['persistence'] = persistence
            yield old_doc
            if verbose:
                sys.stderr.write("_")
                sys.stderr.flush()

        if verbose:
            sys.stderr.write("\n")
Ejemplo n.º 29
0
from mwtypes import Timestamp

from ..datasources import revision, user
from .feature import Feature

# Date that registrations started being recorded in MediaWiki.  process_age()
# falls back to this value when a user's registration is empty.
USER_REGISTRATION_EPOCH = Timestamp("20050101000000")


def process_age(user_info, revision_metadata):
    """
    Returns the difference between the revision's timestamp and the user's
    registration, floored at zero.

    Zero is returned when `user_info` is None or when `process_is_anon`
    reports the revision as anonymous.  An empty registration is replaced by
    USER_REGISTRATION_EPOCH.
    """
    # No user record, or an anonymous editor: age == zero
    if user_info is None or process_is_anon(revision_metadata):
        return 0

    edit_time = revision_metadata.timestamp
    registered = user_info.registration or USER_REGISTRATION_EPOCH
    return max(edit_time - registered, 0)


# Feature named "user.age": wired to process_age, fed by the user.info and
# revision.metadata datasources, and declared to return an int.
age = Feature("user.age",
              process_age,
              returns=int,
              depends_on=[user.info, revision.metadata])
"""
Represents age of user when the edit was made in seconds.

:Returns:
    int

:Example:
Ejemplo n.º 30
0
def main():
    """
    Reads revert documents (one JSON object per line) from stdin and writes
    one TSV row per revert to stdout.

    Reverts whose reverted and reverting user names are equal (including both
    missing) are skipped.  When --users is given, a revert is written only if
    both user names appear in that file (one name per line).  A "." is
    written to stderr for every row as a progress marker.
    """
    args = docopt.docopt(__doc__)

    HEADINGS = [
        "rev_id", "rev_timestamp", "rev_user", "rev_user_text", "rev_page",
        "rev_sha1", "rev_minor_edit", "rev_deleted", "rev_parent_id",
        "archived", "reverting_id", "reverting_timestamp", "reverting_user",
        "reverting_user_text", "reverting_page", "reverting_sha1",
        "reverting_minor_edit", "reverting_deleted", "reverting_parent_id",
        "reverting_archived", "reverting_comment", "rev_revert_offset",
        "revisions_reverted", "reverted_to_rev_id", "page_namespace"
    ]

    if args['--users']:
        # Use a context manager so the file handle is closed once read
        with open(args['--users']) as users_file:
            users = {name.strip() for name in users_file}
    else:
        users = None

    writer = mysqltsv.Writer(sys.stdout, headers=HEADINGS)

    for line in sys.stdin:
        doc = json.loads(line)

        # The row describes the last entry of 'reverteds' and the revision
        # that reverted it.
        reverteds = doc['reverteds']
        reverted = reverteds[-1]
        reverting = doc['reverting']

        reverted_user = reverted.get('user', {})
        reverting_user = reverting.get('user', {})
        reverted_username = reverted_user.get('text')
        reverting_username = reverting_user.get('text')

        if reverted_username == reverting_username:
            continue
        if users is not None and \
           not (reverted_username in users and reverting_username in users):
            continue

        writer.write([
            reverted['id'],  # rev_id
            Timestamp(reverted['timestamp']).short_format(),  # rev_timestamp
            reverted_user.get('id'),  # rev_user
            reverted_username,  # rev_user_text
            reverted['page']['id'],  # rev_page
            reverted.get('sha1'),  # rev_sha1
            reverted['minor'],  # rev_minor_edit
            reverted['deleted']['text'],  # rev_deleted
            reverted.get('parent_id'),  # rev_parent_id
            False,  # archived
            reverting['id'],  # reverting_id
            Timestamp(
                reverting['timestamp']).short_format(),  # reverting_timestamp
            reverting_user.get('id'),  # reverting_user
            reverting_username,  # reverting_user_text
            reverting['page']['id'],  # reverting_page
            reverting.get('sha1'),  # reverting_sha1
            reverting['minor'],  # reverting_minor_edit
            reverting['deleted']['text'],  # reverting_deleted
            reverting.get('parent_id'),  # reverting_parent_id
            False,  # reverting_archived
            reverting.get('comment', '-'),  # reverting_comment
            len(reverteds),  # rev_revert_offset
            len(reverteds),  # revisions_reverted
            doc['reverted_to']['id'],  # reverted_to_rev_id
            reverting['page']['namespace']  # page_namespace
        ])
        sys.stderr.write(".")
        sys.stderr.flush()

    sys.stderr.write("\n")