def process_args(args):
    """Translate docopt-style CLI arguments into keyword arguments.

    The sentinel string "<now>" for --sunset means "use the current time".
    """
    if args['--sunset'] == "<now>":
        sunset = Timestamp(time.time())
    else:
        sunset = Timestamp(args['--sunset'])
    return {
        'window_size': int(args['--window']),
        'revert_radius': int(args['--revert-radius']),
        'sunset': sunset,
        'keep_diff': bool(args['--keep-diff'])
    }
def __init__(self, id=None, name=None, editcount=None, registration=None,
             groups=None, implicitgroups=None, emailable=None, gender=None,
             block_id=None, blocked_by=None, blocked_by_id=None,
             blocked_timestamp=None, block_reason=None, block_expiry=None):
    """Normalize raw user-info fields.

    Non-None values are coerced to their canonical types; missing values
    stay None.  `groups`/`implicitgroups` default to empty lists.
    """
    def coerce(cast, value):
        # Apply `cast` only when a value was actually provided.
        return cast(value) if value is not None else None

    self.id = coerce(int, id)
    self.name = coerce(str, name)
    self.editcount = coerce(int, editcount)
    self.registration = coerce(Timestamp, registration)
    self.groups = groups or []
    self.implicitgroups = implicitgroups or []
    self.emailable = bool(emailable)
    self.gender = coerce(str, gender)
    self.block_id = coerce(int, block_id)
    self.blocked_by = coerce(str, blocked_by)
    self.blocked_by_id = coerce(int, blocked_by_id)
    self.blocked_timestamp = coerce(Timestamp, blocked_timestamp)
    self.block_reason = coerce(str, block_reason)
    self.block_expiry = coerce(str, block_expiry)
def test_age():
    """Page age is the revision-minus-creation timestamp difference."""
    FakeRevisionMetadata = namedtuple("FakeRevisionMetadata", ['timestamp'])
    revision_meta = FakeRevisionMetadata(Timestamp(10))
    creation_meta = FakeRevisionMetadata(Timestamp(0))
    cache = {
        revision.metadata: revision_meta,
        page_creation.metadata: creation_meta,
    }
    eq_(solve(age, cache=cache), 10)
def test_revision():
    """day_of_week / hour_of_day at the epoch, plus pickle round-trips."""
    # 1970-01-01T00:00:00 was a Thursday (weekday index 3) at midnight.
    epoch_cache = {revision_oriented.revision.timestamp: Timestamp(0)}
    eq_(solve(revision.day_of_week, cache=epoch_cache), 3)
    epoch_cache = {revision_oriented.revision.timestamp: Timestamp(0)}
    eq_(solve(revision.hour_of_day, cache=epoch_cache), 0)
    # Pickle round-tripping must preserve feature identity.
    for feature in (revision.day_of_week, revision.hour_of_day):
        eq_(pickle.loads(pickle.dumps(feature)), feature)
def test_page_creation():
    """seconds_since page creation, and picklability of the feature."""
    cache = {
        revision_oriented.revision.timestamp: Timestamp(10),
        revision_oriented.revision.page.creation.timestamp: Timestamp(0),
    }
    assert solve(revision.page.creation.seconds_since, cache=cache) == 10
    feature = revision.page.creation.seconds_since
    assert pickle.loads(pickle.dumps(feature)) == feature
def test_parent_revision():
    """seconds_since the parent revision, and feature picklability."""
    cache = {
        revision_oriented.revision.timestamp: Timestamp(10),
        revision_oriented.revision.parent.timestamp: Timestamp(0),
    }
    eq_(solve(revision.parent.seconds_since, cache=cache), 10)
    feature = revision.parent.seconds_since
    eq_(pickle.loads(pickle.dumps(feature)), feature)
def test_last_user_revision():
    """seconds_since the user's last revision; None timestamp yields 0."""
    feature = revision.user.last_revision.seconds_since
    cache = {
        revision_oriented.revision.timestamp: Timestamp(10),
        revision_oriented.revision.user.last_revision.timestamp: Timestamp(0),
    }
    eq_(solve(feature, cache=cache), 10)
    # No previous revision by this user: the delta defaults to zero.
    cache = {
        revision_oriented.revision.timestamp: Timestamp(10),
        revision_oriented.revision.user.last_revision.timestamp: None,
    }
    eq_(solve(feature, cache=cache), 0)
def test_last_user_revision():
    """seconds_since the user's last revision; None timestamp yields 0."""
    feature = revision.user.last_revision.seconds_since
    cache = {
        revision_oriented.revision.timestamp: Timestamp(10),
        revision_oriented.revision.user.last_revision.timestamp: Timestamp(0),
    }
    assert solve(feature, cache=cache) == 10
    # Missing last revision must not crash -- it counts as zero seconds.
    cache = {
        revision_oriented.revision.timestamp: Timestamp(10),
        revision_oriented.revision.user.last_revision.timestamp: None,
    }
    assert solve(feature, cache=cache) == 0
def test_revision():
    """day_of_week / hour_of_day at the epoch, plus pickle round-trips."""
    # The Unix epoch fell on a Thursday (weekday index 3) at midnight.
    cache = {revision_oriented.revision.timestamp: Timestamp(0)}
    assert solve(revision.day_of_week, cache=cache) == 3
    cache = {revision_oriented.revision.timestamp: Timestamp(0)}
    assert solve(revision.hour_of_day, cache=cache) == 0
    for feature in (revision.day_of_week, revision.hour_of_day):
        assert pickle.loads(pickle.dumps(feature)) == feature
def token_persistence(rev_doc, tokens_added, window, sunset):
    """Build the 'persistence' statistics document for one revision.

    :Parameters:
        rev_doc : `dict`
            The revision document whose stats are being computed.
        tokens_added : tokens recorded as added by this revision.
        window : sequence of (rev_doc, tokens_added) pairs that follow
            `rev_doc` chronologically.
        sunset : :class:`mwtypes.Timestamp` or `None`
            Horizon for the "seconds possible" statistic.  When None, the
            timestamp of the last revision in the window is used.

    :Returns:
        A `dict` of persistence statistics.
    """
    if sunset is None:
        # No explicit horizon: the last revision in the window bounds how
        # long tokens could possibly have been visible.
        sunset = Timestamp(window[-1][0]['timestamp'])

    # Clamp at zero to guard against out-of-order timestamps.
    seconds_possible = max(sunset - Timestamp(rev_doc['timestamp']), 0)

    return {
        'revisions_processed': len(window),
        'non_self_processed': sum(rd['user'] != rev_doc['user']
                                  for rd, _ in window),
        'seconds_possible': seconds_possible,
        'tokens': list(generate_token_docs(rev_doc, tokens_added))
    }
def __init__(self, rev_id=None, parent_id=None, user_text=None, user_id=None,
             timestamp=None, comment=None, page_id=None, page_namespace=None,
             page_title=None, bytes=None, minor=None):
    """Normalize raw revision-metadata fields.

    Non-None values are coerced to canonical types; missing values remain
    None.  (`bytes` shadows the builtin but is kept for interface
    compatibility.)
    """
    def coerce(cast, value):
        # Apply `cast` only when a value was actually provided.
        return cast(value) if value is not None else None

    self.rev_id = coerce(int, rev_id)
    self.parent_id = coerce(int, parent_id)
    self.user_text = coerce(str, user_text)
    self.user_id = coerce(int, user_id)
    self.timestamp = coerce(Timestamp, timestamp)
    self.comment = coerce(str, comment)
    self.page_id = coerce(int, page_id)
    self.page_namespace = coerce(int, page_namespace)
    self.page_title = coerce(str, page_title)
    self.bytes = coerce(int, bytes)
    self.minor = bool(minor)
def test_seconds_since():
    """Delta to the previous user revision; a missing one yields 0."""
    FakeRevisionMetadata = namedtuple("FakeRevisionMetadata", ['timestamp'])
    cache = {
        revision.metadata: FakeRevisionMetadata(Timestamp(10)),
        previous_user_revision.metadata: FakeRevisionMetadata(Timestamp(1)),
    }
    eq_(solve(seconds_since, cache=cache), 9)
    # No previous user revision at all -- must not crash, returns zero.
    cache = {
        revision.metadata: FakeRevisionMetadata(Timestamp(10)),
        previous_user_revision.metadata: None,
    }
    eq_(solve(seconds_since, cache=cache), 0)
def test_revision_metadata_from_doc():
    """APIExtractor.revision_metadata_from_doc maps API revision fields
    onto RevisionMetadata attributes."""
    doc = {
        "revid": 3456789,
        "parentid": 54678,
        "comment": "Wat?",
        # Must match the user_text assertion below.  (The fixture had been
        # replaced with a redacted placeholder, making the test fail.)
        "user": "EpochFail",
        "userid": 34567890,
        "timestamp": "2015-01-07T12:23:57Z",
        "page": {
            "pageid": 347,
            "title": "Hats",
            "ns": 0
        }
    }
    metadata = api.APIExtractor.revision_metadata_from_doc(doc)
    eq_(metadata.rev_id, 3456789)
    eq_(metadata.parent_id, 54678)
    eq_(metadata.user_id, 34567890)
    eq_(metadata.user_text, "EpochFail")
    eq_(metadata.timestamp, Timestamp("2015-01-07T12:23:57Z"))
    eq_(metadata.comment, "Wat?")
    eq_(metadata.page_id, 347)
    eq_(metadata.page_namespace, 0)
    eq_(metadata.page_title, "Hats")
def test_seconds_since():
    """Delta to the parent revision; a missing parent yields 0."""
    FakeRevisionMetadata = namedtuple("FakeRevisionMetadata", ['timestamp'])
    cache = {
        revision.metadata: FakeRevisionMetadata(Timestamp(10)),
        parent_revision.metadata: FakeRevisionMetadata(Timestamp(1)),
    }
    eq_(solve(seconds_since, cache=cache), 9)
    # The first revision of a page has no parent -- should not error.
    cache = {
        revision.metadata: FakeRevisionMetadata(Timestamp(10)),
        parent_revision.metadata: None,
    }
    eq_(solve(seconds_since, cache=cache), 0)
def main(argv=None):
    """Command-line entry point: parse arguments and dispatch to run()."""
    args = docopt.docopt(__doc__, argv=argv)
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s')
    revert_radius = int(args['--revert-radius'])
    # CLI value is in hours; convert to seconds.
    revert_window = int(args['--revert-window']) * (60 * 60)
    if args['--host']:
        session = mwapi.Session(args['--host'],
                                user_agent="ORES revert labeling utility")
    else:
        # No API host given: run without a session.
        session = None
    dumpf = args['--dump-file']
    verbose = args['--verbose']
    # Optional timestamp bounds; empty/absent values stay falsy.
    start = args['--start']
    if start:
        start = Timestamp(start)
    end = args['--end']
    if end:
        end = Timestamp(end)
    reverted_only = args['--reverted-only']
    trusted_groups = args['--trusted-groups']
    if trusted_groups:
        # Comma-separated group names, expanded to the set of member users.
        trusted_groups = trusted_groups.split(',')
        trusted_users = load_user_group_members(trusted_groups, session)
    else:
        trusted_users = None
    trusted_edits = args['--trusted-edits']
    if trusted_edits:
        trusted_edits = int(trusted_edits)
    check_blocked = args['--check-blocked']
    run(dumpf, session, start, end, revert_radius, revert_window,
        reverted_only, trusted_users, trusted_edits, check_blocked,
        verbose=verbose)
def test_log_item(): XML = u""" <logitem> <id>6</id> <timestamp>2004-12-23T03:34:26Z</timestamp> <contributor> <username>Brockert</username> <id>50095</id> </contributor> <comment>content was: '#redirect [[Template:UserBrockert]]', an old experiment of mine, now being moved around by bots</comment> <type>delete</type> <action>delete</action> <logtitle>Template:UserBrockert</logtitle> <params xml:space="preserve" /> </logitem> """ # noqa namespace_map = {u"Template": Namespace(10, u"Template")} log_item = LogItem.from_element(ElementIterator.from_string(XML), namespace_map) eq_(log_item.id, 6) eq_(log_item.timestamp, Timestamp(u"2004-12-23T03:34:26Z")) eq_( log_item.comment, u"content was: '#redirect [[Template:UserBrockert]]', an old " + u"experiment of mine, now being moved around by bots") eq_(log_item.user.id, 50095) eq_(log_item.user.text, u"Brockert") eq_(log_item.page.namespace, 10) eq_(log_item.page.title, u"UserBrockert") eq_(log_item.type, u"delete") eq_(log_item.action, u"delete") eq_(log_item.params, None) eq_(log_item.deleted.action, None) eq_(log_item.deleted.user, False) eq_(log_item.deleted.comment, False) eq_(log_item.deleted.restricted, None) NULL_TITLE_XML = u""" <logitem> <id>6</id> <timestamp>2004-12-23T03:34:26Z</timestamp> <contributor> <username>Brockert</username> <id>50095</id> </contributor> <comment>content was: '#redirect [[Template:UserBrockert]]', an old experiment of mine, now being moved around by bots</comment> <type>delete</type> <action>delete</action> <logtitle /> <params xml:space="preserve" /> </logitem> """ # noqa log_item = LogItem.from_element( ElementIterator.from_string(NULL_TITLE_XML)) eq_(log_item.page.namespace, None) eq_(log_item.page.title, None)
def process_seconds_since(parent_revision_metadata, revision_metadata):
    """Return the seconds elapsed between a revision and its parent.

    Missing metadata or a missing parent timestamp falls back so the
    difference is zero (never negative from a missing parent).
    """
    if revision_metadata is not None:
        revision_timestamp = revision_metadata.timestamp
    else:
        revision_timestamp = Timestamp(0)

    # Default to the revision's own timestamp => a zero-second delta.
    previous_timestamp = revision_timestamp
    if parent_revision_metadata is not None \
            and parent_revision_metadata.timestamp is not None:
        previous_timestamp = parent_revision_metadata.timestamp

    return revision_timestamp - previous_timestamp
def get_assessment(self, wikitext):
    '''
    Parse the given wikitext and extract any assessment rating.

    If multiple ratings are present, the highest rating is used.
    The same approach is used in the research paper below, where a
    low amount of disagreement was found between using a majority
    vote and the highest rating.

    Warncke-Wang, M., Ayukaev, V. R., Hecht, B., and Terveen, L.
    "The Success and Failure of Quality Improvement Projects in
    Peer Production Communities", in CSCW 2015.

    :param wikitext: wikitext of a talk page
    :returns: assessment rating
    '''
    # 'na' is the fallback returned when no valid rating is found.
    rating = 'na'
    ratings = []  # numeric ratings

    # Helper objects, the wikiclass extractor wants `mwxml.Page' objects
    Revision = namedtuple("Revisions", ['id', 'timestamp', 'sha1', 'text'])

    class MWXMLPage:
        # Minimal duck-typed stand-in for mwxml.Page: exposes title,
        # namespace, and iterates its revisions.
        def __init__(self, title, namespace, revisions):
            self.title = title
            self.namespace = namespace
            self.revisions = revisions

        def __iter__(self):
            return iter(self.revisions)

    # NOTE: The assessments are at the top of the page,
    # and the templates are rather small,
    # so if the page is > 8k, truncate.
    if len(wikitext) > 8 * 1024:
        wikitext = wikitext[:8 * 1024]

    # Extract rating observations from a dummy `mwxml.Page` object
    # where the only revision is our wikitext
    observations = enwiki.extract(
        MWXMLPage(self.title(), 1, [Revision(1, Timestamp(1), "aaa",
                                             wikitext)]))
    for observation in observations:
        try:
            # Map the textual wp10 label to its numeric rank; unknown
            # labels are ignored.
            ratings.append(self._wp10_scale[observation['wp10']])
        except KeyError:
            pass  # invalid rating

    if ratings:
        # set rating to the highest rating, but the str, not ints
        rating = {v: k for k, v in self._wp10_scale.items()}[max(ratings)]
    return (rating)
def test_user_registration():
    """seconds_since_registration across anon, old-user, broken cases."""
    feature = revision.user.seconds_since_registration
    # Normal case: registered 10 seconds before the revision.
    cache = {
        revision_oriented.revision.timestamp: Timestamp(10),
        revision_oriented.revision.user.id: 10,
        revision_oriented.revision.user.info.registration: Timestamp(0),
    }
    assert solve(feature, cache=cache) == 10
    # Anonymous editor (id 0, no registration): zero seconds.
    cache = {
        revision_oriented.revision.timestamp: Timestamp(10),
        revision_oriented.revision.user.id: 0,
        revision_oriented.revision.user.info.registration: None,
    }
    assert solve(feature, cache=cache) == 0
    # Old account with no recorded registration falls back to the epoch.
    cache = {
        revision_oriented.revision.timestamp: MW_REGISTRATION_EPOCH + 10,
        revision_oriented.revision.user.id: 10,
        revision_oriented.revision.user.info.registration: None,
    }
    assert solve(feature, cache=cache) == 10
    # Registration after the revision (broken data): one-year fallback.
    cache = {
        revision_oriented.revision.timestamp: Timestamp(0),
        revision_oriented.revision.user.id: 10,
        revision_oriented.revision.user.info.registration: Timestamp(10),
    }
    assert solve(feature, cache=cache) == 60 * 60 * 24 * 365
    assert pickle.loads(pickle.dumps(feature)) == feature
def revision_metadata_from_doc(cls, rev_doc):
    """Construct a RevisionMetadata from an API revision document.

    Returns None when `rev_doc` is None.  An absent or unparseable
    'timestamp' becomes None rather than raising.
    """
    if rev_doc is None:
        return None
    try:
        timestamp = Timestamp(rev_doc.get('timestamp'))
    except (TypeError, ValueError):
        # A missing key yields None (-> TypeError from Timestamp); a
        # malformed value yields ValueError.  Either way, record no
        # timestamp instead of crashing.
        timestamp = None
    page_doc = rev_doc.get('page', {})
    return RevisionMetadata(rev_doc.get('revid'),
                            rev_doc.get('parentid'),
                            rev_doc.get('user'),
                            rev_doc.get('userid'),
                            timestamp,
                            rev_doc.get('comment'),
                            page_doc.get('pageid'),
                            page_doc.get('ns'),
                            page_doc.get('title'),
                            rev_doc.get('size'),
                            'minor' in rev_doc)
def main():
    """Aggregate revert JSON documents from stdin into monthly,
    per-namespace counts of reverts / bot reverts and write TSV rows to
    stdout."""
    args = docopt.docopt(__doc__)
    HEADINGS = [
        "month", "page_namespace", "reverts", "bot_reverts", "bot_reverteds",
        "bot2bot_reverts"
    ]
    if args['--bots']:
        bots = {u.strip() for u in open(args['--bots'])}
    else:
        # BUG FIX: the membership tests below ran unconditionally, so
        # `bots = None` raised TypeError on the first document.  An empty
        # set keeps every `in bots` test False without a crash.
        bots = set()
    logging.basicConfig(
        level=logging.WARNING,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s')
    writer = mysqltsv.Writer(sys.stdout, headers=HEADINGS)
    # month -> namespace -> counter name -> count
    nmc = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
    for doc in read_json_lines(sys.stdin):
        reverted_username = doc['reverteds'][-1].get('user', {}).get('text')
        reverting_username = doc['reverting'].get('user', {}).get('text')
        # Skip self-reverts.
        if reverted_username == reverting_username:
            continue
        dbts = Timestamp(doc['reverting']['timestamp']).short_format()
        month = dbts[:6] + "01"  # normalize to the 1st of the month
        namespace = doc['reverting']['page']['namespace']
        nmc[month][namespace]['reverts'] += 1
        nmc[month][namespace]['bot_reverts'] += reverting_username in bots
        nmc[month][namespace]['bot_reverteds'] += reverted_username in bots
        nmc[month][namespace]['bot2bot_reverts'] += (
            reverting_username in bots and reverted_username in bots)
    for month in sorted(nmc.keys()):
        for page_namespace in sorted(nmc[month].keys()):
            counts = nmc[month][page_namespace]
            writer.write([
                month, page_namespace, counts['reverts'],
                counts['bot_reverts'], counts['bot_reverteds'],
                counts['bot2bot_reverts']
            ])
def user_info_from_doc(cls, user_doc):
    """Construct a UserInfo from an API user document.

    Returns None when `user_doc` is None.  An absent or unparseable
    'registration' becomes None rather than raising.
    """
    if user_doc is None:
        return None
    try:
        registration = Timestamp(user_doc.get('registration'))
    except (TypeError, ValueError):
        # A missing key yields None (-> TypeError from Timestamp); a
        # malformed value yields ValueError.  Treat both as "unknown".
        registration = None
    return UserInfo(user_doc.get('userid'),
                    user_doc.get('name'),
                    user_doc.get('editcount'),
                    registration,
                    user_doc.get('groups', []),
                    user_doc.get('implicitgroups', []),
                    "emailable" in user_doc,
                    user_doc.get('gender'),
                    user_doc.get('blockid'),
                    user_doc.get('blockedby'),
                    user_doc.get('blockedbyid'),
                    user_doc.get('blockedtimestamp'),
                    user_doc.get('blockreason'),
                    user_doc.get('blockexpiry'))
def test_age():
    """User age feature across anon, old-account, and import edge cases."""
    FakeRevisionMetadata = namedtuple("FakeRevisionMetadata",
                                      ['user_id', 'timestamp'])
    FakeUserInfo = namedtuple("FakeUserInfo", ['registration'])
    # Registered user: age is the revision/registration delta.
    cache = {
        revision.metadata: FakeRevisionMetadata(10, Timestamp(10)),
        user.info: FakeUserInfo(Timestamp(0)),
    }
    eq_(solve(age, cache=cache), 10)
    # Anonymous (no user_id): age is zero.
    cache = {
        revision.metadata: FakeRevisionMetadata(None, Timestamp(10)),
        user.info: FakeUserInfo(Timestamp(0)),
    }
    eq_(solve(age, cache=cache), 0)
    # Old accounts may predate registration logging; age must still be > 0.
    cache = {
        revision.metadata: FakeRevisionMetadata(10,
                                                Timestamp("20140101010101")),
        user.info: FakeUserInfo(None),
    }
    assert solve(age, cache=cache) > 0
    # Imported revisions can precede registration; never return negative.
    cache = {
        revision.metadata: FakeRevisionMetadata(10, Timestamp(0)),
        user.info: FakeUserInfo(Timestamp(1)),
    }
    eq_(solve(age, cache=cache), 0)
    # Missing user info entirely: zero.
    cache = {
        revision.metadata: FakeRevisionMetadata(10, Timestamp(0)),
        user.info: None,
    }
    eq_(solve(age, cache=cache), 0)
def test_user_info_from_doc():
    """APIExtractor.user_info_from_doc maps an API user document onto
    UserInfo attributes; partial and None documents are handled."""
    doc = {
        "userid": 24278012,
        "name": "Hoardablehotsauce",
        "editcount": 5,
        "registration": "2015-02-28T22:25:37Z",
        "groups": [
            "*",
            "user"
        ],
        "implicitgroups": [
            "*",
            "user"
        ],
        "blockid": "5752570",
        "blockedby": "Cryptic",
        "blockedbyid": "295294",
        "blockedtimestamp": "2015-02-28T22:43:23Z",
        "blockreason": "{{uw-softerblock}} <!-- Promotional username, "
                       "soft block -->",
        "blockexpiry": "infinity",
        "gender": "unknown"
    }
    info = api.APIExtractor.user_info_from_doc(doc)
    eq_(info.name, "Hoardablehotsauce")
    eq_(info.groups, ['*', "user"])
    eq_(info.implicitgroups, ['*', "user"])
    eq_(info.registration, Timestamp("2015-02-28T22:25:37Z"))
    # String ids in the doc are coerced to ints by the UserInfo constructor.
    eq_(info.block_id, 5752570)
    eq_(info.blocked_by, "Cryptic")
    eq_(info.blocked_by_id, 295294)
    eq_(info.blocked_timestamp, Timestamp("2015-02-28T22:43:23Z"))
    eq_(info.block_reason,
        "{{uw-softerblock}} <!-- Promotional username, soft block -->")
    eq_(info.block_expiry, "infinity")
    eq_(info.gender, "unknown")

    # Without registration/block fields, the attributes come back None.
    doc = {
        "userid": 24278012,
        "name": "Hoardablehotsauce",
        "editcount": 5,
        "groups": [
            "*",
            "user"
        ],
        "implicitgroups": [
            "*",
            "user"
        ],
        "gender": "unknown"
    }
    info = api.APIExtractor.user_info_from_doc(doc)
    eq_(info.registration, None)
    eq_(info.blocked_timestamp, None)

    # A None document maps to None.
    doc = None
    info = api.APIExtractor.user_info_from_doc(doc)
    eq_(info, None)
def test_extractor():
    """frwiki extractor: for each (project, label) pair the observation
    should carry the timestamp of the label's (final) occurrence, with
    reverted assessments handled."""
    Revision = namedtuple("Revisions", ['id', 'timestamp', 'sha1', 'text'])

    class Page:
        # Minimal duck-typed stand-in for an `mwxml.Page`: iterable of
        # revisions with title/namespace attributes.
        def __init__(self, title, namespace, revisions):
            self.title = title
            self.namespace = namespace
            self.revisions = revisions

        def __iter__(self):
            return iter(self.revisions)

    revisions = [
        Revision(
            1, Timestamp(0), "aaa",
            "{{Wikiprojet\n" +
            "|Seconde Guerre mondiale|maximum\n" +
            "|Japon|maximum\n" +
            "|Forces armées des États-Unis|maximum\n" +
            "|Nucléaire|maximum\n" +
            "|avancement=e}}"
        ),
        Revision(
            2, Timestamp(1), "bbb",
            "{{talk page}}" +
            "{{Wikiprojet\n" +
            "|Seconde Guerre mondiale|maximum\n" +
            "|Japon|maximum\n" +
            "|Forces armées des États-Unis|maximum\n" +
            "|Nucléaire|maximum\n" +
            "|avancement=AdQ}}"
        ),
        Revision(
            3, Timestamp(2), "aaa",
            "{{talk page}}" +
            "{{Wikiprojet\n" +
            "|Seconde Guerre mondiale|maximum\n" +
            "|Japon|maximum\n" +
            "|Forces armées des États-Unis|maximum\n" +
            "|Nucléaire|maximum\n" +
            "|avancement=e}}"
        ),
        Revision(
            4, Timestamp(3), "ccc",
            "{{talk page}}" +
            "{{Wikiprojet\n" +
            "|Seconde Guerre mondiale|maximum\n" +
            "|Japon|maximum\n" +
            "|Forces armées des États-Unis|maximum\n" +
            "|Nucléaire|maximum\n" +
            "|Sélection transversale|faible\n" +
            "|avancement=Ébauche}}"
        ),
        Revision(
            5, Timestamp(4), "aaa",
            "{{talk page}}" +
            "{{Wikiprojet\n" +
            "|Seconde Guerre mondiale|maximum\n" +
            "|Japon|maximum\n" +
            "|Forces armées des États-Unis|maximum\n" +
            "|Nucléaire|maximum\n" +
            "|avancement=e}}"
        ),
        Revision(
            6, Timestamp(4), "ccc",
            "{{talk page}}" +
            "{{Wikiprojet\n" +
            "|Seconde Guerre mondiale|maximum\n" +
            "|Japon|maximum\n" +
            "|Forces armées des États-Unis|maximum\n" +
            "|Nucléaire|maximum\n" +
            "|Sélection transversale|faible\n" +
            "|avancement=Ébauche}}"
        ),
        Revision(
            7, Timestamp(5), "ddd",
            "{{talk page}}" +
            "{{Wikiprojet\n" +
            "|Seconde Guerre mondiale|maximum\n" +
            "|Japon|maximum\n" +
            "|Forces armées des États-Unis|maximum\n" +
            "|Nucléaire|maximum\n" +
            "|Sélection transversale|faible\n" +
            "|avancement= bd }}"
        ),
        Revision(
            8, Timestamp(6), "eee",
            "{{talk page}}" +
            "{{Wikiprojet\n" +
            "|Seconde Guerre mondiale|maximum\n" +
            "|Japon|maximum\n" +
            "|Forces armées des États-Unis|maximum\n" +
            "|Nucléaire|maximum\n" +
            "|Sélection transversale|faible\n" +
            "|avancement= Bon début }}"
        ),
        Revision(
            9, Timestamp(6), "eee",
            "{{talk page}}" +
            "{{Wikiprojet\n" +
            "|Seconde Guerre mondiale|maximum\n" +
            "|Japon|maximum\n" +
            "|Forces armées des États-Unis|maximum\n" +
            "|Nucléaire|maximum\n" +
            "|Sélection transversale|faible\n" +
            "|avancement= b }}"
        ),
        Revision(
            10, Timestamp(7), "fff",
            "{{talk page}}" +
            "{{Wikiprojet\n" +
            "|Seconde Guerre mondiale|maximum\n" +
            "|Japon|maximum\n" +
            "|Forces armées des États-Unis|maximum\n" +
            "|Nucléaire|maximum\n" +
            "|Sélection transversale|faible\n" +
            "|avancement= a }}"
        ),
        Revision(
            11, Timestamp(8), "fff",
            "{{talk page}}" +
            "{{Wikiprojet\n" +
            "|Seconde Guerre mondiale|maximum\n" +
            "|Japon|maximum\n" +
            "|Forces armées des États-Unis|maximum\n" +
            "|Nucléaire|maximum\n" +
            "|Sélection transversale|faible\n" +
            "|avancement= ba }}"
        ),
        Revision(
            12, Timestamp(9), "fff",
            "{{talk page}}" +
            "{{Wikiprojet\n" +
            "|Seconde Guerre mondiale|maximum\n" +
            "|Japon|maximum\n" +
            "|Forces armées des États-Unis|maximum\n" +
            "|Nucléaire|maximum\n" +
            "|Sélection transversale|faible\n" +
            "|avancement= AdQ }}"
        )
    ]
    page = Page("Foobar", 1, revisions)
    observations = frwiki.extract(page)
    # Index observations by (project, normalized label) for lookup below.
    project_labels = {(ob['project'], ob['wp10']): ob
                      for ob in observations}
    expected = [("wikiprojet", "e", Timestamp(0)),
                ("wikiprojet", "bd", Timestamp(5)),
                ("wikiprojet", "b", Timestamp(6)),
                ("wikiprojet", "a", Timestamp(7)),
                ("wikiprojet", "ba", Timestamp(8)),
                ("wikiprojet", "adq", Timestamp(9))]
    print(project_labels)
    for proj, lab, timestamp in expected:
        ob = project_labels[(proj, lab)]
        assert ob['timestamp'] == timestamp
def test_extractor():
    """ptwiki extractor: assessment observations per page, including
    pages whose assessment history contains reverts."""
    Revision = namedtuple("Revision", ['id', 'timestamp', 'sha1', 'text'])

    class Page:
        # Minimal duck-typed stand-in for an `mwxml.Page`: iterable of
        # revisions with title/namespace attributes.
        def __init__(self, title, namespace, revisions):
            self.title = title
            self.namespace = namespace
            self.revisions = revisions

        def __iter__(self):
            return iter(self.revisions)

    pages = [
        Page("Page without reverts", 1, [
            Revision(
                1, Timestamp(1), "aaa",
                "{{Brasil/Marca|qualidade=1|importância=3}}\n" +
                "{{Geografia/Marca|qualidade=1|importância=?|rev=20110614}}"),
            Revision(2, Timestamp(2), "bbb",
                     "{{marca de projeto|rev=20120715|1|Brasil|3}}"),
            Revision(3, Timestamp(3), "ccc",
                     "{{Classificação/Anfíbios|qualidade=2|importância=1}}"),
            Revision(
                4, Timestamp(4), "ddd",
                "{{Marca de projeto|qualidade=3|Biografias|4|rev=20140917}}"),
            Revision(
                5, Timestamp(5), "eee",
                "{{Marca de projeto|qualidade=3||Biografias|2|rev=20151018}}"),
            Revision(
                6, Timestamp(6), "fff",
                "{{Wikipedia:Projetos/Subdivisões do Brasil/Artigo membro" +
                "|qualidade=5|importância=2}}"),
            Revision(7, Timestamp(7), "ggg", "{{Marca de projeto|AB}}"),
            Revision(8, Timestamp(8), "hhh",
                     "{{Marca de projeto|AD|Biografias|4}}")
        ]),
        Page("Page with single revert", 1, [
            Revision(1, Timestamp(1), "aaa", "{{Marca de projeto|2}}"),
            Revision(2, Timestamp(2), "bbb", "{{Marca de projeto|3}}"),
            Revision(3, Timestamp(3), "ccc", "{{Marca de projeto|4}}"),
            Revision(
                4, Timestamp(4), "aaa",
                "{{Marca de projeto|2}}"  # Vandal messing up the template
            ),
            Revision(
                5, Timestamp(5), "ccc",
                "{{Marca de projeto|4}}"  # Patroller reverting vandal
            )
        ]),
        Page("Page with overlaping reverts", 1, [
            Revision(1, Timestamp(1), "aaa", "{{Marca de projeto|1}}"),
            Revision(2, Timestamp(2), "bbb", "{{Marca de projeto|2}}"),
            Revision(3, Timestamp(3), "ccc", "{{Marca de projeto|3}}"),
            Revision(
                4, Timestamp(4), "aaa",
                "{{Marca de projeto|1}}"  # Vandal messing up the template
            ),
            Revision(
                5, Timestamp(5), "ccc",
                "{{Marca de projeto|3}}"  # Rollback
            ),
            Revision(
                6, Timestamp(6), "bbb",
                "{{Marca de projeto|2}}"  # Active editor reevaluates the page
            ),
            Revision(
                7, Timestamp(7), "ddd",
                "{{Marca de projeto|4}}"  # Later on, the page is improved
            )
        ]),
        Page("Page with concentric reverts", 1, [
            Revision(1, Timestamp(1), "aaa", "{{Marca de projeto|1}}"),
            Revision(2, Timestamp(2), "bbb", "{{Marca de projeto|2}}"),
            Revision(3, Timestamp(3), "ccc", "{{Marca de projeto|3}}"),
            Revision(
                4, Timestamp(4), "aaa",
                "{{Marca de projeto|1}}"  # Vandal messing up the template
            ),
            Revision(
                5, Timestamp(5), "ccc",
                "{{Marca de projeto|3}}"  # Rollback
            ),
            Revision(6, Timestamp(6), "ddd",
                     "{{Marca de projeto|2}}<!-- re-evaluation -->")
        ])
    ]
    # One expectation list per page, in the same order as `pages`.
    expectations = [[("marca de projeto", "1", Timestamp(1)),
                     ("marca de projeto", "2", Timestamp(3)),
                     ("marca de projeto", "3", Timestamp(5)),
                     ("marca de projeto", "5", Timestamp(6)),
                     ("marca de projeto", "6", Timestamp(8))],
                    [("marca de projeto", "2", Timestamp(1)),
                     ("marca de projeto", "3", Timestamp(2)),
                     ("marca de projeto", "4", Timestamp(3))],
                    [("marca de projeto", "1", Timestamp(1)),
                     ("marca de projeto", "2", Timestamp(2)),
                     ("marca de projeto", "4", Timestamp(7))],
                    [("marca de projeto", "1", Timestamp(1)),
                     ("marca de projeto", "2", Timestamp(2)),
                     ("marca de projeto", "3", Timestamp(3)),
                     ("marca de projeto", "2", Timestamp(6))]]
    for page, expected in zip(pages, expectations):
        observations = list(ptwiki.extract(page))
        lab_tuples = [(ob['project'], ob['wp10'], ob['timestamp'])
                      for ob in observations]
        assert lab_tuples == expected
def test_extractor():
    """ruwiki extractor: quality labels (with HTML comments embedded in
    the template) map to observations carrying first-seen timestamps."""
    Revision = namedtuple("Revisions", ['id', 'timestamp', 'sha1', 'text'])

    class Page:
        # Minimal duck-typed stand-in for an `mwxml.Page`: iterable of
        # revisions with title/namespace attributes.
        def __init__(self, title, namespace, revisions):
            self.title = title
            self.namespace = namespace
            self.revisions = revisions

        def __iter__(self):
            return iter(self.revisions)

    revisions = [
        Revision(
            1, Timestamp(0), "aaa",
            "{{Статья проекта WikiProject\n" +
            "|важность=высшая\n" +
            "|уровень=IV\n<!-- HTML test comment -->}}"),
        Revision(
            2, Timestamp(1), "bbb",
            "{{Статья проекта WikiProject\n" +
            "|важность=высшая\n" +
            "|уровень=III}}"),
        Revision(
            3, Timestamp(2), "aaa",
            "{{Статья проекта WikiProject\n" +
            "|важность=высшая\n" +
            "|уровень=IV<!-- HTML test comment -->}}"),
        Revision(
            4, Timestamp(3), "bbb",
            "{{Статья проекта WikiProject\n" +
            "|важность=высшая\n" +
            "|уровень=III}}"),
        Revision(
            5, Timestamp(4), "ccc",
            "{{Статья проекта WikiProject\n" +
            "|важность=высшая\n" +
            "|уровень=II}}"),
        Revision(
            6, Timestamp(5), "bbb",
            "{{Статья проекта WikiProject\n" +
            "|важность=высшая\n" +
            "|уровень=III}}"),
        Revision(
            7, Timestamp(6), "ccc",
            "{{Статья проекта WikiProject\n" +
            "|важность=высшая\n" +
            "|уровень=II}}"),
        Revision(
            8, Timestamp(7), "ddd",
            "{{Статья проекта WikiProject\n" +
            "|важность=высшая\n" +
            "|уровень=I}}"),
        Revision(
            9, Timestamp(8), "eee",
            "{{Статья проекта WikiProject\n" +
            "|важность=высшая\n" +
            "|уровень=ХС}}"),
        Revision(
            10, Timestamp(9), "fff",
            "{{Статья проекта WikiProject\n" +
            "|важность=высшая\n" +
            "|уровень=дс}}"),
        Revision(
            11, Timestamp(10), "eee",
            "{{Статья проекта WikiProject\n" +
            "|важность=высшая\n" +
            "|уровень=ХС}}"),
        Revision(
            12, Timestamp(11), "fff",
            "{{Статья проекта WikiProject\n" +
            "|важность=высшая\n" +
            "|уровень=дс}}"),
        Revision(
            13, Timestamp(12), "ggg",
            "{{Статья проекта WikiProject\n" +
            "|важность=высшая\n" +
            "|уровень=ИС<!-- HTML test comment -->}}")
    ]
    page = Page("Foobar", 1, revisions)
    observations = ruwiki.extract(page)
    # Index observations by (project, normalized label) for lookup below.
    # NOTE(review): the "дс" revisions appear upper-cased ("ДС") in
    # `expected` -- presumably the extractor normalizes case; confirm.
    project_labels = {(ob['project'], ob['wp10']): ob
                      for ob in observations}
    expected = [("wikiproject", "IV", Timestamp(0)),
                ("wikiproject", "III", Timestamp(1)),
                ("wikiproject", "II", Timestamp(4)),
                ("wikiproject", "I", Timestamp(7)),
                ("wikiproject", "ХС", Timestamp(8)),
                ("wikiproject", "ДС", Timestamp(9)),
                ("wikiproject", "ИС", Timestamp(12))]
    print(project_labels)
    for proj, lab, timestamp in expected:
        ob = project_labels[(proj, lab)]
        assert ob['timestamp'] == timestamp
def diffs2persistence(rev_docs, window_size=50, revert_radius=15,
                      sunset=None, verbose=False):
    """
    Processes a sorted and page-partitioned sequence of revision documents
    into and adds a 'persistence' field to them containing statistics
    about how each token "added" in the revision persisted through future
    revisions.

    :Parameters:
        rev_docs : `iterable` ( `dict` )
            JSON documents of revision data containing a 'diff' field as
            generated by ``dump2diffs``.  It's assumed that rev_docs are
            partitioned by page and otherwise in chronological order.
        window_size : `int`
            The size of the window of revisions from which persistence data
            will be generated.
        revert_radius : `int`
            The number of revisions back that a revert can reference.
        sunset : :class:`mwtypes.Timestamp`
            The date of the database dump we are generating from.  This is
            used to apply a 'time visible' statistic.  If not set, now()
            will be assumed.
        verbose : `bool`
            Prints out dots and stuff to stderr

    :Returns:
        A generator of rev_docs with a 'persistence' field containing
        statistics about individual tokens.
    """
    rev_docs = mwxml.utilities.normalize(rev_docs)
    window_size = int(window_size)
    revert_radius = int(revert_radius)
    sunset = Timestamp(sunset) if sunset is not None \
        else Timestamp(time.time())

    # Group the docs by page
    page_docs = groupby(rev_docs, key=lambda d: d['page']['title'])

    for page_title, rev_docs in page_docs:

        if verbose:
            sys.stderr.write(page_title + ": ")

        # We need a look-ahead to know how long this revision was visible
        rev_docs = peekable(rev_docs)

        # The window allows us to manage memory
        window = deque(maxlen=window_size)

        # The state does the actual processing work
        state = DiffState(revert_radius=revert_radius)

        while rev_docs:
            rev_doc = next(rev_docs)
            next_doc = rev_docs.peek(None)

            # Safest to recalculate sha1.  BUG FIX: the original referenced
            # an undefined name `text`; hash the revision document's text.
            rev_doc["sha1"] = sha1(
                bytes(rev_doc['text'], 'utf8',
                      errors="replace")).hexdigest()

            if next_doc is not None:
                # Visible until the next revision on this page.
                seconds_visible = Timestamp(next_doc['timestamp']) - \
                    Timestamp(rev_doc['timestamp'])
            else:
                # Last revision of the page: visible until the sunset.
                seconds_visible = sunset - Timestamp(rev_doc['timestamp'])

            if seconds_visible < 0:
                # Out-of-order timestamps; clamp visibility at zero.
                # (warning() -- warn() is a deprecated alias.)
                logger.warning("Seconds visible {0} is less than zero."
                               .format(seconds_visible))
                seconds_visible = 0

            _, tokens_added, _ = \
                state.update_opdocs(rev_doc['sha1'], rev_doc['diff']['ops'],
                                    (rev_doc['user'], seconds_visible))

            if len(window) == window_size:
                # Window is full; emit the oldest revision's statistics.
                old_doc, old_added = window[0]
                window.append((rev_doc, tokens_added))
                persistence = token_persistence(old_doc, old_added, window,
                                                None)
                old_doc['persistence'] = persistence
                yield old_doc
                if verbose:
                    sys.stderr.write(".")
                    sys.stderr.flush()
            else:
                window.append((rev_doc, tokens_added))

        # Flush the remaining revisions, using the sunset as the horizon.
        while len(window) > 0:
            old_doc, old_added = window.popleft()
            persistence = token_persistence(old_doc, old_added, window,
                                            sunset)
            old_doc['persistence'] = persistence
            yield old_doc
            if verbose:
                sys.stderr.write("_")
                sys.stderr.flush()

        if verbose:
            sys.stderr.write("\n")
from mwtypes import Timestamp from ..datasources import revision, user from .feature import Feature # Date that registrations started being recorded in MediaWiki USER_REGISTRATION_EPOCH = Timestamp("20050101000000") def process_age(user_info, revision_metadata): if user_info is None: return 0 if process_is_anon(revision_metadata): # Anonymous so age == zero return 0 else: registration_delta = revision_metadata.timestamp - \ (user_info.registration or USER_REGISTRATION_EPOCH) return max(registration_delta, 0) age = Feature("user.age", process_age, returns=int, depends_on=[user.info, revision.metadata]) """ Represents age of user when the edit was made in seconds. :Returns: int :Example:
def main():
    """Read revert JSON documents from stdin and write one TSV row per
    revert (reverted-revision + reverting-revision fields) to stdout."""
    args = docopt.docopt(__doc__)
    HEADINGS = [
        "rev_id", "rev_timestamp", "rev_user", "rev_user_text", "rev_page",
        "rev_sha1", "rev_minor_edit", "rev_deleted", "rev_parent_id",
        "archived", "reverting_id", "reverting_timestamp", "reverting_user",
        "reverting_user_text", "reverting_page", "reverting_sha1",
        "reverting_minor_edit", "reverting_deleted", "reverting_parent_id",
        "reverting_archived", "reverting_comment", "rev_revert_offset",
        "revisions_reverted", "reverted_to_rev_id", "page_namespace"
    ]
    if args['--users']:
        # Optional whitelist: one username per line.
        users = {u.strip() for u in open(args['--users'])}
    else:
        users = None
    writer = mysqltsv.Writer(sys.stdout, headers=HEADINGS)
    for doc in (json.loads(l) for l in sys.stdin):
        reverted_username = doc['reverteds'][-1].get('user', {}).get('text')
        reverting_username = doc['reverting'].get('user', {}).get('text')
        # Skip self-reverts.
        if reverted_username == reverting_username:
            continue
        # With a whitelist, keep only reverts where both parties are listed.
        if users is not None and \
                not (reverted_username in users and
                     reverting_username in users):
            continue
        writer.write([
            doc['reverteds'][-1]['id'],  # rev_id
            Timestamp(doc['reverteds'][-1]
                      ['timestamp']).short_format(),  # rev_timestamp
            doc['reverteds'][-1].get('user', {}).get('id'),  # rev_user
            doc['reverteds'][-1].get('user', {}).get('text'),  # rev_user_text
            doc['reverteds'][-1]['page']['id'],  # rev_page
            doc['reverteds'][-1].get('sha1'),  # rev_sha1
            doc['reverteds'][-1]['minor'],  # rev_minor_edit
            doc['reverteds'][-1]['deleted']['text'],  # rev_deleted
            doc['reverteds'][-1].get('parent_id'),  # rev_parent_id
            False,  # archived
            doc['reverting']['id'],  # reverting_id
            Timestamp(doc['reverting']
                      ['timestamp']).short_format(),  # reverting_timestamp
            doc['reverting'].get('user', {}).get('id'),  # reverting_user
            doc['reverting'].get('user', {}).get('text'),  # reverting_user_text
            doc['reverting']['page']['id'],  # reverting_page
            doc['reverting'].get('sha1'),  # reverting_sha1
            doc['reverting']['minor'],  # reverting_minor_edit
            doc['reverting']['deleted']['text'],  # reverting_deleted
            doc['reverting'].get('parent_id'),  # reverting_parent_id
            False,  # reverting_archived
            doc['reverting'].get('comment', '-'),  # reverting_comment
            # NOTE(review): offset and count are both len(reverteds) --
            # confirm the offset column is intentionally the same value.
            len(doc['reverteds']),  # rev_revert_offset
            len(doc['reverteds']),  # revisions_reverted
            doc['reverted_to']['id'],  # reverted_to_rev_id
            doc['reverting']['page']['namespace']  # page_namespace
        ])
        # Progress indicator: one dot per row written.
        sys.stderr.write(".")
        sys.stderr.flush()
    sys.stderr.write("\n")