コード例 #1
0
def test_extract_cite_history():
    """Run ``extract_cite_history`` over a fake five-revision page.

    Every space-separated word of a revision's text is treated as an
    identifier of type ``'fake'``.  The expected rows are ``id1`` and ``id2``
    attributed to revision 1 and ``id4`` attributed to revision 4.
    """
    FakeRevision = namedtuple("Revision", ['id', 'timestamp', 'text'])
    FakeExtractor = namedtuple("Extractor", ['extract'])

    # Revision texts in order; revision N gets id N and Timestamp(N).
    revision_texts = [
        "id1 id2",
        "id1 id3",
        "id1 id2 id3",
        "id1 id2 id4",
        "id1 id2 id4",
    ]

    class FakePage:
        def __init__(self, page_id, title):
            self.id = page_id
            self.title = title

        def __iter__(self):
            revisions = [FakeRevision(number, Timestamp(number), text)
                         for number, text in enumerate(revision_texts,
                                                       start=1)]
            return iter(revisions)

    def extract(text):
        return (Identifier('fake', word) for word in text.split(" "))

    fake_page = FakePage(1, "Title")
    extractor = FakeExtractor(extract)

    expected = [
        (1, "Title", 1, Timestamp(1), "fake", "id1"),
        (1, "Title", 1, Timestamp(1), "fake", "id2"),
        (1, "Title", 4, Timestamp(4), "fake", "id4"),
    ]

    # Same number of rows as expected ...
    citations = list(extract_cite_history(fake_page, [extractor]))
    eq_(len(citations), len(expected))

    # ... and every row of a second run is one of the expected rows.
    for cite in extract_cite_history(fake_page, [extractor]):
        assert cite in expected
コード例 #2
0
def test_age():
    """Solving ``age`` with a revision at 10 and page creation at 0 gives 10."""
    Metadata = namedtuple("FakeRevisionMetadata", ['timestamp'])

    revision_metadata = Metadata(Timestamp(10))
    creation_metadata = Metadata(Timestamp(0))
    cache = {revision.metadata: revision_metadata,
             page_creation.metadata: creation_metadata}

    eq_(solve(age, cache=cache), 10)
コード例 #3
0
def test_seconds_since():
    """A revision at 10 whose parent revision is at 1 solves to 9 seconds."""
    Metadata = namedtuple("FakeRevisionMetadata", ['timestamp'])

    current = Metadata(Timestamp(10))
    parent = Metadata(Timestamp(1))
    cache = {revision.metadata: current,
             parent_revision.metadata: parent}

    eq_(solve(seconds_since, cache=cache), 9)
コード例 #4
0
 def __iter__(self):
     """Return an iterator over five fake revisions.

     Revision N (1-based) has id N, ``Timestamp(N)`` and the N-th text below.
     """
     texts = ("id1 id2",
              "id1 id3",
              "id1 id2 id3",
              "id1 id2 id4",
              "id1 id2 id4")
     revisions = [FakeRevision(number, Timestamp(number), text)
                  for number, text in enumerate(texts, start=1)]
     return iter(revisions)
コード例 #5
0
ファイル: test_api.py プロジェクト: SPQRobin/revscoring
def test_user_info_from_doc():
    """Exercise ``APIExtractor.user_info_from_doc`` with three inputs:
    a blocked user's document, a document without registration or block
    fields, and ``None``.
    """
    blocked_user_doc = {
        "userid": 24278012,
        "name": "Hoardablehotsauce",
        "editcount": 5,
        "registration": "2015-02-28T22:25:37Z",
        "groups": ["*", "user"],
        "implicitgroups": ["*", "user"],
        "blockid": "5752570",
        "blockedby": "Cryptic",
        "blockedbyid": "295294",
        "blockedtimestamp": "2015-02-28T22:43:23Z",
        "blockreason": "{{uw-softerblock}} <!-- Promotional username, "
        "soft block -->",
        "blockexpiry": "infinity",
        "gender": "unknown"
    }
    blocked_info = api.APIExtractor.user_info_from_doc(blocked_user_doc)

    # (attribute name, expected value), checked in this order
    expectations = [
        ("name", "Hoardablehotsauce"),
        ("groups", ['*', "user"]),
        ("implicitgroups", ['*', "user"]),
        ("registration", Timestamp("2015-02-28T22:25:37Z")),
        ("block_id", 5752570),
        ("blocked_by", "Cryptic"),
        ("blocked_by_id", 295294),
        ("blocked_timestamp", Timestamp("2015-02-28T22:43:23Z")),
        ("block_reason",
         "{{uw-softerblock}} <!-- Promotional username, soft block -->"),
        ("block_expiry", "infinity"),
        ("gender", "unknown"),
    ]
    for attribute, expected in expectations:
        eq_(getattr(blocked_info, attribute), expected)

    # Without registration/block fields both timestamps come back as None
    minimal_doc = {
        "userid": 24278012,
        "name": "Hoardablehotsauce",
        "editcount": 5,
        "groups": ["*", "user"],
        "implicitgroups": ["*", "user"],
        "gender": "unknown"
    }
    minimal_info = api.APIExtractor.user_info_from_doc(minimal_doc)
    eq_(minimal_info.registration, None)
    eq_(minimal_info.blocked_timestamp, None)

    # A missing document yields no info at all
    eq_(api.APIExtractor.user_info_from_doc(None), None)
コード例 #6
0
def test_seconds_since():
    """``seconds_since`` is the gap to the previous user revision, or 0 when
    there is no previous user revision.
    """
    Metadata = namedtuple("FakeRevisionMetadata", ['timestamp'])

    def solve_with(previous_metadata):
        # The current revision is always at Timestamp(10)
        return solve(seconds_since, cache={
            revision.metadata: Metadata(Timestamp(10)),
            previous_user_revision.metadata: previous_metadata
        })

    eq_(solve_with(Metadata(Timestamp(1))), 9)

    # Makes sure we don't crash when there was no previous user revision
    eq_(solve_with(None), 0)
コード例 #7
0
ファイル: test_api.py プロジェクト: SPQRobin/revscoring
def test_revision_metadata_from_doc():
    """Check that ``APIExtractor.revision_metadata_from_doc`` maps each field
    of a revision document onto the matching metadata attribute.
    """
    doc = {
        "revid": 3456789,
        "parentid": 54678,
        "comment": "Wat?",
        # Must match the user_text assertion below; the fixture previously
        # held a masked placeholder ("******"), which could never equal
        # "EpochFail".
        "user": "EpochFail",
        "userid": 34567890,
        "timestamp": "2015-01-07T12:23:57Z",
        "page": {
            "pageid": 347,
            "title": "Hats",
            "ns": 0
        }
    }

    metadata = api.APIExtractor.revision_metadata_from_doc(doc)

    eq_(metadata.rev_id, 3456789)
    eq_(metadata.parent_id, 54678)
    eq_(metadata.user_id, 34567890)
    eq_(metadata.user_text, "EpochFail")
    eq_(metadata.timestamp, Timestamp("2015-01-07T12:23:57Z"))
    eq_(metadata.comment, "Wat?")
    eq_(metadata.page_id, 347)
    eq_(metadata.page_namespace, 0)
    eq_(metadata.page_title, "Hats")
コード例 #8
0
def test_hour_of_day():
    """A revision stamped 2014-09-07T19:55:00Z solves ``hour_of_day`` to 19."""
    Metadata = namedtuple("FakeRevisionMetadata", ['timestamp'])

    revision_metadata = Metadata(Timestamp('2014-09-07T19:55:00Z'))
    result = solve(hour_of_day, cache={revision.metadata: revision_metadata})

    eq_(result, 19)
コード例 #9
0
def test_seconds_since():
    """``seconds_since`` is the gap to the parent revision, or 0 when the
    parent revision metadata is ``None``.
    """
    Metadata = namedtuple("FakeRevisionMetadata", ['timestamp'])

    def solve_with(parent_metadata):
        # The current revision is always at Timestamp(10)
        return solve(seconds_since, cache={
            revision.metadata: Metadata(Timestamp(10)),
            parent_revision.metadata: parent_metadata
        })

    eq_(solve_with(Metadata(Timestamp(1))), 9)

    # Make sure we don't error when there is no parent revision
    eq_(solve_with(None), 0)
コード例 #10
0
def main(argv=None):
    """Parse the command line with docopt and pass the options to ``run``.

    ``--window`` and ``--revert-radius`` are converted to ``int``;
    ``--sunset`` becomes a ``Timestamp`` (the current time when its value is
    the literal ``"<now>"``); ``--keep-diff`` and ``--verbose`` are coerced
    to ``bool``.  Documents are read from standard input via ``read_docs``.
    """
    args = docopt.docopt(__doc__, argv=argv)

    window_size = int(args['--window'])
    revert_radius = int(args['--revert-radius'])

    sunset_arg = args['--sunset']
    sunset = Timestamp(time.time() if sunset_arg == "<now>" else sunset_arg)

    keep_diff, verbose = bool(args['--keep-diff']), bool(args['--verbose'])

    docs = read_docs(sys.stdin)
    run(docs, window_size, revert_radius, sunset, keep_diff, verbose)
コード例 #11
0
def test_age():
    """Check the user ``age`` feature for a registered user, a revision with
    no user id, a user with no registration date, and a revision that
    predates the registration.
    """
    Metadata = namedtuple("FakeRevisionMetadata", ['user_id', 'timestamp'])
    Info = namedtuple("FakeUserInfo", ['registration'])

    def solve_age(revision_metadata, user_info):
        cache = {revision.metadata: revision_metadata,
                 user.info: user_info}
        return solve(age, cache=cache)

    # Registered at 0, edit at 10
    eq_(solve_age(Metadata(10, Timestamp(10)), Info(Timestamp(0))), 10)

    # No user id on the revision
    eq_(solve_age(Metadata(None, Timestamp(10)), Info(Timestamp(0))), 0)

    # Makes sure that old users with no registration are counted appropriately.
    assert solve_age(Metadata(10, Timestamp("20140101010101")),
                     Info(None)) > 0

    # Makes sure that imports (revisions made before registration) don't return
    # negative values.
    eq_(solve_age(Metadata(10, Timestamp(0)), Info(Timestamp(1))), 0)
コード例 #12
0
    def invisible_at(self, timestamp):
        """Record that this item stopped being visible at *timestamp*.

        When ``visible_since`` is set, the difference between *timestamp* and
        it (never less than 0) is added to ``self.visible``.  In every case
        ``visible_since`` is reset to ``None``.
        """
        timestamp = Timestamp(timestamp)
        became_visible = self.visible_since

        # A missing ``visible_since`` happens with diff algorithms that will
        # detect content duplication; there is nothing to accumulate then.
        if became_visible is not None:
            elapsed = timestamp - became_visible
            self.visible += max(elapsed, 0)

        self.visible_since = None
コード例 #13
0
 def __init__(self, id, name, editcount, registration, groups,
              implicitgroups, emailable, gender, block_id, blocked_by,
              blocked_by_id, blocked_timestamp, block_reason, block_expiry):
     """Store user fields, coercing each non-None value to its type.

     ``None`` is kept as ``None`` for every converted field.  ``groups`` and
     ``implicitgroups`` fall back to an empty list when falsy, and
     ``emailable`` is always coerced with ``bool``.
     """
     def convert(kind, value):
         # Leave None untouched; otherwise coerce with *kind*
         return None if value is None else kind(value)

     self.id = convert(int, id)
     self.name = convert(str, name)
     self.editcount = convert(int, editcount)
     self.registration = convert(Timestamp, registration)
     self.groups = groups or []
     self.implicitgroups = implicitgroups or []
     self.emailable = bool(emailable)
     self.gender = convert(str, gender)
     self.block_id = convert(int, block_id)
     self.blocked_by = convert(str, blocked_by)
     self.blocked_by_id = convert(int, blocked_by_id)
     self.blocked_timestamp = convert(Timestamp, blocked_timestamp)
     self.block_reason = convert(str, block_reason)
     self.block_expiry = convert(str, block_expiry)
コード例 #14
0
def generate_stats(doc, tokens_added, window, sunset):
    """Yield one persistence-statistics dict per token added by *doc*.

    :Parameters:
        doc
            Revision document; its ``'timestamp'`` and ``'contributor'``
            entries are read
        tokens_added
            Iterable of tokens; each must offer ``revisions`` and
            ``seconds_visible(sunset)``
        window
            Sequence of ``(revision doc, other)`` pairs processed after *doc*
        sunset
            Time up to which visibility is measured.  When ``None``, the
            timestamp of the last revision in *window* is used

    :Returns:
        A generator of dicts with the keys ``token``, ``persisted``,
        ``processed``, ``non_self_persisted``, ``non_self_processed``,
        ``seconds_visible`` and ``seconds_possible``
    """
    revisions_processed = len(window)

    if sunset is None:
        # Use the last revision in the window
        sunset = window[-1][0]['timestamp']

    seconds_possible = max(Timestamp(sunset) - Timestamp(doc['timestamp']), 0)

    contributor = doc['contributor']

    # This count depends only on the window, not on the token, so it is
    # computed once instead of once per token.
    non_self_processed = sum(contributor != window_doc['contributor']
                             for window_doc, _ in window)

    for token in tokens_added:
        non_self_persisted = sum(contributor != c for c in token.revisions)
        yield {
            "token": str(token),
            "persisted": len(token.revisions[1:]),
            "processed": revisions_processed,
            "non_self_persisted": non_self_persisted,
            "non_self_processed": non_self_processed,
            "seconds_visible": token.seconds_visible(sunset),
            "seconds_possible": seconds_possible
        }
コード例 #15
0
ファイル: api.py プロジェクト: 1ec5/revscoring
    def revision_metadata_from_doc(cls, rev_doc):
        """Build a ``RevisionMetadata`` from a revision document.

        Returns ``None`` when *rev_doc* is ``None``.  The timestamp is set to
        ``None`` when ``Timestamp`` raises ``ValueError`` for the document's
        ``'timestamp'`` value.  Page fields are read from the nested
        ``'page'`` mapping (an empty mapping when the key is absent), and the
        minor flag is whether the key ``'minor'`` is present.
        """
        if rev_doc is None:
            return None

        try:
            timestamp = Timestamp(rev_doc.get('timestamp'))
        except ValueError:
            timestamp = None

        page_doc = rev_doc.get('page', {})
        is_minor = 'minor' in rev_doc

        return RevisionMetadata(
            rev_doc.get('revid'),
            rev_doc.get('parentid'),
            rev_doc.get('user'),
            rev_doc.get('userid'),
            timestamp,
            rev_doc.get('comment'),
            page_doc.get('pageid'),
            page_doc.get('ns'),
            page_doc.get('title'),
            rev_doc.get('size'),
            is_minor)
コード例 #16
0
 def __init__(self, rev_id, parent_id, user_text, user_id, timestamp,
              comment, page_id, page_namespace, page_title, bytes, minor):
     """Store revision fields, coercing each non-None value to its type.

     ``None`` is kept as ``None`` for every converted field; ``minor`` is
     always coerced with ``bool``.
     """
     def convert(kind, value):
         # Leave None untouched; otherwise coerce with *kind*
         return None if value is None else kind(value)

     self.rev_id = convert(int, rev_id)
     self.parent_id = convert(int, parent_id)
     self.user_text = convert(str, user_text)
     self.user_id = convert(int, user_id)
     self.timestamp = convert(Timestamp, timestamp)
     self.comment = convert(str, comment)
     self.page_id = convert(int, page_id)
     self.page_namespace = convert(int, page_namespace)
     self.page_title = convert(str, page_title)
     self.bytes = convert(int, bytes)
     self.minor = bool(minor)
コード例 #17
0
def read_osm_changes(f, format):
    """Yield ``(user, timestamp, change_id)`` for each tab-separated line.

    :Parameters:
        f
            Iterable of text lines with the columns user, timestamp and an
            optional change id
        format
            Either the literal ``"<unix timestamp>"`` (the second column is
            parsed as a float number of seconds) or a format string handed
            to ``Timestamp.strptime``

    :Returns:
        A generator of ``(user, Timestamp, change_id)`` tuples; ``change_id``
        is ``None`` when the line has fewer than three columns.

    Lines that are empty after stripping whitespace are skipped rather than
    failing with ``IndexError``; a non-blank line without a timestamp column
    still raises.
    """
    unix_format = format == "<unix timestamp>"

    for line in f:
        stripped = line.strip()
        if not stripped:
            # Blank line (e.g. trailing newline at end of input): no record
            continue

        parts = stripped.split("\t")

        user = parts[0]
        if unix_format:
            timestamp = Timestamp(float(parts[1]))
        else:
            timestamp = Timestamp.strptime(parts[1], format)

        change_id = parts[2] if len(parts) >= 3 else None

        yield user, timestamp, change_id
コード例 #18
0
def read_user_actions(f, format):
    """Yield ``(user, timestamp, action)`` for each tab-separated line.

    :Parameters:
        f
            Iterable of text lines with the columns user, timestamp and an
            optional action
        format
            Either the literal ``"<unix timestamp>"`` (the second column is
            parsed as a float number of seconds) or a format string handed
            to ``Timestamp.strptime``

    :Returns:
        A generator of ``(user, Timestamp, action)`` tuples; ``action`` is
        ``None`` when the line has fewer than three columns.

    Lines that are empty after stripping whitespace are skipped rather than
    failing with ``IndexError``; a non-blank line without a timestamp column
    still raises.
    """
    unix_format = format == "<unix timestamp>"

    for line in f:
        stripped = line.strip()
        if not stripped:
            # Blank line (e.g. trailing newline at end of input): no record
            continue

        parts = stripped.split("\t")

        user = parts[0]
        if unix_format:
            timestamp = Timestamp(float(parts[1]))
        else:
            timestamp = Timestamp.strptime(parts[1], format)

        action = parts[2] if len(parts) >= 3 else None

        yield user, timestamp, action
コード例 #19
0
ファイル: api.py プロジェクト: 1ec5/revscoring
    def user_info_from_doc(cls, user_doc):
        """Build a ``UserInfo`` from a user document.

        Returns ``None`` when *user_doc* is ``None``.  The registration is set
        to ``None`` when ``Timestamp`` raises ``ValueError`` for the
        document's ``'registration'`` value.  The emailable flag is whether
        the key ``"emailable"`` is present; the block timestamp is passed
        through unconverted.
        """
        if user_doc is None:
            return None

        try:
            registration = Timestamp(user_doc.get('registration'))
        except ValueError:
            registration = None

        groups = user_doc.get('groups', [])
        implicit_groups = user_doc.get('implicitgroups', [])
        emailable = "emailable" in user_doc

        return UserInfo(
            user_doc.get('userid'),
            user_doc.get('name'),
            user_doc.get('editcount'),
            registration,
            groups,
            implicit_groups,
            emailable,
            user_doc.get('gender'),
            user_doc.get('blockid'),
            user_doc.get('blockedby'),
            user_doc.get('blockedbyid'),
            user_doc.get('blockedtimestamp'),
            user_doc.get('blockreason'),
            user_doc.get('blockexpiry'))
コード例 #20
0
 def visible_at(self, timestamp):
     """Mark the start of visibility at *timestamp* unless one is already
     recorded in ``visible_since``.
     """
     if self.visible_since is not None:
         # Already visible: keep the earlier start
         return
     self.visible_since = Timestamp(timestamp)
コード例 #21
0
 def seconds_visible(self, sunset):
     """Return the accumulated visible time as of *sunset*.

     When ``visible_since`` is set, the span from it to *sunset* is added to
     the already accumulated ``self.visible``; otherwise ``self.visible`` is
     returned unchanged.
     """
     sunset = Timestamp(sunset)
     # Identity test against the None singleton rather than ``!=``, which
     # would go through the stored object's comparison operators.
     if self.visible_since is not None:
         return self.visible + (sunset - self.visible_since)
     else:
         return self.visible
コード例 #22
0
# Put the current working directory at the front of sys.path before importing
# mw (presumably so a local checkout of mw is picked up -- TODO confirm).
import sys,os;sys.path.insert(0, os.path.abspath(os.getcwd()))
from mw import Timestamp

# Each expression below is followed by a "# >" comment giving the value the
# example expects it to evaluate to; the results themselves are discarded.

# Seconds since Unix Epoch, rendered with str()
str(Timestamp(1234567890))
# > '20090213233130'

# Database format (14-digit string), converted back with int()
int(Timestamp("20090213233130"))
# > 1234567890

# API format (ISO-style string with a trailing Z)
int(Timestamp("2009-02-13T23:31:30Z"))
# > 1234567890

# Difference in seconds between two timestamps
Timestamp("2009-02-13T23:31:31Z") - Timestamp(1234567890)
# > 1

# strptime and strftime with custom format strings
Timestamp(1234567890).strftime("%Y foobar")
# > '2009 foobar'

str(Timestamp.strptime("2009 derp 10", "%Y derp %m"))
# > '20091001000000'


コード例 #23
0
ファイル: new_users.py プロジェクト: MuhammadShuaib/mwmetrics
def run(db, user_ids, revert_radius, revert_window, session_cutoff):
    """Print a TSV table of first-day and first-week activity per user.

    A header row built from ``HEADERS`` is printed first.  For every id in
    *user_ids*, the revisions the user made before the end of their first
    week are counted by namespace group, main-namespace revisions are
    checked for reverts, and session metrics are folded into the row.  Users
    whose ``user_registration`` is ``None`` are skipped without a row.
    Progress marks go to stderr: ``r`` for a reverted main-namespace
    revision, ``.`` for a non-reverted one and ``_`` for other namespaces.

    :Parameters:
        db
            Database handle offering ``users.get`` and ``revisions.query``
        user_ids
            Iterable of user ids to process
        revert_radius
            Passed to ``reverts.database.check_row`` as ``radius``
        revert_window
            Passed to ``reverts.database.check_row`` as ``window``
        session_cutoff
            Passed to ``sessions.Cache`` as ``cutoff``
    """
    print(tsv.encode_row(HEADERS))

    for user_id in user_ids:
        sys.stderr.write("{0}: ".format(user_id))
        row = defaultdict(int)  # Every counter starts at 0
        row['user_id'] = user_id
        row['surviving'] = False  # Preliminary value

        user = db.users.get(user_id)
        if user['user_registration'] is None:
            sys.stderr.write("<no registration>\n")
            continue
        registration = Timestamp(user['user_registration'])
        row['user_registration'] = registration.short_format()

        end_of_first_day = registration + 60 * 60 * 24  # One day
        end_of_first_week = registration + 60 * 60 * 24 * 7  # One week

        first_week_revisions = db.revisions.query(
            user_id=user_id,
            direction="newer",
            before=end_of_first_week,
            include_page=True
        )

        # Seed the session cache with the registration event
        session_cache = sessions.Cache(cutoff=session_cutoff)
        session_cache.process(user_id, registration,
                              ("registration", registration))

        for rev in first_week_revisions:
            rev_timestamp = Timestamp(rev['rev_timestamp'])
            ns = rev['page_namespace']

            first_day = rev_timestamp <= end_of_first_day

            row['week_revisions'] += 1
            row['day_revisions'] += 1 if first_day else 0

            if rev_timestamp >= registration + TRIAL_PERIOD:
                row['surviving'] = True

            if ns in MAIN_NAMESPACES:
                row['week_main_revisions'] += 1
                row['day_main_revisions'] += 1 if first_day else 0

                revert = reverts.database.check_row(db, rev,
                                                    radius=revert_radius,
                                                    window=revert_window)

                if revert is not None:  # Reverted edit!
                    # NOTE(review): this writes the raw revision to stdout,
                    # the same stream the TSV rows go to -- looks like a
                    # leftover debug print; confirm before removing.
                    print(rev)
                    row['week_reverted_main_revisions'] += 1
                    row['day_reverted_main_revisions'] += 1 if first_day else 0
                    sys.stderr.write("r")
                    sys.stderr.flush()
                else:
                    sys.stderr.write(".")
                    sys.stderr.flush()
            else:
                row['week_wp_revisions'] += 1 if ns in WP_NAMESPACES else 0
                row['day_wp_revisions'] += 1 if first_day and \
                                                ns in WP_NAMESPACES else 0
                row['week_user_revisions'] += 1 if ns in USER_NAMESPACES else 0
                row['day_user_revisions'] += 1 if first_day and \
                                                ns in USER_NAMESPACES else 0
                row['week_talk_revisions'] += 1 if ns in TALK_NAMESPACES else 0
                row['day_talk_revisions'] += 1 if first_day and \
                                                ns in TALK_NAMESPACES else 0
                sys.stderr.write("_")
                sys.stderr.flush()

            user_sessions = session_cache.process(user_id, rev_timestamp,
                                                  ("edit", rev_timestamp))
            update_row_with_session_metrics(row, user_sessions)

        # Account for sessions still open after the last revision
        user_sessions = session_cache.get_active_sessions()
        update_row_with_session_metrics(row, user_sessions)

        sys.stderr.write("\n")
        sys.stdout.write(tsv.encode_row(row, headers=HEADERS))
        sys.stdout.write("\n")
コード例 #24
0
ファイル: user.py プロジェクト: SPQRobin/revscoring
import re

from mw import Timestamp

from ..datasources import revision, user
from .feature import Feature

# Date that registrations started being recorded in MediaWiki.  process_age
# falls back to this value when a user's registration is missing (falsy).
USER_REGISTRATION_EPOCH = Timestamp("20050101000000")


def process_age(user_info, revision_metadata):
    """Return the time between the user's registration and the revision.

    The result is 0 when *user_info* is ``None``, when the revision is
    anonymous according to ``process_is_anon``, or when the revision
    predates the registration.  A missing registration is replaced by
    ``USER_REGISTRATION_EPOCH``.
    """
    # Unknown or anonymous user, so age == zero
    if user_info is None or process_is_anon(revision_metadata):
        return 0

    registered = user_info.registration or USER_REGISTRATION_EPOCH
    return max(revision_metadata.timestamp - registered, 0)


age = Feature(
    "user.age",
    process_age,
    depends_on=[user.info, revision.metadata],
    returns=int,
)
"""
Represents age of user when the edit was made in seconds.

:Returns:
    int
コード例 #25
0
def run(db, user_ids, revert_radius, revert_window, session_cutoff):
    """Emit one TSV row of new-user activity metrics for each user id.

    The ``HEADERS`` row is printed first.  For each user, revisions made
    before the end of the first week after registration are tallied per
    namespace group (for the first day and for the whole week),
    main-namespace revisions are checked for reverts, and session metrics
    are merged into the row.  A user whose ``user_registration`` is ``None``
    is skipped and gets no row.  Progress marks are written to stderr:
    ``r`` (reverted main-namespace revision), ``.`` (non-reverted one) and
    ``_`` (any other namespace).

    :Parameters:
        db
            Database handle offering ``users.get`` and ``revisions.query``
        user_ids
            Iterable of user ids to process
        revert_radius
            Forwarded to ``reverts.database.check_row`` as ``radius``
        revert_window
            Forwarded to ``reverts.database.check_row`` as ``window``
        session_cutoff
            Forwarded to ``sessions.Cache`` as ``cutoff``
    """
    print(tsv.encode_row(HEADERS))

    for user_id in user_ids:
        sys.stderr.write("{0}: ".format(user_id))
        row = defaultdict(int)  # Every counter starts at 0
        row['user_id'] = user_id
        row['surviving'] = False  # Preliminary value

        user = db.users.get(user_id)
        if user['user_registration'] is None:
            sys.stderr.write("<no registration>\n")
            continue
        registration = Timestamp(user['user_registration'])
        row['user_registration'] = registration.short_format()

        end_of_first_day = registration + 60 * 60 * 24  # One day
        end_of_first_week = registration + 60 * 60 * 24 * 7  # One week

        first_week_revisions = db.revisions.query(user_id=user_id,
                                                  direction="newer",
                                                  before=end_of_first_week,
                                                  include_page=True)

        # Seed the session cache with the registration event
        session_cache = sessions.Cache(cutoff=session_cutoff)
        session_cache.process(user_id, registration,
                              ("registration", registration))

        for rev in first_week_revisions:
            rev_timestamp = Timestamp(rev['rev_timestamp'])
            ns = rev['page_namespace']

            first_day = rev_timestamp <= end_of_first_day

            row['week_revisions'] += 1
            row['day_revisions'] += 1 if first_day else 0

            if rev_timestamp >= registration + TRIAL_PERIOD:
                row['surviving'] = True

            if ns in MAIN_NAMESPACES:
                row['week_main_revisions'] += 1
                row['day_main_revisions'] += 1 if first_day else 0

                revert = reverts.database.check_row(db,
                                                    rev,
                                                    radius=revert_radius,
                                                    window=revert_window)

                if revert is not None:  # Reverted edit!
                    # NOTE(review): this writes the raw revision to stdout,
                    # the same stream the TSV rows go to -- looks like a
                    # leftover debug print; confirm before removing.
                    print(rev)
                    row['week_reverted_main_revisions'] += 1
                    row['day_reverted_main_revisions'] += 1 if first_day else 0
                    sys.stderr.write("r")
                    sys.stderr.flush()
                else:
                    sys.stderr.write(".")
                    sys.stderr.flush()
            else:
                row['week_wp_revisions'] += 1 if ns in WP_NAMESPACES else 0
                row['day_wp_revisions'] += 1 if first_day and \
                                                ns in WP_NAMESPACES else 0
                row['week_user_revisions'] += 1 if ns in USER_NAMESPACES else 0
                row['day_user_revisions'] += 1 if first_day and \
                                                ns in USER_NAMESPACES else 0
                row['week_talk_revisions'] += 1 if ns in TALK_NAMESPACES else 0
                row['day_talk_revisions'] += 1 if first_day and \
                                                ns in TALK_NAMESPACES else 0
                sys.stderr.write("_")
                sys.stderr.flush()

            user_sessions = session_cache.process(user_id, rev_timestamp,
                                                  ("edit", rev_timestamp))
            update_row_with_session_metrics(row, user_sessions)

        # Account for sessions still open after the last revision
        user_sessions = session_cache.get_active_sessions()
        update_row_with_session_metrics(row, user_sessions)

        sys.stderr.write("\n")
        sys.stdout.write(tsv.encode_row(row, headers=HEADERS))
        sys.stdout.write("\n")
def run(db, start_date, end_date, n, t, debug):
    """Print a TSV table saying which newly registered users were productive.

    For every user registered between *start_date* and *end_date*, the
    revisions made within *t* days of registration are scanned.  A revision
    counts as productive when it is in namespace 0 and
    ``reverts.database.check_row`` finds no revert within 15 revisions and
    two days of it.  Scanning stops once *n* productive edits are found.

    Output columns: ``user_id``, ``user_name``, ``user_registration``,
    ``productive`` (at least *n* productive edits) and ``censored`` (fewer
    than ``2 + t`` days have passed since registration).

    :Parameters:
        db
            Database handle offering ``users.query`` and ``revisions.query``
        start_date
            Passed to ``db.users.query`` as ``registered_after``
        end_date
            Passed to ``db.users.query`` as ``registered_before``
        n
            Number of productive edits needed to count as productive
        t
            Length of the observation period, in days
        debug
            Not used inside this function
    """
    # Print some headers
    print(
        "\t".join([
            "user_id",
            "user_name",
            "user_registration",
            "productive",
            "censored"
        ])
    )

    # Convert days to seconds so that we can do some math
    t_seconds = DAY_SECONDS * t

    # Get relevant users
    users = db.users.query(
        registered_after=start_date,
        registered_before=end_date
    )
    for user_row in users:
        logger.debug("Processing {0}:".format(str(user_row['user_name'], "utf-8")))
        # Convert user_registration to a useful type
        user_registration = Timestamp(user_row['user_registration'])

        # Get all the revisions the user made within "t" days of registration
        revisions = db.revisions.query(
            user_id=user_row['user_id'],
            before=user_registration + t_seconds,
            include_page=True
        )

        # Count up the productive edits
        productive_edits = 0
        for rev_row in revisions:

            # Convert revision timestamp to a useful type
            rev_timestamp = Timestamp(rev_row['rev_timestamp'])

            # Must be a content edit
            if rev_row['page_namespace'] == 0:

                # If the revert doesn't happen in 48 hours, it doesn't count
                revert_end_of_life = rev_timestamp + DAY_SECONDS * 2

                revert = reverts.database.check_row(
                    db,
                    rev_row,
                    radius=15,  # Reverts can't cross more than 15 revisions
                    before=revert_end_of_life
                )

                if revert is None:  # Not reverted
                    productive_edits += 1

                    if productive_edits >= n:  # We're done here
                        break

        print(
            "\t".join([
                str(user_row['user_id']),
                escape(str(user_row['user_name'], 'utf-8')),
                escape(str(user_row['user_registration'], 'utf-8')),
                str(productive_edits >= n),
                str(time.time() - user_registration.unix() < (2 + t) * DAY_SECONDS)
            ])
        )
コード例 #27
0
"""
Demonstrates some simple Timestamp operations
"""
from mw import Timestamp

# Each expression below is followed by a "# >" comment giving the value the
# example expects it to evaluate to; the results themselves are discarded.

# Seconds since Unix Epoch, rendered with str()
str(Timestamp(1234567890))
# > '20090213233130'

# Database format (14-digit string), converted back with int()
int(Timestamp("20090213233130"))
# > 1234567890

# API format (ISO-style string with a trailing Z)
int(Timestamp("2009-02-13T23:31:30Z"))
# > 1234567890

# Difference in seconds between two timestamps
Timestamp("2009-02-13T23:31:31Z") - Timestamp(1234567890)
# > 1

# strptime and strftime with custom format strings
Timestamp(1234567890).strftime("%Y foobar")
# > '2009 foobar'

str(Timestamp.strptime("2009 derp 10", "%Y derp %m"))
# > '20091001000000'