def run(paths, model, sunset, score_at, rev_scores, skip_scores_before,
        processes, verbose=False):
    """
    Scores revisions from a set of XML dumps and writes one row per score
    to `rev_scores`.

    :Parameters:
        paths : `list` ( `str` )
            Paths to XML dump files to process
        model : :class:`revscoring.Model`
            The scoring model to apply
        sunset : :class:`mwtypes.Timestamp`
            The cutoff timestamp for scoring
        score_at : `str`
            One of "revision", "latest", "monthly", "biannually" or
            "annually" -- selects which revisions get scored
        rev_scores : :class:`mysqltsv.Writer`
            Where score rows are written
        skip_scores_before : `dict`
            Map of page_id to a timestamp before which scores are skipped
        processes : `int`
            Number of parallel dump-processing threads
        verbose : `bool`
            Print progress characters to stderr
    """
    if score_at == "revision":
        process_dump = revision_scores(model, sunset, skip_scores_before)
    elif score_at == "latest":
        process_dump = latest_scores(model, sunset, skip_scores_before)
    else:
        # Periodic scoring: build the list of period-start thresholds up
        # to (and including) the sunset year.
        sunset_year = int(sunset.strftime("%Y"))
        years = range(START_YEAR, sunset_year + 1)
        if score_at == "monthly":
            thresholds = [
                mwtypes.Timestamp("{0}{1}01000000".format(year, month))
                for year in years
                for month in MONTHS]
        elif score_at == "biannually":
            thresholds = [
                mwtypes.Timestamp("{0}{1}01000000".format(year, month))
                for year in years
                for month in ("01", "07")]
        elif score_at == "annually":
            thresholds = [
                mwtypes.Timestamp("{0}0101000000".format(year))
                for year in years]
        else:
            raise RuntimeError(
                "{0} is not a valid 'score_at' value".format(score_at))

        process_dump = threshold_scores(model, sunset, skip_scores_before,
                                        thresholds)

    results = mwxml.map(process_dump, paths, threads=processes)
    for page_id, title, rev_id, timestamp, (error, score) in results:
        if error is not None:
            logger.error("Error while processing {0}({1}) @ {2}: {3}".format(
                title, page_id, rev_id, error))
            continue

        # Collapse class probabilities into a single weighted score.
        probabilities = score['probability']
        weighted_sum = sum(CLASS_WEIGHTS[cls] * probabilities[cls]
                           for cls in probabilities)

        rev_scores.write([
            page_id,
            title,
            rev_id,
            timestamp.short_format(),
            score['prediction'],
            weighted_sum
        ])

        if verbose:
            sys.stderr.write(score['prediction'] + " ")
            sys.stderr.flush()

    if verbose:
        sys.stderr.write("\n")
def test_from_page_xml():
    # Minimal two-revision page document.
    page_xml = u"""
    <page>
      <title>Foo</title>
      <ns>0</ns>
      <id>1</id>
      <revision>
        <id>1</id>
        <timestamp>2004-08-09T09:04:08Z</timestamp>
      </revision>
      <revision>
        <id>2</id>
        <timestamp>2004-08-10T09:04:08Z</timestamp>
      </revision>
    </page>
    """

    dump = Dump.from_page_xml(io.StringIO(page_xml))

    # You have a `namespaces`, but it's empty.
    eq_(dump.site_info.namespaces, None)

    first_page = dump.next()
    eq_(first_page.title, u"Foo")
    eq_(first_page.namespace, 0)
    eq_(first_page.id, 1)

    rev = first_page.next()
    eq_(rev.id, 1)
    eq_(rev.timestamp, mwtypes.Timestamp(u"2004-08-09T09:04:08Z"))

    rev = first_page.next()
    eq_(rev.id, 2)
    eq_(rev.timestamp, mwtypes.Timestamp(u"2004-08-10T09:04:08Z"))
def process(self, user, timestamp, data=None):
    """
    Processes a user event.

    :Parameters:
        user : `hashable`
            A hashable value to identify a user (`int` or `str` are OK)
        timestamp : :class:`mwtypes.Timestamp`
            The timestamp of the event
        data : `mixed`
            Event meta data

    :Returns:
        A generator of :class:`~mwsessions.Session` expired after
        processing the user event.
    """
    event = Event(user, mwtypes.Timestamp(timestamp), self.event_i, data)
    self.event_i += 1

    # First, flush out any sessions that expired before this event.
    for expired_user, expired_events in self._clear_expired(event.timestamp):
        yield Session(expired_user, unpack_events(expired_events))

    # Apply revision: attach the event to the user's in-flight event list,
    # creating one if the user has no active session yet.
    user_events = self.active_users.get(event.user)
    if user_events is None:
        user_events = []
        self.active_users[event.user] = user_events

    self.recently_active.push(
        ActiveSession(event.timestamp, event.i, user_events))

    user_events.append(event)
def user_blocks(user_text, session):
    """
    Returns a list of blocks for a single user

    :Parameters:
        user_text : `str`
            The name of the user to look up
        session : API session
            A session with a `get()` method for issuing API queries
    """
    logger.debug("Getting blocks for {0}".format(user_text))
    doc = session.get(action='query', list='blocks',
                      bkusers=user_text,
                      bkprop=['id', 'timestamp'])
    block_docs = doc['query']['blocks']
    return [mwtypes.Timestamp(block['timestamp']) for block in block_docs]
def main(argv=None):
    """Parse command-line arguments and kick off the scoring run."""
    args = docopt.docopt(__doc__, argv=argv)

    log_level = logging.DEBUG if args['--debug'] else logging.INFO
    logging.basicConfig(
        level=log_level,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s'
    )

    # Optional per-class weight overrides applied to the module default.
    if args['--class-weight'] is not None:
        global CLASS_WEIGHTS
        overrides = dict(
            map(_parse_class_weight_option, args['--class-weight'])
        )
        CLASS_WEIGHTS.update(overrides)

    paths = args['<dump-file>']

    with open(args['--model']) as f:
        model = Model.load(f)

    sunset = mwtypes.Timestamp(args['--sunset'])

    score_at = args['--score-at']
    if score_at not in SCORE_ATS:
        raise ValueError("--score-at value {0} not available in {1}"
                         .format(args['--score-at'], SCORE_ATS))

    if args['--rev-scores'] == "<stdout>":
        rev_scores = mysqltsv.Writer(sys.stdout, headers=HEADERS)
    else:
        rev_scores = mysqltsv.Writer(
            open(args['--rev-scores'], "w"), headers=HEADERS)

    # When extending a previous run, remember the last-scored timestamp
    # per page so those scores aren't recomputed.
    skip_scores_before = {}
    if args['--extend'] is not None:
        logger.info("Reading in past scores from {0}".format(args['--extend']))
        rows = mysqltsv.read(
            open(args['--extend']),
            types=[int, str, int, mwtypes.Timestamp, str, float])
        for row in rows:
            skip_scores_before[row.page_id] = row.timestamp
        logger.info("Completed reading scores from old output.")

    if args['--processes'] == "<cpu count>":
        processes = cpu_count()
    else:
        processes = int(args['--processes'])

    verbose = args['--verbose']

    run(paths, model, sunset, score_at, rev_scores, skip_scores_before,
        processes, verbose=verbose)
def test_revision():
    # A fully-populated revision.
    xml_doc = u"""
    <revision>
      <id>233192</id>
      <timestamp>2001-01-21T02:12:21Z</timestamp>
      <contributor>
        <username>RoseParks</username>
        <id>99</id>
      </contributor>
      <comment>*</comment>
      <minor />
      <model>wikitext</model>
      <format>text/x-wiki</format>
      <text xml:space="preserve">Text of rev 233192</text>
      <sha1>8kul9tlwjm9oxgvqzbwuegt9b2830vw</sha1>
    </revision>
    """
    rev = Revision.from_element(ElementIterator.from_string(xml_doc))
    eq_(rev.id, 233192)
    eq_(rev.timestamp, mwtypes.Timestamp(u"2001-01-21T02:12:21Z"))
    eq_(rev.user.id, 99)
    eq_(rev.user.text, u"RoseParks")
    eq_(rev.comment, u"*")
    eq_(rev.minor, True)
    eq_(rev.model, u"wikitext")
    eq_(rev.format, u"text/x-wiki")
    eq_(rev.text, u"Text of rev 233192")
    eq_(rev.sha1, u"8kul9tlwjm9oxgvqzbwuegt9b2830vw")
    eq_(rev.deleted.text, False)
    eq_(rev.deleted.comment, False)
    eq_(rev.deleted.user, False)

    # The same revision with contributor, comment and text suppressed.
    xml_doc = u"""
    <revision>
      <id>233192</id>
      <timestamp>2001-01-21T02:12:21Z</timestamp>
      <contributor deleted="deleted"></contributor>
      <comment deleted="deleted" />
      <minor />
      <model>wikitext</model>
      <format>text/x-wiki</format>
      <text xml:space="preserve" deleted="deleted" />
      <sha1>8kul9tlwjm9oxgvqzbwuegt9b2830vw</sha1>
    </revision>
    """
    rev = Revision.from_element(ElementIterator.from_string(xml_doc))
    eq_(rev.user, None)
    eq_(rev.comment, None)
    eq_(rev.text, None)
    eq_(rev.deleted.text, True)
    eq_(rev.deleted.comment, True)
    eq_(rev.deleted.user, True)
def test_complete():
    dump = Dump.from_file(io.StringIO(SAMPLE_XML))
    eq_([0, 1], [ns.id for ns in dump.site_info.namespaces])

    # First page and its two revisions.
    first_page = dump.next()
    eq_(first_page.title, u"Foo")
    eq_(first_page.namespace, 0)
    eq_(first_page.id, 1)
    eq_(first_page.redirect, None)
    eq_(first_page.restrictions, [])

    rev = first_page.next()
    eq_(rev.id, 1)
    eq_(rev.timestamp, mwtypes.Timestamp(u"2004-08-09T09:04:08Z"))

    rev = first_page.next()
    eq_(rev.id, 2)
    eq_(rev.timestamp, mwtypes.Timestamp(u"2004-08-10T09:04:08Z"))

    # Second page and its two revisions.
    second_page = dump.next()
    assert_is_instance(second_page, Page)
    eq_(second_page.title, u"Bar")
    eq_(second_page.namespace, 1)
    eq_(second_page.id, 2)
    eq_(second_page.redirect, u"Computer accessibility")
    eq_(second_page.restrictions, [u"edit=sysop:move=sysop"])

    rev = second_page.next()
    assert_is_instance(rev, Revision)
    eq_(rev.id, 3)
    eq_(rev.timestamp, mwtypes.Timestamp(u"2004-08-11T09:04:08Z"))

    rev = second_page.next()
    assert_is_instance(rev, Revision)
    eq_(rev.id, 4)
    eq_(rev.timestamp, mwtypes.Timestamp(u"2004-08-12T09:04:08Z"))
def extract_text(dump, page_labelings, verbose=False):
    """
    Extracts article text and metadata for labelings from an XML dump.

    :Parameters:
        dump : :class:`mwxml.Dump`
            The XML dump file to extract text & metadata from
        page_labelings : `dict` ( `str` --> `list` ( `dict` ) )
            A map of page title to a chronologically ordered collection of
            labeling events to add text to
        verbose : `bool`
            Print dots and stuff

    :Returns:
        An `iterator` of labelings augmented with 'page_id', 'rev_id' and
        'text'.  Note that labelings of articles that can't be looked up
        will not be included.
    """
    for page in dump:
        if page.namespace == 0 and page.title in page_labelings:
            if verbose:
                sys.stderr.write("\n{0}: ".format(page.title))
                sys.stderr.flush()

            labelings = page_labelings[page.title]

            last_revision = None
            for revision in page:
                # Emit every labeling whose timestamp falls before this
                # revision; `last_revision` is the version of the article
                # that was current when the labeling happened.
                while last_revision is not None and \
                      len(labelings) > 0 and \
                      revision.timestamp > \
                      mwtypes.Timestamp(labelings[0]['timestamp']):
                    # FIX: pop(0), not pop().  The loop condition examines
                    # the *first* (earliest) labeling, but the original
                    # popped the *last* one -- pairing labelings with the
                    # wrong revisions and never consuming the element the
                    # condition tested.
                    labeling = labelings.pop(0)
                    labeling['page_id'] = page.id
                    labeling['rev_id'] = last_revision.id
                    if not_an_article(last_revision.text):
                        labeling['text'] = None
                    else:
                        labeling['text'] = last_revision.text
                    yield labeling
                    if verbose:
                        sys.stderr.write("t")
                        sys.stderr.flush()

                # Don't update last_revision if the text was deleted
                if revision.text is not None:
                    last_revision = revision
def test_skipping():
    dump = Dump.from_file(io.StringIO(SAMPLE_XML))

    # Advance past the first page without reading its revisions.
    first_page = dump.next()
    eq_(first_page.title, u"Foo")
    eq_(first_page.namespace, 0)
    eq_(first_page.id, 1)

    second_page = dump.next()
    eq_(second_page.title, u"Bar")
    eq_(second_page.namespace, 1)
    eq_(second_page.id, 2)

    # Revisions of the skipped page are not seen here.
    rev = second_page.next()
    eq_(rev.id, 3)
    eq_(rev.timestamp, mwtypes.Timestamp(u"2004-08-11T09:04:08Z"))
import logging from datetime import datetime import mwtypes from pytz import utc from ...datasources import revision_oriented from ...dependencies import DependentSet from ..feature import Feature MW_REGISTRATION_EPOCH = mwtypes.Timestamp("2006-01-01T00:00:00Z") logger = logging.getLogger(__name__) class Revision(DependentSet): "Represents a revision" def __init__(self, name, revision_datasources): super().__init__(name) self.datasources = revision_datasources self.day_of_week = Feature(name + ".day_of_week", _process_day_of_week, returns=int, depends_on=[revision_datasources.timestamp]) "`int` : the day of week when the edit was made (in UTC)" self.hour_of_day = Feature(name + ".hour_of_day", _process_hour_of_day, returns=int,
def process(self, timestamp_str):
    """
    Parses *timestamp_str* into a :class:`mwtypes.Timestamp`, falling
    back to `MW_REGISTRATION_EPOCH` when the value is empty/`None`.
    """
    if timestamp_str:
        return mwtypes.Timestamp(timestamp_str)
    return mwtypes.Timestamp(MW_REGISTRATION_EPOCH)
def from_element(cls, element):
    """
    Constructs a revision from a <revision> element iterator.

    :Parameters:
        element : element iterator
            An iterator over the sub-elements of a <revision> tag
    :Raises:
        :class:`MalformedXML` when an unexpected tag is encountered
    """
    rev_id = None
    rev_timestamp = None
    contributor = None
    contributor_deleted = False
    is_minor = False
    comment_text = None
    comment_deleted = False
    body_text = None
    body_deleted = False
    byte_count = None
    checksum = None
    parent = None
    content_model = None
    content_format = None

    for child in element:
        tag = child.tag
        if tag == u"id":
            rev_id = int(child.text)
        elif tag == u"timestamp":
            rev_timestamp = mwtypes.Timestamp(child.text)
        elif tag == u"contributor":
            # A deleted="..." attribute means the contributor is hidden.
            contributor_deleted = child.attr(u'deleted') is not None
            if not contributor_deleted:
                contributor = User.from_element(child)
        elif tag == u"minor":
            is_minor = True
        elif tag == u"sha1":
            checksum = child.text
        elif tag == u"parentid":
            parent = child.text
        elif tag == u"model":
            content_model = child.text
        elif tag == u"format":
            content_format = child.text
        elif tag == u"comment":
            comment_deleted = child.attr(u'deleted') is not None
            if not comment_deleted:
                comment_text = child.text
        elif tag == u"text":
            body_deleted = child.attr(u'deleted') is not None
            if not body_deleted:
                body_text = child.text
            # NOTE(review): the 'bytes' attribute is read regardless of
            # deletion here -- confirm against dump schema expectations.
            byte_count = child.attr(u'bytes')
        else:
            raise MalformedXML(u"Unexpected tag found when processing " +
                               u"a <revision>: '{0}'".format(tag))

    deleted = cls.Deleted(comment=comment_deleted, text=body_deleted,
                          user=contributor_deleted)

    return cls(rev_id, rev_timestamp, user=contributor, minor=is_minor,
               str=byte_count, sha1=checksum, parent_id=parent,
               model=content_model, format=content_format,
               comment=comment_text, text=body_text, deleted=deleted)
def from_element(cls, element, namespace_map=None):
    """
    Constructs a log item from a <logitem> element iterator.

    :Parameters:
        element : element iterator
            An iterator over the sub-elements of a <logitem> tag
        namespace_map : namespace map or `None`
            When provided, used to split the log title into
            (namespace, title); otherwise the namespace is left `None`
    :Raises:
        :class:`MalformedXML` when an unexpected tag is encountered
    """
    id = None
    timestamp = None
    comment = None
    user = None
    page = None
    type = None
    action = None
    text = None
    params = None
    comment_deleted = None
    user_deleted = None
    for sub_element in element:
        tag = sub_element.tag

        if tag == "id":
            id = int(sub_element.text)
        elif tag == "timestamp":
            timestamp = mwtypes.Timestamp(sub_element.text)
        elif tag == "comment":
            # A deleted="..." attribute means the comment is suppressed.
            comment_deleted = sub_element.attr('deleted') is not None
            if not comment_deleted:
                comment = sub_element.text
        elif tag == "contributor":
            user_deleted = sub_element.attr('deleted') is not None
            if not user_deleted:
                user = User.from_element(sub_element)
        elif tag == "logtitle":
            if sub_element.text is None:
                namespace = None
                title = None
            elif namespace_map is not None:
                namespace, title = extract_namespace(
                    sub_element.text, namespace_map)
            else:
                namespace = None
                # FIX: read the title from the <logtitle> sub-element
                # itself; the original read `element.text` (the parent
                # <logitem>), which is not the log title.
                title = sub_element.text
            page = cls.Page(namespace=namespace, title=title)
        elif tag == "type":
            type = sub_element.text
        elif tag == "action":
            action = sub_element.text
        elif tag == "text":
            # logger.warn() is a deprecated alias of logger.warning().
            logger.warning(
                "A <text> tag was seen in a log item ... ignoring")
        elif tag == "params":
            params = sub_element.text
        else:
            raise MalformedXML("Unexpected tag found when processing " +
                               "a <logitem>: '{0}'".format(tag))

    deleted = cls.Deleted(comment=comment_deleted, user=user_deleted)

    return cls(id=id, timestamp=timestamp, comment=comment, user=user,
               page=page, type=type, action=action, text=text,
               params=params, deleted=deleted)