Exemple #1
0
def run(paths, model, sunset, score_at, rev_scores, skip_scores_before,
        processes, verbose=False):
    """
    Scores the revisions found in a set of XML dumps and writes one row per
    successful score to `rev_scores`.

    :Parameters:
        paths : `iterable`
            Dump file paths handed to `mwxml.map`
        model
            Scoring model forwarded to the dump processor
        sunset
            Cut-off timestamp; its year is the last year for which
            thresholds are generated
        score_at : `str`
            One of "revision", "latest", "monthly", "biannually" or
            "annually"
        rev_scores
            Writer whose `write()` receives each output row
        skip_scores_before
            Forwarded unchanged to the dump processor
        processes : `int`
            Passed to `mwxml.map` as `threads`
        verbose : `bool`
            Echo every prediction to stderr

    :Raises:
        `RuntimeError` when `score_at` is not a supported value
    """
    if score_at == "revision":
        process_dump = revision_scores(model, sunset, skip_scores_before)
    elif score_at == "latest":
        process_dump = latest_scores(model, sunset, skip_scores_before)
    else:
        final_year = int(sunset.strftime("%Y"))
        if score_at not in ("monthly", "biannually", "annually"):
            raise RuntimeError(
                "{0} is not a valid 'score_at' value".format(score_at))

        # One threshold per period start, from START_YEAR through the
        # sunset year (inclusive).
        years = range(START_YEAR, final_year + 1)
        if score_at == "annually":
            thresholds = [mwtypes.Timestamp(str(y) + "0101000000")
                          for y in years]
        else:
            if score_at == "monthly":
                period_starts = chain.from_iterable(
                    zip([y] * 12, MONTHS) for y in years)
            else:
                period_starts = chain.from_iterable(
                    zip([y] * 2, ["01", "07"]) for y in years)
            thresholds = [mwtypes.Timestamp(str(y) + m + "01000000")
                          for y, m in period_starts]

        process_dump = threshold_scores(model, sunset, skip_scores_before,
                                        thresholds)

    scored = mwxml.map(process_dump, paths, threads=processes)
    for page_id, title, rev_id, timestamp, (error, score) in scored:
        if error is not None:
            # Failed revisions are logged and produce no output row.
            message = "Error while processing {0}({1}) @ {2}: {3}".format(
                title, page_id, rev_id, error)
            logger.error(message)
            continue

        probabilities = score['probability']
        weighted_sum = sum(CLASS_WEIGHTS[label] * probabilities[label]
                           for label in probabilities)
        row = [page_id, title, rev_id, timestamp.short_format(),
               score['prediction'], weighted_sum]
        rev_scores.write(row)

        if verbose:
            sys.stderr.write(score['prediction'] + " ")
            sys.stderr.flush()

    if verbose:
        sys.stderr.write("\n")
Exemple #2
0
def test_from_page_xml():
    """Parse a bare <page> fragment and check the page and both revisions."""
    fragment = u"""
    <page>
      <title>Foo</title>
      <ns>0</ns>
      <id>1</id>
      <revision>
        <id>1</id>
        <timestamp>2004-08-09T09:04:08Z</timestamp>
      </revision>
      <revision>
        <id>2</id>
        <timestamp>2004-08-10T09:04:08Z</timestamp>
      </revision>
    </page>
    """

    dump = Dump.from_page_xml(io.StringIO(fragment))

    # The fragment has no <siteinfo>; `namespaces` is expected to be None.
    eq_(dump.site_info.namespaces, None)

    page = dump.next()
    eq_(page.title, u"Foo")
    eq_(page.namespace, 0)
    eq_(page.id, 1)

    # Revisions must come back in document order.
    expected_revisions = [
        (1, u"2004-08-09T09:04:08Z"),
        (2, u"2004-08-10T09:04:08Z"),
    ]
    for expected_id, expected_stamp in expected_revisions:
        revision = page.next()
        eq_(revision.id, expected_id)
        eq_(revision.timestamp, mwtypes.Timestamp(expected_stamp))
    def process(self, user, timestamp, data=None):
        """
        Registers one user event and emits the sessions it causes to expire.

        :Parameters:
            user : `hashable`
                A hashable value to identify a user (`int` or `str` are OK)
            timestamp : :class:`mwtypes.Timestamp`
                The timestamp of the event
            data : `mixed`
                Event meta data

        :Returns:
            A generator of :class:`~mwsessions.Session` expired after
            processing the user event.
        """
        event = Event(user, mwtypes.Timestamp(timestamp), self.event_i, data)
        self.event_i += 1

        # Emit every session that `_clear_expired` reports for this
        # event's timestamp before recording the event itself.
        expired = self._clear_expired(event.timestamp)
        for expired_user, expired_events in expired:
            yield Session(expired_user, unpack_events(expired_events))

        # First event seen for this user: start a new event list and
        # register it as an active session.
        if event.user not in self.active_users:
            fresh_events = []
            self.active_users[event.user] = fresh_events
            self.recently_active.push(
                ActiveSession(event.timestamp, event.i, fresh_events))

        self.active_users[event.user].append(event)
Exemple #4
0
def user_blocks(user_text, session):
    """
    Queries the blocks recorded for a single user and returns the timestamp
    of each block as a list of :class:`mwtypes.Timestamp`.
    """
    logger.debug("Getting blocks for {0}".format(user_text))
    response = session.get(action='query', list='blocks', bkusers=user_text,
                           bkprop=['id', 'timestamp'])

    block_timestamps = []
    for block in response['query']['blocks']:
        block_timestamps.append(mwtypes.Timestamp(block['timestamp']))
    return block_timestamps
def main(argv=None):
    """
    Command-line entry point.

    Parses the arguments with docopt (against the module docstring),
    configures logging, loads the model, prepares the output writer and the
    optional table of previously written scores, then hands everything to
    `run()`.

    :Parameters:
        argv : `list` ( `str` )
            Argument vector passed to docopt; `None` lets docopt use its
            default.

    :Raises:
        `ValueError` when `--score-at` is not one of `SCORE_ATS`
    """
    args = docopt.docopt(__doc__, argv=argv)

    logging.basicConfig(
        level=logging.DEBUG if args['--debug'] else logging.INFO,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s'
    )
    if args['--class-weight'] is not None:
        class_weights = dict(
            map(_parse_class_weight_option, args['--class-weight'])
        )
        global CLASS_WEIGHTS
        CLASS_WEIGHTS.update(class_weights)

    paths = args['<dump-file>']
    with open(args['--model']) as f:
        model = Model.load(f)

    sunset = mwtypes.Timestamp(args['--sunset'])

    score_at = args['--score-at']
    if score_at not in SCORE_ATS:
        raise ValueError("--score-at value {0} not available in {1}"
                         .format(score_at, SCORE_ATS))

    if args['--rev-scores'] == "<stdout>":
        rev_scores_file = sys.stdout
    else:
        rev_scores_file = open(args['--rev-scores'], "w")

    try:
        rev_scores = mysqltsv.Writer(rev_scores_file, headers=HEADERS)

        # page_id -> timestamp of the last row already present in the
        # output being extended; empty when not extending.
        skip_scores_before = {}
        if args['--extend'] is not None:
            logger.info(
                "Reading in past scores from {0}".format(args['--extend']))
            # Consume the reader inside the `with` so the file is closed
            # as soon as the old rows have been read.
            with open(args['--extend']) as extend_file:
                rows = mysqltsv.read(
                    extend_file,
                    types=[int, str, int, mwtypes.Timestamp, str, float])
                for row in rows:
                    skip_scores_before[row.page_id] = row.timestamp
            logger.info("Completed reading scores from old output.")

        if args['--processes'] == "<cpu count>":
            processes = cpu_count()
        else:
            processes = int(args['--processes'])

        verbose = args['--verbose']
        run(paths, model, sunset, score_at, rev_scores, skip_scores_before,
            processes, verbose=verbose)
    finally:
        # Close (and thereby flush) an output file we opened ourselves,
        # even when scoring fails; never close the process's stdout.
        if rev_scores_file is not sys.stdout:
            rev_scores_file.close()
def test_revision():
    """
    Revision.from_element on a fully populated <revision>, then on one whose
    contributor, comment and text are all flagged as deleted.
    """
    visible_xml = u"""
    <revision>
      <id>233192</id>
      <timestamp>2001-01-21T02:12:21Z</timestamp>
      <contributor>
        <username>RoseParks</username>
        <id>99</id>
      </contributor>
      <comment>*</comment>
      <minor />
      <model>wikitext</model>
      <format>text/x-wiki</format>
      <text xml:space="preserve">Text of rev 233192</text>
      <sha1>8kul9tlwjm9oxgvqzbwuegt9b2830vw</sha1>
    </revision>
    """
    visible = Revision.from_element(ElementIterator.from_string(visible_xml))
    eq_(visible.id, 233192)
    eq_(visible.timestamp, mwtypes.Timestamp(u"2001-01-21T02:12:21Z"))
    eq_(visible.user.id, 99)
    eq_(visible.user.text, u"RoseParks")
    eq_(visible.comment, u"*")
    eq_(visible.minor, True)
    eq_(visible.model, u"wikitext")
    eq_(visible.format, u"text/x-wiki")
    eq_(visible.text, u"Text of rev 233192")
    eq_(visible.sha1, u"8kul9tlwjm9oxgvqzbwuegt9b2830vw")
    # Nothing was suppressed, so every deletion flag is False.
    eq_(visible.deleted.text, False)
    eq_(visible.deleted.comment, False)
    eq_(visible.deleted.user, False)

    suppressed_xml = u"""
    <revision>
      <id>233192</id>
      <timestamp>2001-01-21T02:12:21Z</timestamp>
      <contributor deleted="deleted"></contributor>
      <comment deleted="deleted" />
      <minor />
      <model>wikitext</model>
      <format>text/x-wiki</format>
      <text xml:space="preserve" deleted="deleted" />
      <sha1>8kul9tlwjm9oxgvqzbwuegt9b2830vw</sha1>
    </revision>
    """
    suppressed = Revision.from_element(
        ElementIterator.from_string(suppressed_xml))
    # Suppressed fields come back as None ...
    eq_(suppressed.user, None)
    eq_(suppressed.comment, None)
    eq_(suppressed.text, None)
    # ... and each one is reported through the matching deletion flag.
    eq_(suppressed.deleted.text, True)
    eq_(suppressed.deleted.comment, True)
    eq_(suppressed.deleted.user, True)
Exemple #7
0
def test_complete():
    """Walk every page and revision of SAMPLE_XML and check parsed fields."""
    dump = Dump.from_file(io.StringIO(SAMPLE_XML))
    eq_([0, 1], [ns.id for ns in dump.site_info.namespaces])

    # First page: plain article without redirect or restrictions.
    first_page = dump.next()
    eq_(first_page.title, u"Foo")
    eq_(first_page.namespace, 0)
    eq_(first_page.id, 1)
    eq_(first_page.redirect, None)
    eq_(first_page.restrictions, [])

    for expected_id, expected_stamp in ((1, u"2004-08-09T09:04:08Z"),
                                        (2, u"2004-08-10T09:04:08Z")):
        revision = first_page.next()
        eq_(revision.id, expected_id)
        eq_(revision.timestamp, mwtypes.Timestamp(expected_stamp))

    # Second page: carries a redirect target and a restriction string.
    second_page = dump.next()
    assert_is_instance(second_page, Page)
    eq_(second_page.title, u"Bar")
    eq_(second_page.namespace, 1)
    eq_(second_page.id, 2)
    eq_(second_page.redirect, u"Computer accessibility")
    eq_(second_page.restrictions, [u"edit=sysop:move=sysop"])

    for expected_id, expected_stamp in ((3, u"2004-08-11T09:04:08Z"),
                                        (4, u"2004-08-12T09:04:08Z")):
        revision = second_page.next()
        assert_is_instance(revision, Revision)
        eq_(revision.id, expected_id)
        eq_(revision.timestamp, mwtypes.Timestamp(expected_stamp))
Exemple #8
0
def extract_text(dump, page_labelings, verbose=False):
    """
    Extracts article text and metadata for labelings from an XML dump.

    For every namespace-0 page whose title appears in `page_labelings`, the
    revisions are walked in dump order.  Each labeling is paired with the
    most recent revision (with non-deleted text) that precedes the first
    revision newer than the labeling's timestamp.

    :Parameters:
        dump : :class:`mwxml.Dump`
            The XML dump file to extract text & metadata from
        page_labelings : `dict`
            Maps a page title to the list of labeling events (`dict` with a
            'timestamp' key) for that page.  Matched labelings are removed
            from the front of the list.
            NOTE(review): each list is assumed to be sorted by ascending
            'timestamp' -- confirm against the caller.
        verbose : `bool`
            Print dots and stuff

    :Returns:
        An `iterator` of labelings augmented with 'page_id', 'rev_id' and
        'text' ('text' is `None` when `not_an_article()` is true for the
        revision text).  A labeling is only emitted once a revision newer
        than it is seen, so labelings that can't be looked up are not
        included.
    """
    for page in dump:
        if page.namespace != 0 or page.title not in page_labelings:
            continue

        if verbose:
            sys.stderr.write("\n{0}: ".format(page.title))
            sys.stderr.flush()

        labelings = page_labelings[page.title]

        last_revision = None
        for revision in page:
            while last_revision is not None and \
                    len(labelings) > 0 and \
                    revision.timestamp > \
                    mwtypes.Timestamp(labelings[0]['timestamp']):
                # Remove the same labeling that was just compared (the
                # head of the list); popping the tail would pair a
                # different labeling with this revision.
                labeling = labelings.pop(0)
                labeling['page_id'] = page.id
                labeling['rev_id'] = last_revision.id
                if not_an_article(last_revision.text):
                    labeling['text'] = None
                else:
                    labeling['text'] = last_revision.text

                yield labeling

                if verbose:
                    sys.stderr.write("t")
                    sys.stderr.flush()

            # Don't update last_revision if the text was deleted
            if revision.text is not None:
                last_revision = revision
Exemple #9
0
def test_skipping():
    """
    Move on to the second page without reading any revision of the first,
    then check that the second page's first revision is still readable.
    """
    dump = Dump.from_file(io.StringIO(SAMPLE_XML))

    # Only the page header of "Foo" is inspected; its revisions are skipped.
    skipped_page = dump.next()
    eq_(skipped_page.title, u"Foo")
    eq_(skipped_page.namespace, 0)
    eq_(skipped_page.id, 1)

    target_page = dump.next()
    eq_(target_page.title, u"Bar")
    eq_(target_page.namespace, 1)
    eq_(target_page.id, 2)

    first_revision = target_page.next()
    eq_(first_revision.id, 3)
    eq_(first_revision.timestamp,
        mwtypes.Timestamp(u"2004-08-11T09:04:08Z"))
Exemple #10
0
import logging
from datetime import datetime

import mwtypes
from pytz import utc

from ...datasources import revision_oriented
from ...dependencies import DependentSet
from ..feature import Feature

# Fixed reference timestamp: midnight UTC on 1 January 2006.
# NOTE(review): presumably the stand-in used when a registration timestamp
# is missing -- confirm against the code that consumes it.
MW_REGISTRATION_EPOCH = mwtypes.Timestamp("2006-01-01T00:00:00Z")

# Module-level logger named after this module.
logger = logging.getLogger(__name__)


class Revision(DependentSet):
    "Represents a revision"

    def __init__(self, name, revision_datasources):
        super().__init__(name)
        self.datasources = revision_datasources

        self.day_of_week = Feature(name + ".day_of_week",
                                   _process_day_of_week,
                                   returns=int,
                                   depends_on=[revision_datasources.timestamp])
        "`int` : the day of week when the edit was made (in UTC)"

        self.hour_of_day = Feature(name + ".hour_of_day",
                                   _process_hour_of_day,
                                   returns=int,
Exemple #11
0
 def process(self, timestamp_str):
     """
     Converts `timestamp_str` into a `mwtypes.Timestamp`.  A falsy value
     (e.g. `None` or an empty string) is replaced by MW_REGISTRATION_EPOCH
     before the conversion.
     """
     if timestamp_str:
         return mwtypes.Timestamp(timestamp_str)
     return mwtypes.Timestamp(MW_REGISTRATION_EPOCH)
Exemple #12
0
    def from_element(cls, element):
        """
        Builds an instance from the child elements of a <revision>.

        :Parameters:
            element : `iterable`
                Yields sub-elements exposing `tag`, `text` and `attr()`

        :Returns:
            The result of calling `cls` with the parsed fields.  Children
            carrying a `deleted` attribute leave their value as `None` and
            set the matching flag of `cls.Deleted`.

        :Raises:
            `MalformedXML` when an unrecognised child tag is encountered
        """
        rev_id = None
        timestamp = None
        user = None
        minor = False
        comment = None
        text = None
        byte_count = None
        user_deleted = False
        comment_deleted = False
        text_deleted = False

        # Children whose text is stored verbatim, keyed by tag name.
        verbatim = {u"sha1": None, u"parentid": None,
                    u"model": None, u"format": None}

        for child in element:
            name = child.tag
            if name in verbatim:
                verbatim[name] = child.text
            elif name == u"id":
                rev_id = int(child.text)
            elif name == u"timestamp":
                timestamp = mwtypes.Timestamp(child.text)
            elif name == u"minor":
                # Presence of the (empty) tag is the flag.
                minor = True
            elif name == u"contributor":
                user_deleted = child.attr(u'deleted') is not None
                if not user_deleted:
                    user = User.from_element(child)
            elif name == u"comment":
                comment_deleted = child.attr(u'deleted') is not None
                if not comment_deleted:
                    comment = child.text
            elif name == u"text":
                text_deleted = child.attr(u'deleted') is not None
                if not text_deleted:
                    text = child.text
                # The `bytes` attribute is read even for deleted text.
                byte_count = child.attr(u'bytes')
            else:
                raise MalformedXML(u"Unexpected tag found when processing " +
                                   u"a <revision>: '{0}'".format(name))

        deletion_flags = cls.Deleted(comment=comment_deleted,
                                     text=text_deleted,
                                     user=user_deleted)

        return cls(rev_id,
                   timestamp,
                   user=user,
                   minor=minor,
                   str=byte_count,
                   sha1=verbatim[u"sha1"],
                   parent_id=verbatim[u"parentid"],
                   model=verbatim[u"model"],
                   format=verbatim[u"format"],
                   comment=comment,
                   text=text,
                   deleted=deletion_flags)
Exemple #13
0
    def from_element(cls, element, namespace_map=None):
        """
        Builds an instance from the child elements of a <logitem>.

        :Parameters:
            element : `iterable`
                Yields sub-elements exposing `tag`, `text` and `attr()`
            namespace_map : `dict`
                When given, passed to `extract_namespace()` together with
                the <logtitle> text to split it into namespace and title.
                When `None`, the whole <logtitle> text is used as the title
                and the namespace is left as `None`.

        :Returns:
            The result of calling `cls` with the parsed fields.  `text` is
            always `None`: a <text> child is only logged and ignored.

        :Raises:
            `MalformedXML` when an unrecognised child tag is encountered
        """
        id = None
        timestamp = None
        comment = None
        user = None
        page = None
        type = None
        action = None
        text = None
        params = None
        comment_deleted = None
        user_deleted = None

        for sub_element in element:
            tag = sub_element.tag
            if tag == "id":
                id = int(sub_element.text)
            elif tag == "timestamp":
                timestamp = mwtypes.Timestamp(sub_element.text)
            elif tag == "comment":
                comment_deleted = sub_element.attr('deleted') is not None
                if not comment_deleted:
                    comment = sub_element.text
            elif tag == "contributor":
                user_deleted = sub_element.attr('deleted') is not None
                if not user_deleted:
                    user = User.from_element(sub_element)
            elif tag == "logtitle":
                if sub_element.text is None:
                    namespace = None
                    title = None
                elif namespace_map is not None:
                    namespace, title = extract_namespace(
                        sub_element.text, namespace_map)
                else:
                    namespace = None
                    # The title is the <logtitle> text itself, not the text
                    # of the enclosing <logitem> element.
                    title = sub_element.text
                page = cls.Page(namespace=namespace, title=title)
            elif tag == "type":
                type = sub_element.text
            elif tag == "action":
                action = sub_element.text
            elif tag == "text":
                # `Logger.warn` is a deprecated alias of `warning`.
                logger.warning(
                    "A <text> tag was seen in a log item ... ignoring")
            elif tag == "params":
                params = sub_element.text
            else:
                raise MalformedXML("Unexpected tag found when processing " +
                                   "a <logitem>: '{0}'".format(tag))

        deleted = cls.Deleted(comment=comment_deleted, user=user_deleted)

        return cls(id=id,
                   timestamp=timestamp,
                   comment=comment,
                   user=user,
                   page=page,
                   type=type,
                   action=action,
                   text=text,
                   params=params,
                   deleted=deleted)