Esempio n. 1
0
def get_wiki_info(session):
    """
    Fetch the wiki's language code and its forbidden namespace names.

    Issues a single ``siteinfo`` query through `session` and gathers the
    lower-cased local name, canonical name and aliases of every namespace
    whose id is listed in ``WikitextPreprocessor.FORBIDDEN_NAMESPACE_IDS``.

    :Parameters:
        session
            An object whose ``get()`` accepts the API query parameters as
            keyword arguments and returns the decoded response document.

    :Returns:
        A ``(lang, forbidden_namespaces)`` pair: the value of
        ``query.general.lang`` and a `set` of lower-cased namespace names.
    """
    site_info = session.get(
        action="query",
        meta="siteinfo",
        siprop=["namespaces", "namespacealiases", "general"],
        formatversion=2)

    forbidden = set()

    # Local and canonical names of each forbidden namespace.
    for ns in site_info['query']['namespaces'].values():
        if ns['id'] not in WikitextPreprocessor.FORBIDDEN_NAMESPACE_IDS:
            continue
        forbidden.update((ns['name'].lower(), ns['canonical'].lower()))

    # Aliases are listed separately from the namespaces themselves.
    for alias_doc in site_info['query']['namespacealiases']:
        if alias_doc['id'] not in WikitextPreprocessor.FORBIDDEN_NAMESPACE_IDS:
            continue
        forbidden.add(alias_doc['alias'].lower())

    return site_info['query']['general']['lang'], forbidden


def is_article(text):
    """
    Decide whether `text` should be treated as article content.

    Returns ``False`` when `text` is ``None``, is shorter than 50
    characters, or matches ``REDIRECT_RE`` at its start; ``True`` otherwise.
    """
    if text is None:
        return False
    if len(text) < 50:
        return False
    return not REDIRECT_RE.match(text)


# Command-line wiring: build an mwcli.Streamer from this module's docstring
# and name, with `preprocess_text` as the streamed function, `process_args`
# as the argument processor, `read_xml` as the file reader and `write_line`
# as the line writer.
streamer = mwcli.Streamer(__doc__,
                          __name__,
                          preprocess_text,
                          process_args=process_args,
                          file_reader=mwcli.Streamer.read_xml,
                          line_writer=mwcli.Streamer.write_line)

# Module-level entry point; delegates to the streamer built above.
main = streamer.main
Esempio n. 2
0
    if len(args['--content-model']) == 0:
        allowed_content_models = None
    else:
        allowed_content_models = set(cm for cm in args['--content-model'])

    min_content_length = int(args['--min-content-length'])

    return {
        'transformer': transformer,
        'include_criteria': include_criteria,
        'include_redirects': include_redirects,
        'allowed_namespaces': allowed_namespaces,
        'allowed_content_models': allowed_content_models,
        'min_content_length': min_content_length
    }


def process_param(kv):
    """
    Parse a ``key=<json>`` string into a ``(key, value)`` pair.

    Only the first ``=`` separates the key from the value, so the JSON part
    may itself contain ``=``.  Unpacking raises `ValueError` when `kv` has no
    ``=``; `json.loads` raises when the value part is not valid JSON.
    """
    name, raw_value = kv.split("=", 1)
    parsed_value = json.loads(raw_value)
    return name, parsed_value


# Command-line wiring: build an mwcli.Streamer from this module's docstring
# and name, with `transform_content` as the streamed function,
# `process_args` as the argument processor, `read_xml` as the file reader
# and `write_json` as the line writer.
streamer = mwcli.Streamer(__doc__,
                          __name__,
                          transform_content,
                          process_args=process_args,
                          file_reader=mwcli.Streamer.read_xml,
                          line_writer=mwcli.Streamer.write_json)

# Module-level entry point; delegates to the streamer built above.
main = streamer.main
Esempio n. 3
0
    for page in dump:

        if verbose:
            sys.stderr.write(page.title + u": ")
            sys.stderr.flush()

        for revision in page:
            yield revision.to_json()

            if verbose:
                sys.stderr.write(u".")
                sys.stderr.flush()

        if verbose:
            sys.stderr.write(u"\n")
            sys.stderr.flush()


def process_args(args):
    """
    Return the extra keyword arguments derived from the parsed `args`.

    Nothing is derived from `args` here, so the result is always an empty
    `dict`.
    """
    return dict()

# Command-line wiring: build an mwcli.Streamer from this module's docstring
# and name, with `dump2revdocs` as the streamed function, `process_args` as
# the argument processor and `Dump.from_file` as the file reader.
streamer = mwcli.Streamer(
    __doc__,
    __name__,
    dump2revdocs,
    process_args,
    file_reader=Dump.from_file
)

# Module-level entry point; delegates to the streamer built above.
main = streamer.main
    seconds_possible = max(sunset - Timestamp(rev_doc['timestamp']), 0)

    return {
        'revisions_processed':
        len(window),
        'non_self_processed':
        sum(rd['user'] != rev_doc['user'] for rd, _ in window),
        'seconds_possible':
        seconds_possible,
        'tokens': [td for td in generate_token_docs(rev_doc, tokens_added)]
    }


def generate_token_docs(rev_doc, tokens_added):
    """
    Yield one summary document per token in `tokens_added`.

    Each token must be convertible with `str()` and expose ``revisions``, a
    sized iterable of ``(user, seconds_visible)`` pairs.

    :Parameters:
        rev_doc : `dict`
            The revision document; only ``rev_doc['user']`` is read.
        tokens_added : `iterable`
            The tokens to summarize.

    :Returns:
        A generator of `dict` with the keys ``text``, ``persisted``,
        ``non_self_persisted`` and ``seconds_visible``.
    """
    for tok in tokens_added:
        text = str(tok)
        # One entry of `revisions` is not counted as persistence.
        persisted = len(tok.revisions) - 1
        # Entries whose user differs from the user of `rev_doc`.
        by_others = sum(
            user != rev_doc['user'] for user, _ in tok.revisions)
        visible = sum(seconds for _, seconds in tok.revisions)

        yield {
            "text": text,
            "persisted": persisted,
            "non_self_persisted": by_others,
            "seconds_visible": visible
        }


# Command-line wiring: build an mwcli.Streamer from this module's docstring
# and name, with `_diffs2persistence` as the streamed function and
# `process_args` as the argument processor.
streamer = mwcli.Streamer(__doc__, __name__, _diffs2persistence, process_args)
# Module-level entry point; delegates to the streamer built above.
main = streamer.main
Esempio n. 5
0
        if verbose:
            if changed:
                sys.stderr.write(u"!")
            else:
                sys.stderr.write(u".")
            sys.stderr.flush()

        yield rev_doc


def trim_dict(d):
    """
    Remove ``None`` values and empty nested dicts from `d`, in place.

    Nested dicts are trimmed recursively first; a nested dict that ends up
    empty (or was empty to begin with) is removed from its parent as well.

    :Parameters:
        d : `dict`
            The dictionary to trim.  It is modified in place.

    :Returns:
        ``True`` if anything was removed at any depth, ``False`` otherwise.
    """
    changed = False
    keys_to_del = []
    for key, value in d.items():
        if value is None:
            keys_to_del.append(key)
        elif isinstance(value, dict):
            # Accumulate the flag: a plain assignment would let a later,
            # untouched nested dict erase a change made in an earlier one.
            changed = trim_dict(value) or changed
            if not value:
                keys_to_del.append(key)

    # Delete after the loop; a dict must not shrink while being iterated.
    for key in keys_to_del:
        del d[key]

    return changed or bool(keys_to_del)


# Command-line wiring: build an mwcli.Streamer from this module's docstring
# and name, with `normalize` as the streamed function.
streamer = mwcli.Streamer(__doc__, __name__, normalize)
# Module-level entry point; delegates to the streamer built above.
main = streamer.main
Esempio n. 6
0
                        processor threads? [default: <cpu_count>]
    --output=<path>     Write output to a directory with one output file
                        per input path.  [default: <stdout>]
    --compress=<type>   If set, output written to the output-dir will be
                        compressed in this format. [default: bz2]
    --verbose           Print progress information to stderr.  Kind of a
                        mess when running multi-threaded.
    --debug             Print debug logs.
"""
from __future__ import absolute_import
import json

import jsonschema
import mwcli
from io import open


def process_args(args):
    """
    Load the JSON schema named by the ``--schema`` argument.

    :Parameters:
        args : `dict`
            Parsed command-line arguments; ``args['--schema']`` is the path
            of a JSON file.

    :Returns:
        A `dict` with the single key ``schema`` holding the decoded JSON.
    """
    # Use a context manager so the file handle is closed deterministically
    # rather than left for the garbage collector.
    with open(args[u'--schema']) as schema_file:
        return {u'schema': json.load(schema_file)}


def validate(docs, schema, verbose=False):
    """
    Yield each document of `docs` after checking it against `schema`.

    Every document is passed to ``jsonschema.validate`` before it is
    yielded, so any exception raised by that call propagates to the
    consumer and ends the stream.

    :Parameters:
        docs : `iterable`
            The documents to check.
        schema
            The schema handed to ``jsonschema.validate``.
        verbose : `bool`
            Accepted but not used by this function.
    """
    for candidate in docs:
        jsonschema.validate(candidate, schema)
        yield candidate


# Command-line wiring: build an mwcli.Streamer from this module's docstring
# and name, with `validate` as the streamed function and `process_args` as
# the argument processor.
streamer = mwcli.Streamer(__doc__, __name__, validate, process_args)

# Module-level entry point; delegates to the streamer built above.
main = streamer.main
Esempio n. 7
0
                  include,
                  exclude,
                  keep_text=False,
                  keep_diff=False,
                  keep_tokens=False,
                  verbose=False):

    diff_docs = mwdiffs.utilities.revdocs2diffs(rev_docs, diff_engine,
                                                namespaces, timeout)
    if not keep_text:
        diff_docs = mwdiffs.utilities.drop_text(diff_docs)

    persistence_docs = diffs2persistence(diff_docs,
                                         window_size,
                                         revert_radius,
                                         sunset,
                                         verbose=verbose)
    if not keep_diff:
        persistence_docs = drop_diff(persistence_docs)

    stats_docs = persistence2stats(persistence_docs, min_persisted,
                                   min_visible, include, exclude)
    if not keep_tokens:
        stats_docs = drop_tokens(stats_docs)

    yield from stats_docs


# Command-line wiring: build an mwcli.Streamer from this module's docstring
# and name, with `revdocs2stats` as the streamed function and
# `process_args` as the argument processor.
streamer = mwcli.Streamer(__doc__, __name__, revdocs2stats, process_args)
# Module-level entry point; delegates to the streamer built above.
main = streamer.main
                # Look for review threshold
                stats_doc['persistent_tokens'] += \
                    token_doc['persisted'] >= min_persisted

                stats_doc['non_self_persistent_tokens'] += \
                    token_doc['non_self_persisted'] >= min_persisted

                # Check for censoring
                if persistence_doc['seconds_possible'] < min_visible:
                    stats_doc['censored'] = True
                    stats_doc['non_self_censored'] = True

                else:
                    if persistence_doc['revisions_processed'] < min_persisted:
                        stats_doc['censored'] = True

                    if persistence_doc['non_self_processed'] < min_persisted:
                        stats_doc['non_self_censored'] = True

        if verbose:
            sys.stderr.write("\n")
            sys.stderr.flush()

        rev_doc['persistence'].update(stats_doc)

        yield rev_doc


# Command-line wiring: build an mwcli.Streamer from this module's docstring
# and name, with `_persistence2stats` as the streamed function and
# `process_args` as the argument processor.
streamer = mwcli.Streamer(__doc__, __name__, _persistence2stats, process_args)
# Module-level entry point; delegates to the streamer built above.
main = streamer.main
Esempio n. 9
0
                            mess when running multi-threaded.
        --debug             Print debug logs.
"""
from __future__ import absolute_import
import sys
import mwcli


def _single_inflate(flat_json):
    """
    Turn one flat mapping into a nested `dict`.

    Each key is split on ``_``; every part but the last names a nested dict
    (created on demand) and the last part names the slot that receives the
    value.  For example ``{"a_b": 1}`` becomes ``{"a": {"b": 1}}``.
    """
    nested = {}
    for flat_key, value in flat_json.items():
        path = flat_key.split(u'_')
        leaf = path.pop()

        # Walk down from the root, creating intermediate levels as needed.
        cursor = nested
        for part in path:
            if part not in cursor:
                cursor[part] = {}
            cursor = cursor[part]

        cursor[leaf] = value
    return nested


def inflate(flat_jsons, verbose=False):
    """
    Yield the nested form of every flat document in `flat_jsons`.

    :Parameters:
        flat_jsons : `iterable`
            Flat mappings whose keys use ``_`` to separate nesting levels.
        verbose : `bool`
            Accepted but not used by this function.
    """
    for flat_doc in flat_jsons:
        yield _single_inflate(flat_doc)


# Command-line wiring: build an mwcli.Streamer from this module's docstring
# and name, with `inflate` as the streamed function.
streamer = mwcli.Streamer(__doc__, __name__, inflate)
# Module-level entry point; delegates to the streamer built above.
main = streamer.main
Esempio n. 10
0
        --compress=<type>       If set, output written to the output-dir will
                                be compressed in this format. [default: bz2]
        --verbose               Print progress information to stderr.
        --debug                 Print debug logging to stderr.
"""
import logging

import mwcli
import mwxml

from .revdocs2stats import process_args as revdocs2stats_args
from .revdocs2stats import revdocs2stats

logger = logging.getLogger(__name__)


def dump2stats(dump, *args, **kwargs):
    """
    Yield statistics documents computed from the revisions of `dump`.

    The dump is first converted with ``mwxml.utilities.dump2revdocs``; the
    result is then handed to `revdocs2stats` together with every extra
    positional and keyword argument, and its output is yielded unchanged.

    :Parameters:
        dump
            The dump passed to ``mwxml.utilities.dump2revdocs``.
        *args, **kwargs
            Forwarded verbatim to `revdocs2stats`.
    """
    revision_docs = mwxml.utilities.dump2revdocs(dump)
    yield from revdocs2stats(revision_docs, *args, **kwargs)


# Command-line wiring: build an mwcli.Streamer from this module's docstring
# and name, with `dump2stats` as the streamed function, the argument
# processor imported from `revdocs2stats`, and `mwxml.Dump.from_file` as
# the file reader.
streamer = mwcli.Streamer(__doc__,
                          __name__,
                          dump2stats,
                          revdocs2stats_args,
                          file_reader=mwxml.Dump.from_file)
# Module-level entry point; delegates to the streamer built above.
main = streamer.main