def get_wiki_info(session):
    """Query a wiki's siteinfo and collect its forbidden namespace names.

    Issues a single ``action=query&meta=siteinfo`` request through
    ``session.get`` asking for the namespaces, namespace aliases and
    general site properties.

    :Parameters:
        session
            An object exposing ``get(**params)`` that returns the decoded
            API response as nested dicts/lists.

    :Returns:
        A ``(lang, forbidden_namespaces)`` pair: the ``lang`` value of the
        ``general`` section and a ``set`` of lower-cased namespace names
        (local name, canonical name and aliases) whose id is listed in
        ``WikitextPreprocessor.FORBIDDEN_NAMESPACE_IDS``.
    """
    doc = session.get(action="query", meta="siteinfo",
                      siprop=["namespaces", "namespacealiases", "general"],
                      formatversion=2)
    query = doc['query']
    forbidden_ids = WikitextPreprocessor.FORBIDDEN_NAMESPACE_IDS

    forbidden_namespaces = set()
    for namespace in query['namespaces'].values():
        if namespace['id'] not in forbidden_ids:
            continue
        forbidden_namespaces.add(namespace['name'].lower())
        # Not every siteinfo namespace entry carries a 'canonical' name
        # (the main namespace has none), so do not assume the key exists.
        canonical = namespace.get('canonical')
        if canonical is not None:
            forbidden_namespaces.add(canonical.lower())

    for alias in query['namespacealiases']:
        if alias['id'] in forbidden_ids:
            forbidden_namespaces.add(alias['alias'].lower())

    return query['general']['lang'], forbidden_namespaces


def is_article(text):
    """Return True when `text` looks like real article content.

    Text is rejected when it is ``None``, shorter than 50 characters, or
    matches ``REDIRECT_RE`` at its start.
    """
    if text is None or len(text) < 50:
        return False
    return not REDIRECT_RE.match(text)


# Module-level streamer built from this module's docstring and name;
# `main` exposes its entry point.
streamer = mwcli.Streamer(__doc__, __name__, preprocess_text,
                          process_args=process_args,
                          file_reader=mwcli.Streamer.read_xml,
                          line_writer=mwcli.Streamer.write_line)
main = streamer.main
if len(args['--content-model']) == 0: allowed_content_models = None else: allowed_content_models = set(cm for cm in args['--content-model']) min_content_length = int(args['--min-content-length']) return { 'transformer': transformer, 'include_criteria': include_criteria, 'include_redirects': include_redirects, 'allowed_namespaces': allowed_namespaces, 'allowed_content_models': allowed_content_models, 'min_content_length': min_content_length } def process_param(kv): key, value_str = kv.split("=", 1) return key, json.loads(value_str) streamer = mwcli.Streamer(__doc__, __name__, transform_content, process_args=process_args, file_reader=mwcli.Streamer.read_xml, line_writer=mwcli.Streamer.write_json) main = streamer.main
for page in dump: if verbose: sys.stderr.write(page.title + u": ") sys.stderr.flush() for revision in page: yield revision.to_json() if verbose: sys.stderr.write(u".") sys.stderr.flush() if verbose: sys.stderr.write(u"\n") sys.stderr.flush() def process_args(args): return {} streamer = mwcli.Streamer( __doc__, __name__, dump2revdocs, process_args, file_reader=Dump.from_file ) main = streamer.main
seconds_possible = max(sunset - Timestamp(rev_doc['timestamp']), 0) return { 'revisions_processed': len(window), 'non_self_processed': sum(rd['user'] != rev_doc['user'] for rd, _ in window), 'seconds_possible': seconds_possible, 'tokens': [td for td in generate_token_docs(rev_doc, tokens_added)] } def generate_token_docs(rev_doc, tokens_added): for token in tokens_added: yield { "text": str(token), "persisted": len(token.revisions) - 1, "non_self_persisted": sum(u != rev_doc['user'] for u, _ in token.revisions), "seconds_visible": sum(sv for _, sv in token.revisions) } streamer = mwcli.Streamer(__doc__, __name__, _diffs2persistence, process_args) main = streamer.main
if verbose: if changed: sys.stderr.write(u"!") else: sys.stderr.write(u".") sys.stderr.flush() yield rev_doc def trim_dict(d): changed = False keys_to_del = [] for key, value in d.items(): if value is None: keys_to_del.append(key) elif isinstance(value, dict): changed = trim_dict(value) if len(value) == 0: keys_to_del.append(key) if len(keys_to_del) > 0: changed = True for key in keys_to_del: del d[key] return changed streamer = mwcli.Streamer(__doc__, __name__, normalize) main = streamer.main
processor threads? [default: <cpu_count>] --output=<path> Write output to a directory with one output file per input path. [default: <stdout>] --compress=<type> If set, output written to the output-dir will be compressed in this format. [default: bz2] --verbose Print progress information to stderr. Kind of a mess when running multi-threaded. --debug Print debug logs. """ from __future__ import absolute_import import json import jsonschema import mwcli from io import open def process_args(args): return {u'schema': json.load(open(args[u'--schema']))} def validate(docs, schema, verbose=False): for doc in docs: jsonschema.validate(doc, schema) yield doc streamer = mwcli.Streamer(__doc__, __name__, validate, process_args) main = streamer.main
include, exclude, keep_text=False, keep_diff=False, keep_tokens=False, verbose=False): diff_docs = mwdiffs.utilities.revdocs2diffs(rev_docs, diff_engine, namespaces, timeout) if not keep_text: diff_docs = mwdiffs.utilities.drop_text(diff_docs) persistence_docs = diffs2persistence(diff_docs, window_size, revert_radius, sunset, verbose=verbose) if not keep_diff: persistence_docs = drop_diff(persistence_docs) stats_docs = persistence2stats(persistence_docs, min_persisted, min_visible, include, exclude) if not keep_tokens: stats_docs = drop_tokens(stats_docs) yield from stats_docs streamer = mwcli.Streamer(__doc__, __name__, revdocs2stats, process_args) main = streamer.main
# Look for review threshold stats_doc['persistent_tokens'] += \ token_doc['persisted'] >= min_persisted stats_doc['non_self_persistent_tokens'] += \ token_doc['non_self_persisted'] >= min_persisted # Check for censoring if persistence_doc['seconds_possible'] < min_visible: stats_doc['censored'] = True stats_doc['non_self_censored'] = True else: if persistence_doc['revisions_processed'] < min_persisted: stats_doc['censored'] = True if persistence_doc['non_self_processed'] < min_persisted: stats_doc['non_self_censored'] = True if verbose: sys.stderr.write("\n") sys.stderr.flush() rev_doc['persistence'].update(stats_doc) yield rev_doc


# Module-level streamer built from this module's docstring and name;
# `main` exposes its entry point.
streamer = mwcli.Streamer(
    __doc__,
    __name__,
    _persistence2stats,
    process_args,
)
main = streamer.main
mess when running multi-threaded. --debug Print debug logs. """ from __future__ import absolute_import import sys import mwcli def _single_inflate(flat_json): inflated = {} flat_keys = flat_json.keys() for key in flat_keys: bottom_dict = inflated parts = key.split(u'_') for sub_key in parts[:-1]: if sub_key not in bottom_dict: bottom_dict[sub_key] = {} bottom_dict = bottom_dict[sub_key] bottom_dict[parts[-1]] = flat_json[key] return inflated def inflate(flat_jsons, verbose=False): for flat_json in flat_jsons: inflated = _single_inflate(flat_json) yield inflated streamer = mwcli.Streamer(__doc__, __name__, inflate) main = streamer.main
--compress=<type> If set, output written to the output-dir will be compressed in this format. [default: bz2] --verbose Print progress information to stderr. --debug Print debug logging to stderr. """ import logging import mwcli import mwxml from .revdocs2stats import process_args as revdocs2stats_args from .revdocs2stats import revdocs2stats logger = logging.getLogger(__name__) def dump2stats(dump, *args, **kwargs): rev_docs = mwxml.utilities.dump2revdocs(dump) stats_docs = revdocs2stats(rev_docs, *args, **kwargs) yield from stats_docs streamer = mwcli.Streamer(__doc__, __name__, dump2stats, revdocs2stats_args, file_reader=mwxml.Dump.from_file) main = streamer.main