Code example #1
def extract(dump_files, extractors=ALL_EXTRACTORS):
    """
    Extracts cites from a set of `dump_files`.

    :Parameters:
        dump_files : str | `file`
            A set of MediaWiki XML dump files
            (expects: pages-meta-history)
        extractors : `list`(`extractor`)
            A list of extractors to apply to the text

    :Returns:
        `iterable` -- a generator of extracted cites

    """
    # Dump processor function
    def process_dump(dump, path):
        for page in dump:
            if page.namespace != 0:
                continue
            for cite in extract_cite_history(page, extractors):
                yield cite

    # Map call
    return mwxml.map(process_dump, dump_files)
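
Note: every example in this listing follows the same contract: `mwxml.map` takes a processor callable of the form `(dump, path)` plus a list of dump paths, runs the processor over each dump (optionally in parallel via `threads=`), and chains everything the processors yield into one generator. A minimal self-contained sketch of that calling convention (the dump path below is hypothetical):

import mwxml

def count_pages(dump, path):
    # Processor: receives one mwxml.Dump plus the path it was read from.
    n = 0
    for page in dump:
        n += 1
    yield path, n

# Hypothetical path; any MediaWiki XML dump file would do.
paths = ["enwiki-pages-meta-history1.xml.bz2"]
for path, n in mwxml.map(count_pages, paths, threads=2):
    print(path, n)
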
Code example #2
File: extract.py  Project: tab1tha/TopicContribs
def analyse_dumps(dumps, cohorts, pages, threads=None):
    results = _init_cohort_contribs(cohorts)
    _partial = partial(_analyse_single_dump, cohorts=cohorts, pages=pages)
    for sub_res in mwxml.map(_partial, dumps, threads=threads):
        for cohort in sub_res:
            results[cohort].update(sub_res[cohort])
    return results
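
Note: `_init_cohort_contribs` and `_analyse_single_dump` are not shown. Since each per-cohort result only needs to support `.update()` with the per-dump sub-results, a plausible (hypothetical) initializer is:

from collections import Counter

def _init_cohort_contribs(cohorts):
    # One mergeable tally per cohort; Counter.update() adds counts from
    # each sub-result rather than overwriting them.
    return {cohort: Counter() for cohort in cohorts}
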
Code example #3
def run(paths,
        model,
        sunset,
        score_at,
        rev_scores,
        skip_scores_before,
        processes,
        verbose=False):

    if score_at == "revision":
        process_dump = revision_scores(model, sunset, skip_scores_before)
    elif score_at == "latest":
        process_dump = latest_scores(model, sunset, skip_scores_before)
    else:
        sunset_year = int(sunset.strftime("%Y"))
        if score_at == "monthly":
            dates = chain(*(zip([year] * 12, MONTHS)
                            for year in range(START_YEAR, sunset_year + 1)))
            thresholds = [
                mwtypes.Timestamp(str(year) + month + "01000000")
                for year, month in dates
            ]
        elif score_at == "biannually":
            dates = chain(*(zip([year] * 2, ["01", "07"])
                            for year in range(START_YEAR, sunset_year + 1)))
            thresholds = [
                mwtypes.Timestamp(str(year) + month + "01000000")
                for year, month in dates
            ]
        elif score_at == "annually":
            thresholds = [
                mwtypes.Timestamp(str(year) + "0101000000")
                for year in range(START_YEAR, sunset_year + 1)
            ]
        else:
            raise RuntimeError(
                "{0} is not a valid 'score_at' value".format(score_at))
        process_dump = threshold_scores(model, sunset, skip_scores_before,
                                        thresholds)

    results = mwxml.map(process_dump, paths, threads=processes)
    for page_id, title, rev_id, timestamp, (e, score) in results:
        if e is not None:
            logger.error("Error while processing {0}({1}) @ {2}: {3}".format(
                title, page_id, rev_id, e))
            continue

        weighted_sum = sum(CLASS_WEIGHTS[cls] * score['probability'][cls]
                           for cls in score['probability'])
        rev_scores.write([
            page_id, title, rev_id,
            timestamp.short_format(), score['prediction'], weighted_sum
        ])

        if verbose:
            sys.stderr.write(score['prediction'] + " ")
            sys.stderr.flush()

    if verbose:
        sys.stderr.write("\n")
Code example #4
def run(dump_files):

    def process_dump(dump, path):
        for page in dump:
            if page.namespace != 3:
                continue

            last_templates = defaultdict(lambda: 0)
            for revision in page:
                if revision.text is None:
                    continue

                current_templates = defaultdict(lambda: 0)
                for template in subst_templates.extract(revision.text):
                    current_templates[template] += 1

                for template, count in current_templates.items():
                    diff = count - last_templates[template]
                    if diff > 0:
                        yield (page.id, page.namespace, page.title,
                               revision.id, revision.timestamp,
                               revision.comment, template, diff)

                # Carry this revision's counts forward for the next comparison
                # (the original assigned in the wrong direction; compare
                # example #12, which does this correctly).
                last_templates = current_templates

    writer = mysqltsv.Writer(sys.stdout, headers=HEADERS)

    for values in mwxml.map(process_dump, dump_files):
        sys.stderr.write(".")
        writer.write(values)

    sys.stderr.write("\n")
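
Note: `subst_templates.extract` is a project-specific module not shown here. As a rough illustration only, a hypothetical stand-in that pulls template names out of wikitext might look like this (the real module presumably matches whatever markers substituted templates leave behind):

import re

SUBST_RE = re.compile(r"\{\{subst:([^}|]+)", re.IGNORECASE)

def extract(text):
    # Yields the name of each {{subst:...}} call still present in the text.
    for match in SUBST_RE.finditer(text):
        yield match.group(1).strip().lower()
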
Code example #5
def run(dump_paths, threads, output, verbose=False, extractor=None):

    if len(dump_paths) == 0:
        label_events = extract_labelings(mwxml.Dump.from_file(sys.stdin),
                                         extractor, verbose=verbose)

    else:
        label_events = mwxml.map(lambda d, p:
                                 extract_labelings(d, extractor, verbose),
                                 dump_paths, threads=threads)

    for labeling in label_events:
        dump_observation(labeling, output)
Code example #6
def run(dump_paths, threads, output, verbose=False):

    if len(dump_paths) == 0:
        user_edits = extract_rev_data(mwxml.Dump.from_file(sys.stdin),
                                      verbose=verbose)

    else:
        user_edits = mwxml.map(lambda d, p: extract_rev_data(d, verbose),
                               dump_paths,
                               threads=threads)

    for edit in user_edits:
        json.dump(edit, output)
        output.write("\n")
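
Note: `extract_rev_data` is not shown. Since `run()` json-dumps each yielded item, it presumably yields JSON-serializable records; a minimal hypothetical sketch:

import sys

def extract_rev_data(dump, verbose=False):
    # Hypothetical sketch: one JSON-serializable record per revision,
    # matching how run() serializes each yielded item with json.dump.
    for page in dump:
        for revision in page:
            if verbose:
                sys.stderr.write(".")
                sys.stderr.flush()
            yield {"page_id": page.id,
                   "rev_id": revision.id,
                   "timestamp": str(revision.timestamp)}
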
Code example #7
def run(page_periods, scorer_model, dump_paths):

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s'
    )

    writer = mysqltsv.Writer(sys.stdout, headers=HEADERS)

    def process_dump(dump, path):

        for page in dump:
            if page.namespace != 0 or page.id not in page_periods:
                continue
            else:
                start_id, end_id = page_periods[page.id]

            #sys.stderr.write(page.title + ": ");sys.stderr.flush()

            pre_period_revision = None
            for revision in page:
                if revision.id < start_id:
                    #sys.stderr.write(".");sys.stderr.flush()
                    pre_period_revision = revision

                if revision.id == end_id:
                    if pre_period_revision is not None:
                        start_text = pre_period_revision.text
                        start_id = pre_period_revision.id
                    else:
                        start_text = ""
                        start_id = None
                    
                    start_score = generate_score(scorer_model, start_text)
                    #sys.stderr.write("s1");sys.stderr.flush()
                    end_score = generate_score(scorer_model, revision.text)
                    #sys.stderr.write("s2");sys.stderr.flush()
                    yield (page.id,
                           start_id,
                           start_score['prediction'], weighted_sum(start_score),
                           revision.id,
                           end_score['prediction'], weighted_sum(end_score))

                    break

            #sys.stderr.write("\n")

    for values in mwxml.map(process_dump, dump_paths):
        writer.write(values)
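
Note: the `weighted_sum` helper called above is not shown. Examples #3 and #8 compute the same quantity inline, so a plausible reconstruction is the following (the CLASS_WEIGHTS values here are hypothetical; those examples only show that a module-level mapping of that name exists):

# Hypothetical weights; examples #3 and #8 rely on a module-level CLASS_WEIGHTS.
CLASS_WEIGHTS = {"Stub": 0, "Start": 1, "C": 2, "B": 3, "GA": 4, "FA": 5}

def weighted_sum(score):
    # Expected class value under the model's probability distribution,
    # mirroring the inline computation in examples #3 and #8.
    return sum(CLASS_WEIGHTS[cls] * score['probability'][cls]
               for cls in score['probability'])
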
Code example #8
File: extract_scores.py  Project: wiki-ai/wikiclass
def run(paths, model, sunset, score_at, rev_scores, skip_scores_before,
        processes, verbose=False):

    if score_at == "revision":
        process_dump = revision_scores(model, sunset, skip_scores_before)
    elif score_at == "latest":
        process_dump = latest_scores(model, sunset, skip_scores_before)
    else:
        sunset_year = int(sunset.strftime("%Y"))
        if score_at == "monthly":
            dates = chain(*(zip([year] * 12, MONTHS)
                          for year in range(START_YEAR, sunset_year + 1)))
            thresholds = [mwtypes.Timestamp(str(year) + month + "01000000")
                          for year, month in dates]
        elif score_at == "biannually":
            dates = chain(*(zip([year] * 2, ["01", "07"])
                          for year in range(START_YEAR, sunset_year + 1)))
            thresholds = [mwtypes.Timestamp(str(year) + month + "01000000")
                          for year, month in dates]
        elif score_at == "annually":
            thresholds = [mwtypes.Timestamp(str(year) + "0101000000")
                          for year in range(START_YEAR, sunset_year + 1)]
        else:
            raise RuntimeError("{0} is not a valid 'score_at' value"
                               .format(score_at))
        process_dump = threshold_scores(
            model, sunset, skip_scores_before, thresholds)

    results = mwxml.map(process_dump, paths, threads=processes)
    for page_id, title, rev_id, timestamp, (e, score) in results:
        if e is not None:
            logger.error("Error while processing {0}({1}) @ {2}: {3}"
                         .format(title, page_id, rev_id, e))
            continue

        weighted_sum = sum(CLASS_WEIGHTS[cls] * score['probability'][cls]
                           for cls in score['probability'])
        rev_scores.write(
            [page_id, title, rev_id, timestamp.short_format(),
             score['prediction'], weighted_sum])

        if verbose:
            sys.stderr.write(score['prediction'] + " ")
            sys.stderr.flush()

    if verbose:
        sys.stderr.write("\n")
Code example #9
File: extract_text.py  Project: wiki-ai/wikiclass
def run(dump_paths, page_labelings, output, threads, verbose=False):
    logging.basicConfig(
        level=logging.DEBUG if verbose else logging.WARNING,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s'
    )

    if len(dump_paths) == 0:
        labelings = extract_text(mwxml.Dump.from_file(sys.stdin),
                                 page_labelings, verbose=verbose)

    else:
        labelings = mwxml.map(lambda d, p:
                              extract_text(d, page_labelings, verbose),
                              dump_paths, threads=threads)

    for labeling in labelings:
        dump_observation(labeling, output)
Code example #10
def run(dump_paths, page_labelings, output, threads, verbose=False):
    logging.basicConfig(
        level=logging.DEBUG if verbose else logging.WARNING,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s')

    if len(dump_paths) == 0:
        labelings = extract_text(mwxml.Dump.from_file(sys.stdin),
                                 page_labelings,
                                 verbose=verbose)

    else:
        labelings = mwxml.map(lambda d, p:
                              extract_text(d, page_labelings, verbose),
                              dump_paths, threads=threads)

    for labeling in labelings:
        dump_observation(labeling, output)
Code example #11
def run(dump_paths, threads, output, verbose=False, extractor=None):
    logging.basicConfig(
        level=logging.DEBUG if verbose else logging.WARNING,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s'
    )


    if len(dump_paths) == 0:
        label_events = dump2labels(mwxml.Dump.from_file(sys.stdin),
                                   extractor, verbose=verbose)

    else:
        label_events = mwxml.map(lambda d, p:
                                 dump2labels(d, extractor, verbose),
                                 dump_paths, threads=threads)

    for labeling in label_events:

        json.dump(labeling, output)
        output.write("\n")
Code example #12
def run(dump_files):
    def process_dump(dump, path):
        for page in dump:
            if page.namespace != 3:
                continue

            last_links = defaultdict(lambda: 0)
            for revision in page:
                if revision.text is None:
                    continue

                current_links = defaultdict(lambda: 0)
                for link in teahouse_links.extract(revision.text):
                    current_links[link] += 1

                for link, count in current_links.items():
                    diff = current_links[link] - last_links[link]
                    if diff > 0:
                        yield (
                            page.id,
                            page.namespace,
                            page.title,
                            revision.id,
                            revision.timestamp,
                            revision.comment,
                            link,
                            diff,
                        )

                last_links = current_links

    writer = mysqltsv.Writer(sys.stdout, headers=HEADERS)

    for values in mwxml.map(process_dump, dump_files):
        sys.stderr.write(".")
        writer.write(values)

    sys.stderr.write("\n")
Code example #13
def parse(src, dst):
    files = glob.glob(src)

    with open(dst, 'w', buffering=1000) as f:
        for r in mwxml.map(map_, files):
            f.write('\t'.join(map(str, r)) + '\n')
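
Note: the `map_` processor used here is defined elsewhere in the project. Because each result `r` is str()-joined with tabs and written one per line, it presumably yields flat tuples; a minimal hypothetical stand-in:

def map_(dump, path):
    # Hypothetical processor: one (page_id, title) row per main-namespace page.
    for page in dump:
        if page.namespace == 0:
            yield page.id, page.title
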
Code example #14
        if page.redirect:
            page_title_rd = normalise_title(page.redirect)
        else:
            page_title_rd = None
        yield page_id, page_title, page_title_rd
    print("Done processing path:", path)


# we get two dictionaries
# pageids={page_title:page_id} ## only for non-redirect-pages
# redirects={page_title:page_title_rd}, where page_title_rd is the redirected-to page-title ## only for redirect pages
print('1st pass: getting pageids and redirects tables')
pageids = {}
redirects = {}
pbar = tqdm()
for page_id, page_title, page_title_rd in mwxml.map(
        process_dump_pageids_redirects, paths, threads=threads):
    if page_title_rd is None:
        pageids[page_title] = page_id
    elif len(page_title_rd) > 0:
        redirects[page_title] = page_title_rd
    pbar.update(1)
pbar.close()

print('extraction done.')
print("Number of pages", len(pageids))
print("Number of redirects", len(redirects))

################################################
# dump iteration on each page to get links and anchors
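
Note: example #14 begins mid-function; the fragment at its top is the tail of the per-page loop of `process_dump_pageids_redirects`. A sketch of how the missing head might look, consistent with that tail (the `normalise_title` stand-in is hypothetical):

def normalise_title(title):
    # Hypothetical stand-in for the project's title-normalisation helper.
    return title.strip().replace(" ", "_")

def process_dump_pageids_redirects(dump, path):
    # Hypothetical reconstruction of the truncated function head; everything
    # from "if page.redirect:" onward matches the fragment above.
    for page in dump:
        page_id = page.id
        page_title = normalise_title(page.title)
        if page.redirect:
            page_title_rd = normalise_title(page.redirect)
        else:
            page_title_rd = None
        yield page_id, page_title, page_title_rd
    print("Done processing path:", path)
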
Code example #15
File: extract_damaging.py  Project: tgr/editquality
def run(paths, session, start, end, revert_radius, revert_window,
        reverted_only, trusted_users, trusted_edits, rev_reverteds,
        check_blocked, verbose=False):

    def process_dump(dump, path):
        for page in dump:
            detector = mwreverts.Detector(radius=revert_radius)
            window = deque(maxlen=revert_radius)
            for revision in page:
                revision.maybe_damaging = None
                revision.reason = None
                revert = detector.process(revision.sha1, revision)

                if start and revision.timestamp < start:
                    continue
                if end and revision.timestamp > end:
                    continue
                window.append(revision)

                if revert is not None:
                    # A revert!
                    for reverted in revert.reverteds:
                        if (revision.timestamp -
                            reverted.timestamp) <= revert_window and \
                           reverted.user is not None and \
                           revision.user is not None and \
                           reverted.user.text != revision.user.text and \
                           reverted.maybe_damaging is not False:
                            # Happened within the window
                            # wasn't a self revert and hasn't
                            # already been marked good.
                            reverted.maybe_damaging = True
                            reverted.reason = "Reverted by someone else"

                    if revert.reverted_to.maybe_damaging and \
                       revert.reverted_to.user.text != revision.user.text:
                        # Reverted back by someone else.  Mark it good
                        # again.
                        revert.reverted_to.maybe_damaging = False
                        revert.reverted_to.reason = "Reverted back by " + \
                                                    "someone else"

                # Get user info
                load_user_data = trusted_edits or check_blocked
                if revision.user.id is not None and revision.user.id > 0 and \
                        load_user_data:
                    info = load_user_info(revision.user.text, session)
                else:
                    info = User(revision.user.id, 0, set())

                two_days_later = revision.timestamp + (60 * 60 * 24 * 2)
                if trusted_users and info.id in trusted_users:
                    revision.maybe_damaging = False
                    revision.reason = "In trusted group"
                elif check_blocked and user_recently_blocked(
                        revision.user.text, session, two_days_later):
                    # User was blocked. Edits may be damaging!
                    revision.maybe_damaging = True
                    revision.reason = "User was blocked from editing"
                elif trusted_edits and info.editcount >= trusted_edits:
                    revision.maybe_damaging = False
                    revision.reason = "Enough edits to be trusted"
                else:
                    revision.reason = "Unknown"

                if len(window) == revert_radius:
                    old_revision = window.popleft()
                    yield (old_revision.id, old_revision.maybe_damaging,
                           old_revision.reason)

            for old_revision in window:
                yield (old_revision.id, old_revision.maybe_damaging,
                       old_revision.reason)

    for rev_id, maybe_damaging, reason in mwxml.map(process_dump, paths):
        rev_reverteds.write([rev_id, bool(maybe_damaging), reason])
        if maybe_damaging:
            if reason and "Reverted" in reason:
                if verbose:
                    sys.stderr.write("r")
                    sys.stderr.flush()
            elif reason and "blocked" in reason:
                if verbose:
                    sys.stderr.write("b")
                    sys.stderr.flush()
            else:  # "Unknown" in reason:
                if verbose:
                    sys.stderr.write(".")
                    sys.stderr.flush()
        else:
            if reason and "edits" in reason:
                if verbose:
                    sys.stderr.write("e")
                    sys.stderr.flush()
            elif reason and "Unknown" in reason:
                if verbose:
                    sys.stderr.write(".")
                    sys.stderr.flush()
            else:  # "group" in reason:
                if verbose:
                    sys.stderr.write("g")
                    sys.stderr.flush()

    if verbose:
        sys.stderr.write("\n")
Code example #16

##################
# Global variables
# TODO: if necessary, parametrize the script to take these as input
count = 0
# LIMIT_SENTS = 200000 ## see above
# Storage array of sentences
wiki_links = []

##################
# mwxml parallelism
# this might apply to english only
# maybe it's possible to split other dumps to part-files
pbar = tqdm(total=LIMIT_SENTS)
for title, sentence in mwxml.map(process_dump, paths, threads=10):
    wiki_links.append((title, sentence))
    count += 1
    if count >= LIMIT_SENTS:
        break
    pbar.update(1)
pbar.close()
print('number of sentences extracted', len(wiki_links))

##################
# store two files
# training: to be used by generate_training_data.py; in the future we might want to use the whole sentence.
# test: this is effectively the data used by the backtesting protocol.
LIMIT_SENTS_SPLIT = len(wiki_links) // 2
print(LIMIT_SENTS_SPLIT)
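
Note: the storage step is cut off here; example #18 shows the matching `open()` call for the training half. A hypothetical continuation that writes the two halves (the test path and CSV layout are assumptions):

import csv

# First half for training, second half for the backtesting protocol.
with open("../../data/en/training/sentences_train.csv", "w") as f:
    writer = csv.writer(f)
    for title, sentence in wiki_links[:LIMIT_SENTS_SPLIT]:
        writer.writerow([title, sentence])

with open("../../data/en/test/sentences_test.csv", "w") as f:
    writer = csv.writer(f)
    for title, sentence in wiki_links[LIMIT_SENTS_SPLIT:]:
        writer.writerow([title, sentence])
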
Code example #17
def run(input_files, revisions_output_file, verbose):

    def process_pages(stub_file_dump_object, file_url):
        for stub_file_page in stub_file_dump_object:
        
            for stub_file_page_revision in stub_file_page:

                revision_comment = stub_file_page_revision.comment

                if revision_comment is None:
                    revision_comment = "NULL"

                if stub_file_page_revision.user is None:       
                    logger.warning(
                        "No user. Fields will be NULL. Revision: {0}"
                        .format(stub_file_page_revision))
                    revision_user_id = "NULL"
                    revision_user_text = "NULL"
                elif stub_file_page_revision.user.id is None:
                    revision_user_id = "NULL"
                    revision_user_text = stub_file_page_revision.user.text
                else:
                    revision_user_id = stub_file_page_revision.user.id
                    revision_user_text = stub_file_page_revision.user.text


                timestamp = datetime.datetime.utcfromtimestamp(
                    stub_file_page_revision.timestamp)

                # Format as YYYYMMDDHHMMSS.
                cast_timestamp = timestamp.strftime("%Y%m%d%H%M%S")

                yield (stub_file_page_revision.page.title,
                       stub_file_page_revision.id,
                       revision_user_id,
                       revision_user_text,
                       revision_comment,
                       stub_file_page.namespace,
                       cast_timestamp)

    i = 0
    for (title, revision_id, user_id, user_text, comment, namespace,
         timestamp) in mwxml.map(process_pages, input_files):
        i += 1
        revisions_output_file.write([title, revision_id, user_id, user_text,
                                     comment, namespace, timestamp])

        if verbose and i % 10000 == 0:
            sys.stderr.write("Revisions processed: {0}\n".format(i))  
            sys.stderr.flush()

    if verbose:
        sys.stderr.write("Completed writing out result file\n")
        sys.stderr.flush()
Code example #18
##################
# Global variables
# TODO: if necessary, parametrize the script to take these as input
count = 0
LIMIT_SENTS = 200000
LIMIT_SENTS_SPLIT = LIMIT_SENTS // 2
# Storage array of sentences
wiki_links = []

##################
# mwxml parallelism
# this might apply to english only
# maybe it's possible to split other dumps to part-files
pbar = tqdm(total=LIMIT_SENTS)
for title, sentence in mwxml.map(process_dump, paths):
    wiki_links.append((title, sentence))
    count += 1
    if count >= LIMIT_SENTS:
        break
    pbar.update(1)
pbar.close()
print('number of sentences extracted', len(wiki_links))

##################
# store two files
# training: to be used by generate_training_data.py; in the future we might want to use the whole sentence.
# test: this is effectively the data used by the backtesting protocol.

# Store the sentences for training
with open("../../data/en/training/sentences_train.csv", "w") as f:
Code example #19
                        process_language(detect(
                            wikicode.strip_code().strip())))
                except Exception:
                    pass

                languages = list(set(languages))
                detected_languages = list(set(detected_languages))

                yield page.id, wikitext_length, has_infobox, has_description_field, str(
                    languages), str(detected_languages)

    print("total files: " + str(number_of_files))


output = mysqltsv.Writer(
    open("data/sdoc/commonswiki_20171120_files_description.tsv", "w"),
    headers=[
        "page_id", "wikitext_length", "has_infobox", "has_description_field",
        "languages", "detected_languages"
    ])

for page_id, wikitext_length, has_infobox, has_description_field, languages, detected_languages in mwxml.map(
        process_dump, dump_files):
    output.write([
        page_id, wikitext_length, has_infobox, has_description_field,
        languages, detected_languages
    ])

# compress the tsv file
# tar -czvf commonswiki_20171120_files_description.tar.gz commonswiki_20171120_files_description.tsv
Code example #20
					for n, sec in enumerate(sections):
						if n + 1 < len(sections):
							secText = text[sections[n][1]:sections[n + 1][0]]
						else:
							secText = text[sections[n][1]:]  # for the last section
						#print(sec)
						sectionContent[sections[n][2]] = Counter(secText.lower().split())
					idf = []
					for sec, words in sectionContent.items():
						idf.extend(words.keys())
					idf = Counter(idf)
					weighted = {}
					for sec, content in sectionContent.items():
						tfIdfSec = tfIdf(content, idf, sum(content.values()), N)
						weighted[sec] = fasttextDistanceTfIdf(tfIdfSec, lang_dictionary)

					yield wikidatadict[page.title], weighted
				#except:
					#pass

	f = open('multiLanguageFromDumpsFastextarticlesInSixLang/sections-articles_%s.json' % lang,'w')
#	writer =  csv.writer(f,delimiter='\t')
	for result in mwxml.map(process_dump, paths, threads=168):
		try:
			q = result[0]
			secs = result[1]
			for sec,vector in secs.items():
				if not isinstance(vector,int):
					f.write(json.dumps([q,sec,vector.tolist()])+'\n')
		except Exception as e:
			print('error: ', e)
Code example #21
def run(paths, session, start, end, revert_radius, revert_window,
        reverted_only, trusted_users, trusted_edits, rev_reverteds,
        check_blocked, verbose=False):

    def process_dump(dump, path):
        for page in dump:
            detector = mwreverts.Detector(radius=revert_radius)
            window = deque(maxlen=revert_radius)
            for revision in page:
                revision.maybe_damaging = None
                revision.reason = None
                revert = detector.process(revision.sha1, revision)

                if start and revision.timestamp < start:
                    continue
                if end and revision.timestamp > end:
                    continue
                window.append(revision)

                if revert is not None:
                    # A revert!
                    for reverted in revert.reverteds:
                        if (revision.timestamp -
                            reverted.timestamp) <= revert_window and \
                           reverted.user is not None and \
                           revision.user is not None and \
                           reverted.user.text != revision.user.text and \
                           reverted.maybe_damaging is not False:
                            # Happened within the window
                            # wasn't a self revert and hasn't
                            # already been marked good.
                            reverted.maybe_damaging = True
                            reverted.reason = "Reverted by someone else"

                    if revert.reverted_to.maybe_damaging and \
                       revert.reverted_to.user.text != revision.user.text:
                        # Reverted back by someone else.  Mark it good
                        # again.
                        revert.reverted_to.maybe_damaging = False
                        revert.reverted_to.reason = "Reverted back by " + \
                                                    "someone else"

                # Get user info
                load_user_data = trusted_edits or check_blocked
                if revision.user.id is not None and revision.user.id > 0 and \
                        load_user_data:
                    info = load_user_info(revision.user.text, session)
                else:
                    info = User(revision.user.id, 0, set())

                two_days_later = revision.timestamp + (60 * 60 * 24 * 2)
                if trusted_users and info.id in trusted_users:
                    revision.maybe_damaging = False
                    revision.reason = "In trusted group"
                elif check_blocked and user_recently_blocked(
                        revision.user.text, session, two_days_later):
                    # User was blocked. Edits may be damaging!
                    revision.maybe_damaging = True
                    revision.reason = "User was blocked from editing"
                elif trusted_edits and info.editcount >= trusted_edits:
                    revision.maybe_damaging = False
                    revision.reason = "Enough edits to be trusted"
                else:
                    revision.reason = "Unknown"

                if len(window) == revert_radius:
                    old_revision = window.popleft()
                    yield (old_revision.id, old_revision.maybe_damaging,
                           old_revision.reason)

            for old_revision in window:
                yield (old_revision.id, old_revision.maybe_damaging,
                       old_revision.reason)

    for rev_id, maybe_damaging, reason in mwxml.map(process_dump, paths):
        rev_reverteds.write([rev_id, bool(maybe_damaging), reason])
        if maybe_damaging:
            if reason and "Reverted" in reason:
                if verbose:
                    sys.stderr.write("r")
                    sys.stderr.flush()
            elif reason and "blocked" in reason:
                if verbose:
                    sys.stderr.write("b")
                    sys.stderr.flush()
            else:  # "Unknown" in reason:
                if verbose:
                    sys.stderr.write(".")
                    sys.stderr.flush()
        else:
            if reason and "edits" in reason:
                if verbose:
                    sys.stderr.write("e")
                    sys.stderr.flush()
            elif reason and "Unknown" in reason:
                if verbose:
                    sys.stderr.write(".")
                    sys.stderr.flush()
            else:  # "group" in reason:
                if verbose:
                    sys.stderr.write("g")
                    sys.stderr.flush()

    if verbose:
        sys.stderr.write("\n")
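
Note: examples #15 and #21 build `User(revision.user.id, 0, set())` and later read `info.id` and `info.editcount`, so `User` is presumably a small record type. A plausible definition (an assumption, not confirmed by the snippets):

from collections import namedtuple

# Fields inferred from usage: info.id, info.editcount, and a set of groups.
User = namedtuple("User", ["id", "editcount", "groups"])
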