def extract(dump_files, extractors=ALL_EXTRACTORS):
    """
    Extracts cites from a set of `dump_files`.

    :Parameters:
        dump_files : str | `file`
            A set of MediaWiki XML dump files
            (expects: pages-meta-history)
        extractors : `list`(`extractor`)
            A list of extractors to apply to the text

    :Returns:
        `iterable` -- a generator of extracted cites
    """
    # Dump processor function
    def process_dump(dump, path):
        for page in dump:
            if page.namespace != 0:
                continue
            else:
                for cite in extract_cite_history(page, extractors):
                    yield cite

    # Map call
    return mwxml.map(process_dump, dump_files)
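# Usage sketch (an assumption, not part of the original module): `extract`
# returns a lazy generator, so cites can be streamed straight to stdout. The
# dump file name below is a placeholder and `extract` is assumed to be
# importable from the module defined above.
import sys

for cite in extract(["enwiki-latest-pages-meta-history1.xml.bz2"]):
    # each `cite` is whatever extract_cite_history yields; a flat tuple of
    # values is assumed here
    sys.stdout.write("\t".join(map(str, cite)) + "\n")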
def analyse_dumps(dumps, cohorts, pages, threads=None):
    results = _init_cohort_contribs(cohorts)
    _partial = partial(_analyse_single_dump, cohorts=cohorts, pages=pages)
    for sub_res in mwxml.map(_partial, dumps, threads=threads):
        for cohort in sub_res:
            results[cohort].update(sub_res[cohort])
    return results
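# A minimal sketch (assumption) of the _init_cohort_contribs helper the snippet
# above relies on: one Counter per cohort, so that `results[cohort].update(...)`
# accumulates per-cohort contribution counts across all mapped dumps.
from collections import Counter


def _init_cohort_contribs(cohorts):
    # cohorts is assumed to be an iterable of cohort identifiers
    return {cohort: Counter() for cohort in cohorts}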
def run(paths, model, sunset, score_at, rev_scores, skip_scores_before,
        processes, verbose=False):
    if score_at == "revision":
        process_dump = revision_scores(model, sunset, skip_scores_before)
    elif score_at == "latest":
        process_dump = latest_scores(model, sunset, skip_scores_before)
    else:
        sunset_year = int(sunset.strftime("%Y"))
        if score_at == "monthly":
            dates = chain(*(zip([year] * 12, MONTHS)
                            for year in range(START_YEAR, sunset_year + 1)))
            thresholds = [mwtypes.Timestamp(str(year) + month + "01000000")
                          for year, month in dates]
        elif score_at == "biannually":
            dates = chain(*(zip([year] * 2, ["01", "07"])
                            for year in range(START_YEAR, sunset_year + 1)))
            thresholds = [mwtypes.Timestamp(str(year) + month + "01000000")
                          for year, month in dates]
        elif score_at == "annually":
            thresholds = [mwtypes.Timestamp(str(year) + "0101000000")
                          for year in range(START_YEAR, sunset_year + 1)]
        else:
            raise RuntimeError(
                "{0} is not a valid 'score_at' value".format(score_at))

        process_dump = threshold_scores(model, sunset, skip_scores_before,
                                        thresholds)

    results = mwxml.map(process_dump, paths, threads=processes)
    for page_id, title, rev_id, timestamp, (e, score) in results:
        if e is not None:
            logger.error("Error while processing {0}({1}) @ {2}: {3}".format(
                title, page_id, rev_id, e))
            continue
        weighted_sum = sum(CLASS_WEIGHTS[cls] * score['probability'][cls]
                           for cls in score['probability'])
        rev_scores.write([page_id, title, rev_id, timestamp.short_format(),
                          score['prediction'], weighted_sum])
        if verbose:
            sys.stderr.write(score['prediction'] + " ")
            sys.stderr.flush()

    if verbose:
        sys.stderr.write("\n")
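# Illustration (not from the original script): what the "annually" branch above
# produces for START_YEAR = 2001 and a sunset in 2003. The constants here are
# placeholders; the real ones are defined elsewhere in the script.
import mwtypes

START_YEAR = 2001
sunset_year = 2003
thresholds = [mwtypes.Timestamp(str(year) + "0101000000")
              for year in range(START_YEAR, sunset_year + 1)]
# thresholds now holds timestamps for 2001-01-01, 2002-01-01 and 2003-01-01,
# i.e. one scoring cut-off per year up to and including the sunset year.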
def run(dump_files):

    def process_dump(dump, path):
        for page in dump:
            if page.namespace != 3:
                continue
            last_templates = defaultdict(lambda: 0)
            for revision in page:
                if revision.text is None:
                    continue
                current_templates = defaultdict(lambda: 0)
                for template in subst_templates.extract(revision.text):
                    current_templates[template] += 1
                for template, count in current_templates.items():
                    diff = current_templates[template] - \
                        last_templates[template]
                    if diff > 0:
                        yield (page.id, page.namespace, page.title,
                               revision.id, revision.timestamp,
                               revision.comment, template, diff)
                # Carry this revision's counts forward so the next revision
                # is diffed against it
                last_templates = current_templates

    writer = mysqltsv.Writer(sys.stdout, headers=HEADERS)
    for values in mwxml.map(process_dump, dump_files):
        sys.stderr.write(".")
        writer.write(values)
    sys.stderr.write("\n")
def run(dump_paths, threads, output, verbose=False, extractor=None):
    if len(dump_paths) == 0:
        label_events = extract_labelings(mwxml.Dump.from_file(sys.stdin),
                                         extractor, verbose=verbose)
    else:
        label_events = mwxml.map(
            lambda d, p: extract_labelings(d, extractor, verbose),
            dump_paths, threads=threads)

    for labeling in label_events:
        dump_observation(labeling, output)
def run(dump_paths, threads, output, verbose=False):
    if len(dump_paths) == 0:
        user_edits = extract_rev_data(mwxml.Dump.from_file(sys.stdin),
                                      verbose=verbose)
    else:
        user_edits = mwxml.map(lambda d, p: extract_rev_data(d, verbose),
                               dump_paths, threads=threads)

    for edit in user_edits:
        json.dump(edit, output)
        output.write("\n")
def run(page_periods, scorer_model, dump_paths):
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s'
    )
    writer = mysqltsv.Writer(sys.stdout, headers=HEADERS)

    def process_dump(dump, path):
        for page in dump:
            if page.namespace != 0 or page.id not in page_periods:
                continue
            else:
                start_id, end_id = page_periods[page.id]
                pre_period_revision = None
                for revision in page:
                    if revision.id < start_id:
                        pre_period_revision = revision
                    if revision.id == end_id:
                        if pre_period_revision is not None:
                            start_text = pre_period_revision.text
                            start_id = pre_period_revision.id
                        else:
                            start_text = ""
                            start_id = None
                        start_score = generate_score(scorer_model, start_text)
                        end_score = generate_score(scorer_model, revision.text)
                        yield (page.id, start_id,
                               start_score['prediction'],
                               weighted_sum(start_score),
                               revision.id,
                               end_score['prediction'],
                               weighted_sum(end_score))
                        break

    for values in mwxml.map(process_dump, dump_paths):
        writer.write(values)
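# Sketch (assumption) of the weighted_sum helper used above. It mirrors the
# inline computation in the other scoring scripts in this collection; the
# CLASS_WEIGHTS mapping shown here is a placeholder for the real one.
CLASS_WEIGHTS = {'Stub': 0, 'Start': 1, 'C': 2, 'B': 3, 'GA': 4, 'FA': 5}


def weighted_sum(score):
    # score is a result of the form {'prediction': ..., 'probability': {cls: p}}
    return sum(CLASS_WEIGHTS[cls] * score['probability'][cls]
               for cls in score['probability'])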
def run(dump_paths, page_labelings, output, threads, verbose=False):
    logging.basicConfig(
        level=logging.DEBUG if verbose else logging.WARNING,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s'
    )

    if len(dump_paths) == 0:
        labelings = extract_text(mwxml.Dump.from_file(sys.stdin),
                                 page_labelings, verbose=verbose)
    else:
        labelings = mwxml.map(
            lambda d, p: extract_text(d, page_labelings, verbose),
            dump_paths, threads=threads)

    for labeling in labelings:
        dump_observation(labeling, output)
def run(dump_paths, threads, output, verbose=False, extractor=None):
    logging.basicConfig(
        level=logging.DEBUG if verbose else logging.WARNING,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s'
    )

    if len(dump_paths) == 0:
        label_events = dump2labels(mwxml.Dump.from_file(sys.stdin),
                                   extractor, verbose=verbose)
    else:
        label_events = mwxml.map(
            lambda d, p: dump2labels(d, extractor, verbose),
            dump_paths, threads=threads)

    for labeling in label_events:
        json.dump(labeling, output)
        output.write("\n")
def run(dump_files):

    def process_dump(dump, path):
        for page in dump:
            if page.namespace != 3:
                continue
            last_links = defaultdict(lambda: 0)
            for revision in page:
                if revision.text is None:
                    continue
                current_links = defaultdict(lambda: 0)
                for link in teahouse_links.extract(revision.text):
                    current_links[link] += 1
                for link, count in current_links.items():
                    diff = current_links[link] - last_links[link]
                    if diff > 0:
                        yield (page.id, page.namespace, page.title,
                               revision.id, revision.timestamp,
                               revision.comment, link, diff)
                last_links = current_links

    writer = mysqltsv.Writer(sys.stdout, headers=HEADERS)
    for values in mwxml.map(process_dump, dump_files):
        sys.stderr.write(".")
        writer.write(values)
    sys.stderr.write("\n")
def parse(src, dst):
    files = glob.glob(src)
    with open(dst, 'w', buffering=1000) as f:
        for r in mwxml.map(map_, files):
            f.write('\t'.join(map(str, r)) + '\n')
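# A minimal sketch (assumption) of the `map_` processor that `parse` above
# hands to mwxml.map; like the other processors in this collection it receives
# a dump object and its path and yields one record per item of interest.
def map_(dump, path):
    for page in dump:
        for revision in page:
            yield page.id, page.title, revision.id, revision.timestamp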
        if page.redirect:
            page_title_rd = normalise_title(page.redirect)
        else:
            page_title_rd = None
        yield page_id, page_title, page_title_rd
    print("Done processing path:", path)


# We get two dictionaries:
#   pageids   = {page_title: page_id}        # only for non-redirect pages
#   redirects = {page_title: page_title_rd}  # only for redirect pages, where
#                                            # page_title_rd is the redirected-to page title
print('1st pass: getting pageids and redirects tables')
pageids = {}
redirects = {}
pbar = tqdm()
for page_id, page_title, page_title_rd in mwxml.map(
        process_dump_pageids_redirects, paths, threads=threads):
    if page_title_rd is None:
        pageids[page_title] = page_id
    elif len(page_title_rd) > 0:
        redirects[page_title] = page_title_rd
    else:
        pass
    pbar.update(1)
pbar.close()
print('extraction done.')
print("Number of pages", len(pageids))
print("Number of redirects", len(redirects))

################################################
# Dump iteration on each page to get links and anchors
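# Sketch (assumption) of the normalise_title helper used above: MediaWiki
# titles treat spaces and underscores interchangeably and are case-insensitive
# only in the first character, so a common normalisation looks like this.
def normalise_title(title):
    title = title.strip().replace(' ', '_')
    if title:
        title = title[0].upper() + title[1:]
    return title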
def run(paths, session, start, end, revert_radius, revert_window,
        reverted_only, trusted_users, trusted_edits, rev_reverteds,
        check_blocked, verbose=False):

    def process_dump(dump, path):
        for page in dump:
            detector = mwreverts.Detector(radius=revert_radius)
            window = deque(maxlen=revert_radius)

            for revision in page:
                revision.maybe_damaging = None
                revision.reason = None

                revert = detector.process(revision.sha1, revision)
                if start and revision.timestamp < start:
                    continue
                if end and revision.timestamp > end:
                    continue

                window.append(revision)

                if revert is not None:
                    # A revert!
                    for reverted in revert.reverteds:
                        if (revision.timestamp -
                            reverted.timestamp) <= revert_window and \
                           reverted.user is not None and \
                           revision.user is not None and \
                           reverted.user.text != revision.user.text and \
                           reverted.maybe_damaging is not False:
                            # Happened within the window, wasn't a self revert
                            # and hasn't already been marked good.
                            reverted.maybe_damaging = True
                            reverted.reason = "Reverted by someone else"

                    if revert.reverted_to.maybe_damaging and \
                       revert.reverted_to.user.text != revision.user.text:
                        # Reverted back by someone else. Mark it good again.
                        revert.reverted_to.maybe_damaging = False
                        revert.reverted_to.reason = "Reverted back by " + \
                                                    "someone else"

                # Get user info
                load_user_data = trusted_edits or check_blocked
                if revision.user.id is not None and revision.user.id > 0 and \
                   load_user_data:
                    info = load_user_info(revision.user.text, session)
                else:
                    info = User(revision.user.id, 0, set())

                two_days_later = revision.timestamp + (60 * 60 * 24 * 2)
                if trusted_users and info.id in trusted_users:
                    revision.maybe_damaging = False
                    revision.reason = "In trusted group"
                elif check_blocked and user_recently_blocked(
                        revision.user.text, session, two_days_later):
                    # User was blocked. Edits may be damaging!
                    revision.maybe_damaging = True
                    revision.reason = "User was blocked from editing"
                elif trusted_edits and info.editcount >= trusted_edits:
                    revision.maybe_damaging = False
                    revision.reason = "Enough edits to be trusted"
                else:
                    revision.reason = "Unknown"

                if len(window) == revert_radius:
                    old_revision = window.popleft()
                    yield (old_revision.id, old_revision.maybe_damaging,
                           old_revision.reason)

            for old_revision in window:
                yield (old_revision.id, old_revision.maybe_damaging,
                       old_revision.reason)

    for rev_id, maybe_damaging, reason in mwxml.map(process_dump, paths):
        rev_reverteds.write([rev_id, bool(maybe_damaging), reason])
        if maybe_damaging:
            if reason and "Reverted" in reason:
                if verbose:
                    sys.stderr.write("r")
                    sys.stderr.flush()
            elif reason and "blocked" in reason:
                if verbose:
                    sys.stderr.write("b")
                    sys.stderr.flush()
            else:  # "Unknown" in reason
                if verbose:
                    sys.stderr.write(".")
                    sys.stderr.flush()
        else:
            if reason and "edits" in reason:
                if verbose:
                    sys.stderr.write("e")
                    sys.stderr.flush()
            elif reason and "Unknown" in reason:
                if verbose:
                    sys.stderr.write(".")
                    sys.stderr.flush()
            else:  # "group" in reason
                if verbose:
                    sys.stderr.write("g")
                    sys.stderr.flush()

    if verbose:
        sys.stderr.write("\n")
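# Sketch (assumption) of the User record the snippet above constructs when it
# skips the lookup: the fields consumed later are `id`, `editcount` and a set
# of user groups; the field names here are placeholders.
from collections import namedtuple

User = namedtuple("User", ["id", "editcount", "groups"])

# e.g. the fallback used above when no user data is loaded:
fallback = User(None, 0, set())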
##################
# Global variables
# TODO: if necessary, parametrize the script to take these as input
count = 0
# LIMIT_SENTS = 200000  ## see above

# Storage array of sentences
wiki_links = []

##################
# mwxml parallelism
# This might apply to English only; maybe it's possible to split other dumps
# into part-files.
pbar = tqdm(total=LIMIT_SENTS)
for title, sentence in mwxml.map(process_dump, paths, threads=10):
    wiki_links.append((title, sentence))
    count += 1
    if count >= LIMIT_SENTS:
        break
    pbar.update(1)
pbar.close()
print('number of sentences extracted', len(wiki_links))

##################
# Store two files:
# - training: used by generate_training_data.py, because in the future we
#   might want to use the whole sentence.
# - test: this is effectively the data used by the backtesting protocol.
LIMIT_SENTS_SPLIT = len(wiki_links) // 2
print(LIMIT_SENTS_SPLIT)
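# Sketch (assumption) of the train/test split described in the comments above:
# the first half of the extracted sentences goes to the training file, the
# second half to the file used by the backtesting protocol. The file names are
# placeholders.
import csv

with open("sentences_train.csv", "w") as train_f:
    csv.writer(train_f).writerows(wiki_links[:LIMIT_SENTS_SPLIT])

with open("sentences_test.csv", "w") as test_f:
    csv.writer(test_f).writerows(wiki_links[LIMIT_SENTS_SPLIT:])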
def run(input_files, revisions_output_file, verbose):

    def process_pages(stub_file_dump_object, file_url):
        for stub_file_page in stub_file_dump_object:
            for stub_file_page_revision in stub_file_page:
                revision_comment = stub_file_page_revision.comment
                revision_user_id = "NULL"
                revision_user_text = "NULL"

                if revision_comment is None:
                    revision_comment = "NULL"

                if stub_file_page_revision.user is None:
                    logger.warning(
                        "No user. Fields will be NULL. Revision: {0}"
                        .format(stub_file_page_revision))
                    revision_user_id = "NULL"
                    revision_user_text = "NULL"
                elif stub_file_page_revision.user.id is None:
                    revision_user_id = "NULL"
                    revision_user_text = stub_file_page_revision.user.text
                else:
                    revision_user_id = stub_file_page_revision.user.id
                    revision_user_text = stub_file_page_revision.user.text

                timestamp = datetime.datetime.utcfromtimestamp(
                    stub_file_page_revision.timestamp)
                cast_timestamp = (str(timestamp.year).zfill(4) +
                                  str(timestamp.month).zfill(2) +
                                  str(timestamp.day).zfill(2) +
                                  str(timestamp.hour).zfill(2) +
                                  str(timestamp.minute).zfill(2) +
                                  str(timestamp.second).zfill(2))

                yield (stub_file_page_revision.page.title,
                       stub_file_page_revision.id,
                       revision_user_id,
                       revision_user_text,
                       revision_comment,
                       stub_file_page.namespace,
                       cast_timestamp)

    i = 0
    for title, revision_id, user_id, user_text, comment, namespace, timestamp \
            in mwxml.map(process_pages, input_files):
        i += 1
        revisions_output_file.write([title, revision_id, user_id, user_text,
                                     comment, namespace, timestamp])
        if verbose and i % 10000 == 0:
            sys.stderr.write("Revisions processed: {0}\n".format(i))
            sys.stderr.flush()

    if verbose:
        sys.stderr.write("Completed writing out result file\n")
        sys.stderr.flush()
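# Side note (a sketch, assuming the revision timestamps are mwtypes.Timestamp
# objects as elsewhere in this collection): the same "YYYYMMDDHHMMSS" string
# can be produced without the utcfromtimestamp round-trip and manual
# zero-padding.
import mwtypes

ts = mwtypes.Timestamp("2017-11-20T00:00:00Z")
cast_timestamp = ts.short_format()  # -> "20171120000000"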
##################
# Global variables
# TODO: if necessary, parametrize the script to take these as input
count = 0
LIMIT_SENTS = 200000
LIMIT_SENTS_SPLIT = LIMIT_SENTS // 2

# Storage array of sentences
wiki_links = []

##################
# mwxml parallelism
# This might apply to English only; maybe it's possible to split other dumps
# into part-files.
pbar = tqdm(total=LIMIT_SENTS)
for title, sentence in mwxml.map(process_dump, paths):
    wiki_links.append((title, sentence))
    count += 1
    if count >= LIMIT_SENTS:
        break
    pbar.update(1)
pbar.close()
print('number of sentences extracted', len(wiki_links))

##################
# Store two files:
# - training: used by generate_training_data.py, because in the future we
#   might want to use the whole sentence.
# - test: this is effectively the data used by the backtesting protocol.

# Store the sentences for training
with open("../../data/en/training/sentences_train.csv", "w") as f:
                        process_language(detect(
                            wikicode.strip_code().strip())))
            except:
                pass
        languages = list(set(languages))
        detected_languages = list(set(detected_languages))
        yield (page.id, wikitext_length, has_infobox, has_description_field,
               str(languages), str(detected_languages))


print("total files: " + str(number_of_files))

output = mysqltsv.Writer(
    open("data/sdoc/commonswiki_20171120_files_description.tsv", "w"),
    headers=["page_id", "wikitext_length", "has_infobox",
             "has_description_field", "languages", "detected_languages"])

for page_id, wikitext_length, has_infobox, has_description_field, \
        languages, detected_languages in mwxml.map(process_dump, dump_files):
    output.write([page_id, wikitext_length, has_infobox,
                  has_description_field, languages, detected_languages])

# Compress the TSV file:
# tar -czvf commonswiki_20171120_files_description.tar.gz commonswiki_20171120_files_description.tsv
            for n, sec in enumerate(sections):
                if n + 1 < len(sections):
                    secText = text[sections[n][1]:sections[n + 1][0]]
                else:
                    secText = text[sections[n][1]:]  # for the last section
                # print(sec)
                sectionContent[sections[n][2]] = Counter(secText.lower().split())

            idf = []
            for sec, words in sectionContent.items():
                idf.extend(words.keys())
            idf = Counter(idf)

            weighted = {}
            for sec, content in sectionContent.items():
                tfIdfSec = tfIdf(content, idf, sum(content.values()), N)
                weighted[sec] = fasttextDistanceTfIdf(tfIdfSec, lang_dictionary)

            yield wikidatadict[page.title], weighted
        # except:
        #     pass


f = open('multiLanguageFromDumpsFastextarticlesInSixLang/sections-articles_%s.json' % lang, 'w')
# writer = csv.writer(f, delimiter='\t')

for result in mwxml.map(process_dump, paths, threads=168):
    try:
        q = result[0]
        secs = result[1]
        for sec, vector in secs.items():
            if not isinstance(vector, int):
                f.write(json.dumps([q, sec, vector.tolist()]) + '\n')
    except Exception as e:
        print('error: ', e)