def bot_gen(rev_pages, language, api_url):
    """Yield an Edit for each (rev_id, page_id) pair, flagging reverted edits.

    For every revision, queries the wiki API to (1) check whether the
    revision was reverted within a radius of 3 edits and (2) extract the
    words added by the revision.  Progress dots and per-revision errors go
    to stderr; a revision that raises is logged and skipped so one bad
    revision does not abort the whole run.

    :param rev_pages: iterable of (rev_id, page_id) tuples
    :param language: language utility handed to APIExtractor
    :param api_url: MediaWiki API endpoint URL
    """
    session = api.Session(api_url)
    extractor = APIExtractor(session, language=language)
    for rev_id, page_id in rev_pages:
        sys.stderr.write(".")
        sys.stderr.flush()
        try:
            # Detect reverted status
            revert = reverts.api.check(session, rev_id, page_id, radius=3)
            reverted = revert is not None

            added_words = list(
                extractor.extract(rev_id, [diff.added_words]))[0]

            yield Edit(rev_id, added_words, reverted)
        except KeyboardInterrupt:
            sys.stderr.write("\n^C Caught. Exiting...")
            break
        except Exception:
            # Was a bare `except:`, which also swallows SystemExit and
            # GeneratorExit; narrowed to Exception while keeping the
            # log-and-continue behavior.
            sys.stderr.write(traceback.format_exc())
            sys.stderr.write("\n")

    sys.stderr.write("\n")
def bot_gen(rev_pages, language, api_url):
    """Generate Edit records for the given revisions, marking reverts.

    Each (rev_id, page_id) is checked against the API for a revert within
    radius 3, and the revision's added words are extracted.  A "." is
    written to stderr per revision as a progress indicator.  Failures for a
    single revision are logged to stderr and the revision is skipped.

    :param rev_pages: iterable of (rev_id, page_id) tuples
    :param language: language utility handed to APIExtractor
    :param api_url: MediaWiki API endpoint URL
    """
    session = api.Session(api_url)
    extractor = APIExtractor(session, language=language)
    for rev_id, page_id in rev_pages:
        sys.stderr.write(".")
        sys.stderr.flush()
        try:
            # Detect reverted status
            revert = reverts.api.check(session, rev_id, page_id, radius=3)
            reverted = revert is not None

            added_words = list(extractor.extract(rev_id, [diff.added_words]))[0]

            yield Edit(rev_id, added_words, reverted)
        except KeyboardInterrupt:
            sys.stderr.write("\n^C Caught. Exiting...")
            break
        except Exception:
            # Fix: the original bare `except:` would also trap SystemExit /
            # GeneratorExit.  Exception preserves the intended
            # log-and-skip behavior for ordinary errors only.
            sys.stderr.write(traceback.format_exc())
            sys.stderr.write("\n")

    sys.stderr.write("\n")
# Demo: score one English Wikipedia revision with a pre-trained
# "damaging" model.
import mwapi
from revscoring import ScorerModel
from revscoring.extractors import APIExtractor

# Load the serialized linear-SVC model from disk.
with open("models/enwiki.damaging.linear_svc.model") as f:
    scorer_model = ScorerModel.load(f)

# API-backed extractor for the features the model was trained on.
session = mwapi.Session(host="https://en.wikipedia.org", user_agent="revscoring demo")
extractor = APIExtractor(session)

# Fetch the model's feature values for the revision, then score it.
feature_values = extractor.extract(123456789, scorer_model.features)
score = scorer_model.score(feature_values)
print(score)
# Demo script: compute a damaging-model score for a single revision of
# English Wikipedia.
import mwapi
from revscoring import ScorerModel
from revscoring.extractors import APIExtractor

# Deserialize the trained model.
with open("models/enwiki.damaging.linear_svc.model") as f:
    scorer_model = ScorerModel.load(f)

# Extractor that pulls feature values through the MediaWiki API.
extractor = APIExtractor(
    mwapi.Session(host="https://en.wikipedia.org",
                  user_agent="revscoring demo"))

# Extract exactly the features the model expects and print its score.
feature_values = extractor.extract(123456789, scorer_model.features)
print(scorer_model.score(feature_values))
count += 1 else: break """ Feature to examine. Let FEATURE be one of diff.added_tokens diff.removed_tokens diff.added_segments diff.removed_segments revision.content revision.content_tokens """ FEATURE = diff.added_segments # extract data from selected revisions and write to selected file extr = APIExtractor(mwapi.Session("https://en.wikipedia.org")) for id in rev_ids: data = extr.extract(id, FEATURE) dump_target.write("\n\nBeginning %s of revision %d\n\n" % (FEATURE, id)) if type(data) is str: dump_target.write(data) elif type(data) is list: dump_target.writelines(data) else: print("Unknown Type") exit() dump_target.write("\n\nEnd %s of revision %d" % (FEATURE, id))
revision.day_of_week,
revision.has_custom_comment,
revision.has_section_comment,
revision.hour_of_day,
revision.image_links,
revision.infobox_templates,
revision.content_chars,
revision.infonoise,
revision.internal_links,
# Heading counts by level (== through ======).
revision.level_1_headings,
revision.level_2_headings,
revision.level_3_headings,
revision.level_4_headings,
revision.level_5_headings,
revision.level_6_headings,
revision.markup_chars,
revision.misspellings,
revision.numeric_chars,
# Proportions normalize the raw counts above by content size.
revision.proportion_of_badwords,
revision.proportion_of_markup_chars,
revision.proportion_of_misspellings,
revision.proportion_of_numeric_chars,
revision.proportion_of_symbolic_chars,
revision.proportion_of_templated_references,
revision.proportion_of_uppercase_chars,
revision.ref_tags,
revision.symbolic_chars,
revision.templates,
revision.uppercase_chars,
revision.words,
# User-level signals for the revision's author.
user.age,
user.is_anon,
user.is_bot]

# Announce how many features are being extracted for the pt.wikipedia diff.
print("Extracting {0} features for ".format(len(features)) +
      "https://pt.wikipedia.org/w/index.php?diff=4083720")

# One API-backed extraction for all features of this revision.
values = api_extractor.extract(4083720, features)

for feature, value in zip(features, values):
    print("{0}: {1}".format(feature, value))
    # NOTE(review): indentation reconstructed from a collapsed source line;
    # flush is assumed to be inside the loop (streamed output) — confirm.
    sys.stdout.flush()
# Demo: extract a handful of revision/diff/user features for one
# English Wikipedia revision and print them.
from mw.api import Session
from revscoring.extractors import APIExtractor
from revscoring.features import diff, parent_revision, revision, user

# Extractor backed by the live English Wikipedia API.
session = Session("https://en.wikipedia.org/w/api.php")
api_extractor = APIExtractor(session)

# Features to request for the revision.
features = [
    revision.day_of_week,
    revision.hour_of_day,
    revision.has_custom_comment,
    diff.bytes_changed,
    diff.chars_added,
    user.age,
    user.is_anon,
    user.is_bot,
]

rev_id = 624577024
values = api_extractor.extract(rev_id, features)

# Print each feature alongside its extracted value.
for feature, value in zip(features, values):
    print("{0}: {1}".format(feature, value))
# Parent-revision features: measured on the revision being edited.
parent_revision.proportion_of_uppercase_chars,
parent_revision.revision_bytes,
parent_revision.seconds_since,
parent_revision.symbolic_chars,
parent_revision.uppercase_chars,
parent_revision.was_same_user,
parent_revision.words,
previous_user_revision.seconds_since,
# Current-revision content features.
revision.badwords,
revision.bytes,
revision.category_links,
revision.chars,
revision.cite_templates,
revision.day_of_week,
revision.has_custom_comment,
revision.has_section_comment,
revision.hour_of_day,
revision.image_links,
revision.infobox_templates,
revision.content_chars,
revision.infonoise,
revision.internal_links,
# Heading counts by level (== through ======).
revision.level_1_headings,
revision.level_2_headings,
revision.level_3_headings,
revision.level_4_headings,
revision.level_5_headings,
revision.level_6_headings,
revision.markup_chars,
revision.misspellings,
revision.numeric_chars,
# Proportions normalize the raw counts above by content size.
revision.proportion_of_badwords,
revision.proportion_of_markup_chars,
revision.proportion_of_misspellings,
revision.proportion_of_numeric_chars,
revision.proportion_of_symbolic_chars,
revision.proportion_of_templated_references,
revision.proportion_of_uppercase_chars,
revision.ref_tags,
revision.symbolic_chars,
revision.templates,
revision.uppercase_chars,
revision.words,
# User-level signals for the revision's author.
user.age,
user.is_anon,
user.is_bot
]

# Announce how many features are being extracted for the pt.wikipedia diff.
print("Extracting {0} features for ".format(len(features)) +
      "https://pt.wikipedia.org/w/index.php?diff=4083720")

# One API-backed extraction for all features of this revision.
values = api_extractor.extract(4083720, features)

for feature, value in zip(features, values):
    print("{0}: {1}".format(feature, value))
    # NOTE(review): indentation reconstructed from a collapsed source line;
    # flush is assumed to be inside the loop (streamed output) — confirm.
    sys.stdout.flush()