def bot_gen(rev_pages, language, api_url):
    """
    Generate one `Edit` per revision, reporting progress on stderr.

    For each `(rev_id, page_id)` pair, the revision's reverted status is
    looked up with `reverts.api.check` (radius of 3) and the
    `diff.added_words` feature is extracted through an `APIExtractor`.

    A revision whose processing raises an ordinary exception is skipped
    after its traceback is written to stderr.  A `KeyboardInterrupt`
    stops the generation early.

    :Parameters:
        rev_pages : iterable
            Pairs of `(rev_id, page_id)` to process.
        language
            Forwarded to `APIExtractor` as its `language` argument.
        api_url
            Forwarded to `api.Session`.

    :Returns:
        A generator of `Edit(rev_id, added_words, reverted)` where
        `reverted` is True when `reverts.api.check` returned something
        other than None.
    """
    session = api.Session(api_url)
    extractor = APIExtractor(session, language=language)

    for rev_id, page_id in rev_pages:
        # One dot per revision as a progress indicator.
        sys.stderr.write(".")
        sys.stderr.flush()
        try:
            # Detect reverted status
            revert = reverts.api.check(session, rev_id, page_id, radius=3)
            reverted = revert is not None

            # Only one feature is requested, so keep the first (and only)
            # extracted value.
            added_words = list(
                extractor.extract(rev_id, [diff.added_words]))[0]
            yield Edit(rev_id, added_words, reverted)

        except KeyboardInterrupt:
            sys.stderr.write("\n^C Caught.  Exiting...")
            break

        except Exception:
            # Deliberately not a bare `except:`.  The `yield` above sits
            # inside this `try`, so a bare clause would also swallow the
            # GeneratorExit raised by `close()` (and SystemExit), resume
            # the loop and yield again, which is an error for a generator
            # that is being closed.
            sys.stderr.write(traceback.format_exc())
            sys.stderr.write("\n")

    sys.stderr.write("\n")
def bot_gen(rev_pages, language, api_url):
    """
    Yield an `Edit` for every revision in `rev_pages`.

    Each `(rev_id, page_id)` pair is checked for a revert via
    `reverts.api.check` (with `radius=3`), and the `diff.added_words`
    feature is extracted for it with an `APIExtractor`.  A "." is written
    to stderr per revision as progress output.

    Ordinary errors while handling a revision are reported on stderr
    (full traceback) and that revision is skipped.  `KeyboardInterrupt`
    ends the generation early.

    :Parameters:
        rev_pages : iterable
            `(rev_id, page_id)` pairs.
        language
            Passed through to `APIExtractor(..., language=language)`.
        api_url
            Passed through to `api.Session`.

    :Returns:
        A generator of `Edit(rev_id, added_words, reverted)`; `reverted`
        is True when the revert check returned a non-None result.
    """
    session = api.Session(api_url)
    extractor = APIExtractor(session, language=language)

    for rev_id, page_id in rev_pages:
        sys.stderr.write(".")
        sys.stderr.flush()
        try:
            # Detect reverted status
            revert = reverts.api.check(session, rev_id, page_id, radius=3)
            reverted = revert is not None

            # A single feature is requested; take the first extracted value.
            added_words = list(extractor.extract(rev_id,
                                                 [diff.added_words]))[0]
            yield Edit(rev_id, added_words, reverted)

        except KeyboardInterrupt:
            sys.stderr.write("\n^C Caught.  Exiting...")
            break

        except Exception:
            # `Exception` rather than a bare `except:` so that
            # GeneratorExit (raised at the `yield` when the generator is
            # closed) and SystemExit propagate instead of being logged
            # and ignored.
            sys.stderr.write(traceback.format_exc())
            sys.stderr.write("\n")

    sys.stderr.write("\n")
Example #3
0
import mwapi
from revscoring import ScorerModel
from revscoring.extractors import APIExtractor

# Load the scorer model from its file.
with open("models/enwiki.damaging.linear_svc.model") as model_file:
    scorer_model = ScorerModel.load(model_file)

# Build the extractor on top of an explicit API session.
session = mwapi.Session(host="https://en.wikipedia.org",
                        user_agent="revscoring demo")
extractor = APIExtractor(session)

# Extract the model's features for one revision, then print its score.
feature_values = extractor.extract(123456789, scorer_model.features)
print(scorer_model.score(feature_values))
Example #4
0
import mwapi
from revscoring import ScorerModel
from revscoring.extractors import APIExtractor

# Read the scorer model from disk.
with open("models/enwiki.damaging.linear_svc.model") as model_file:
    scorer_model = ScorerModel.load(model_file)

# The extractor talks to the wiki through this session.
session = mwapi.Session(
    host="https://en.wikipedia.org",
    user_agent="revscoring demo",
)
extractor = APIExtractor(session)

# Feature values for revision 123456789, in the order the model expects.
feature_values = extractor.extract(123456789, scorer_model.features)

print(scorer_model.score(feature_values))
Example #5
0
        count += 1
    else:
        break

"""
Feature to examine.  Let FEATURE be one of
diff.added_tokens
diff.removed_tokens
diff.added_segments
diff.removed_segments
revision.content
revision.content_tokens
"""
FEATURE = diff.added_segments


# extract data from selected revisions and write to selected file
extr = APIExtractor(mwapi.Session("https://en.wikipedia.org"))
# `rev_id` rather than `id`, so the builtin `id` is not shadowed.
for rev_id in rev_ids:

    data = extr.extract(rev_id, FEATURE)
    dump_target.write(
        "\n\nBeginning %s of revision %d\n\n" % (FEATURE, rev_id))
    # isinstance() also accepts subclasses of str / list, which an exact
    # `type(data) is ...` comparison would reject as "Unknown Type".
    if isinstance(data, str):
        dump_target.write(data)
    elif isinstance(data, list):
        dump_target.writelines(data)
    else:
        print("Unknown Type")
        # Abort with a non-zero status: the dump is incomplete at this
        # point.  SystemExit is raised directly because the `exit()`
        # helper is installed by the `site` module for interactive use
        # and is not guaranteed to exist in every run mode.
        raise SystemExit(1)
    dump_target.write("\n\nEnd %s of revision %d" % (FEATURE, rev_id))
Example #6
0
            revision.day_of_week, revision.has_custom_comment,
            revision.has_section_comment, revision.hour_of_day,
            revision.image_links, revision.infobox_templates,
            revision.content_chars,
            revision.infonoise, revision.internal_links,
            revision.level_1_headings,
            revision.level_2_headings,
            revision.level_3_headings,
            revision.level_4_headings,
            revision.level_5_headings,
            revision.level_6_headings,
            revision.markup_chars, revision.misspellings,
            revision.numeric_chars, revision.proportion_of_badwords,
            revision.proportion_of_markup_chars,
            revision.proportion_of_misspellings,
            revision.proportion_of_numeric_chars,
            revision.proportion_of_symbolic_chars,
            revision.proportion_of_templated_references,
            revision.proportion_of_uppercase_chars,
            revision.ref_tags, revision.symbolic_chars,
            revision.templates, revision.uppercase_chars, revision.words,
            user.age, user.is_anon, user.is_bot]

# Announce how many features are about to be extracted, and for which diff.
diff_url = "https://pt.wikipedia.org/w/index.php?diff=4083720"
print("Extracting {0} features for ".format(len(features)) + diff_url)

values = api_extractor.extract(4083720, features)

# Print one "feature: value" line per feature, flushing after each line
# so output appears as soon as each value is available.
for pair in zip(features, values):
    print("{0}: {1}".format(*pair))
    sys.stdout.flush()
Example #7
0
from mw.api import Session
from revscoring.extractors import APIExtractor
from revscoring.features import diff, parent_revision, revision, user

# Extractor backed by a session on the English Wikipedia API endpoint.
session = Session("https://en.wikipedia.org/w/api.php")
api_extractor = APIExtractor(session)

# Features to extract, grouped by the module they come from.
features = [
    revision.day_of_week, revision.hour_of_day, revision.has_custom_comment,
    diff.bytes_changed, diff.chars_added,
    user.age, user.is_anon, user.is_bot,
]

values = api_extractor.extract(624577024, features)

# Print each feature next to its extracted value.
for pair in zip(features, values):
    print("{0}: {1}".format(*pair))
Example #8
0
    parent_revision.proportion_of_uppercase_chars,
    parent_revision.revision_bytes, parent_revision.seconds_since,
    parent_revision.symbolic_chars, parent_revision.uppercase_chars,
    parent_revision.was_same_user, parent_revision.words,
    previous_user_revision.seconds_since, revision.badwords, revision.bytes,
    revision.category_links, revision.chars, revision.cite_templates,
    revision.day_of_week, revision.has_custom_comment,
    revision.has_section_comment, revision.hour_of_day, revision.image_links,
    revision.infobox_templates, revision.content_chars, revision.infonoise,
    revision.internal_links, revision.level_1_headings,
    revision.level_2_headings, revision.level_3_headings,
    revision.level_4_headings, revision.level_5_headings,
    revision.level_6_headings, revision.markup_chars, revision.misspellings,
    revision.numeric_chars, revision.proportion_of_badwords,
    revision.proportion_of_markup_chars, revision.proportion_of_misspellings,
    revision.proportion_of_numeric_chars,
    revision.proportion_of_symbolic_chars,
    revision.proportion_of_templated_references,
    revision.proportion_of_uppercase_chars, revision.ref_tags,
    revision.symbolic_chars, revision.templates, revision.uppercase_chars,
    revision.words, user.age, user.is_anon, user.is_bot
]

# Report the number of requested features and the diff being examined.
header = "Extracting {0} features for ".format(len(features))
print(header + "https://pt.wikipedia.org/w/index.php?diff=4083720")

values = api_extractor.extract(4083720, features)

# Emit "feature: value" lines one at a time, flushing stdout after each.
line_format = "{0}: {1}"
for feature, value in zip(features, values):
    print(line_format.format(feature, value))
    sys.stdout.flush()