def test_log():
    """Exercise solving, a pickle round trip and the repr of a log feature."""
    feature = modifiers.log(5)

    # Solving the feature directly matches the plain logarithm.
    assert solve(feature) == math_log(5)

    # The feature solves to the same value after being pickled and restored.
    restored = pickle.loads(pickle.dumps(feature))
    assert solve(restored) == math_log(5)

    assert repr(feature) == "<feature.log(5)>"
def test_trim():
    """Check what ``trim`` yields for a feature, a list and an expression."""
    source = Datasource("derp1")
    plain = Feature("foobar1", returns=int)
    sourced = Feature("foobar2", returns=int, depends_on=[source])
    five = Constant(value=5)
    vector = FeatureVector("foobar3", returns=int, depends_on=[five])

    # A single feature trims to itself.
    trimmed_single = list(trim(plain))
    assert trimmed_single == [plain]

    # A list of features comes back unchanged and in the same order.
    trimmed_many = list(trim([plain, sourced, vector]))
    assert trimmed_many == [plain, sourced, vector]

    # A modifier expression trims down to the features it was built from.
    expression = log(max(plain - sourced, 1))
    assert list(trim(expression)) == [plain, sourced]
Exemple #3
0
def test_trim():
    """Verify ``trim`` on a lone feature, a feature list and an expression."""
    datasource = Datasource("derp1")
    first = Feature("foobar1", returns=int)
    second = Feature("foobar2", returns=int, depends_on=[datasource])
    constant = Constant(value=5)
    vector = FeatureVector("foobar3", returns=int, depends_on=[constant])

    # Trimming one feature yields just that feature.
    assert list(trim(first)) == [first]

    # Trimming a list of features yields them unchanged, in order.
    everything = [first, second, vector]
    assert list(trim(everything)) == [first, second, vector]

    # Trimming a modifier expression yields the features it is built from.
    wrapped = log(max(first - second, 1))
    assert list(trim(wrapped)) == [first, second]
Exemple #4
0
from revscoring.features import Feature, wikitext
from revscoring.features.modifiers import div, log, max, sub


def _process_new_longest(p_longest, r_longest):
    if r_longest > p_longest:
        return r_longest
    else:
        return 1


parent = [
    log(wikitext.revision.parent.chars + 1),
    log(wikitext.revision.parent.tokens + 1),
    log(wikitext.revision.parent.words + 1),
    log(wikitext.revision.parent.uppercase_words + 1),
    log(wikitext.revision.parent.headings + 1),
    log(wikitext.revision.parent.wikilinks + 1),
    log(wikitext.revision.parent.external_links + 1),
    log(wikitext.revision.parent.templates + 1),
    log(wikitext.revision.parent.ref_tags + 1),
    div(wikitext.revision.parent.chars,
        max(wikitext.revision.parent.words, 1),
        name="revision.parent.chars_per_word"),
    div(wikitext.revision.parent.words,
        max(wikitext.revision.parent.tokens, 1),
        name="revision.parent.words_per_token"),
    div(wikitext.revision.parent.uppercase_words,
        max(wikitext.revision.parent.words, 1),
        name="revision.parent.uppercase_words_per_word"),
    div(wikitext.revision.parent.markups,
Exemple #5
0
from revscoring.features import (
    diff, page, parent_revision, previous_user_revision, revision, user
)
from revscoring.features.modifiers import log
from . import generic

# Extends ``generic.damaging`` with badword/misspelling features. Each one is
# shifted by 1, clamped with ``max(..., 1)`` and then passed through ``log``;
# ``revision.infonoise`` is only shifted by 1 before ``log``.
# NOTE(review): only ``log`` is imported from the modifiers module in the
# visible imports, so ``max`` here looks like the builtin -- confirm that is
# what is intended for feature expressions.
damaging = generic.damaging + (
    [
        log(max(feature + 1, 1))
        for feature in (
            diff.added_badwords_ratio,
            diff.added_misspellings_ratio,
            diff.badwords_added,
            diff.badwords_removed,
            diff.misspellings_added,
            diff.misspellings_removed,
            diff.proportion_of_badwords_added,
            diff.proportion_of_badwords_removed,
            diff.proportion_of_misspellings_added,
            diff.proportion_of_misspellings_removed,
            diff.removed_badwords_ratio,
            diff.removed_misspellings_ratio,
            parent_revision.badwords,
            parent_revision.misspellings,
            parent_revision.proportion_of_badwords,
            parent_revision.proportion_of_misspellings,
            revision.badwords,
            revision.misspellings,
            revision.proportion_of_badwords,
            revision.proportion_of_misspellings,
        )
    ]
    + [log(revision.infonoise + 1)]
)
Exemple #6
0
from revscoring.features import diff, page, parent_revision, user
from revscoring.features.modifiers import log

# Feature list built only from ``diff``, ``page``, ``parent_revision`` and
# ``user`` features (no language module is referenced here). Count-style diff
# features are shifted by 1 and wrapped in ``log``; the other entries are
# listed as-is. The order of entries is part of the list's value.
no_lang_damaging = [
    log(diff.added_symbolic_chars_ratio + 1),
    log(diff.chars_added + 1),
    log(diff.chars_removed + 1),
    diff.longest_repeated_char_added,
    diff.longest_token_added,
    log(diff.markup_chars_added + 1),
    log(diff.markup_chars_removed + 1),
    log(diff.numeric_chars_added + 1),
    log(diff.numeric_chars_removed + 1),
    diff.proportion_of_chars_added,
    diff.proportion_of_chars_removed,
    diff.proportion_of_markup_chars_added,
    diff.proportion_of_numeric_chars_added,
    diff.proportion_of_symbolic_chars_added,
    diff.proportion_of_uppercase_chars_added,
    log(diff.segments_added + 1),
    log(diff.segments_removed + 1),
    log(diff.symbolic_chars_added + 1),
    log(diff.symbolic_chars_removed + 1),
    log(diff.uppercase_chars_added + 1),
    log(diff.uppercase_chars_removed + 1),
    # NOTE(review): shifted by 1 but, unlike the counts above, not wrapped in
    # ``log`` -- confirm this is intentional.
    diff.bytes_changed + 1,
    diff.bytes_changed_ratio,
    page.is_content_namespace,
    parent_revision.was_same_user,
    user.is_bot
]
Exemple #7
0
    category_links / max(wikitext.revision.content_chars, 1),
    cite_templates,
    cite_templates / max(wikitext.revision.content_chars, 1),
    proportion_of_templated_references,
    non_templated_references,
    non_templated_references / max(wikitext.revision.content_chars, 1),
    non_cite_templates,
    non_cite_templates / max(wikitext.revision.content_chars, 1),
    infobox_templates,
    cn_templates + 1,
    cn_templates / max(wikitext.revision.content_chars, 1),
    who_templates + 1,
    who_templates / max(wikitext.revision.content_chars, 1),
    main_article_templates,
    main_article_templates / max(wikitext.revision.content_chars, 1),
    (english.stemmed.revision.stem_chars /
     max(wikitext.revision.content_chars, 1)),
    log(paragraphs_without_refs_total_length + 1),
]

# ``wp10`` is ``wikipedia.article`` with ``local_wiki`` added to it
# (presumably list concatenation, so the local features come last).
wp10 = wikipedia.article + local_wiki
"""
Based largely on work by Morten Warncke-Wang et al.[1] and with a few
improvements and extensions that Morten identified after publication.

1. Warncke-Wang, M., Cosley, D., & Riedl, J. (2013, August). Tell me more: An
   actionable quality model for wikipedia. In Proceedings of the 9th
   International Symposium on Open Collaboration (p. 8). ACM.
   http://opensym.org/wsos2013/proceedings/p0202-warncke.pdf
"""
Exemple #8
0
                                 max(french.diff.words_removed, 1)
# Misspellings added/removed by the edit, divided by the words added/removed;
# the denominator is clamped with ``max(..., 1)``, presumably to avoid
# dividing by zero.
proportion_of_misspellings_added = (
    french.diff.misspellings_added / max(french.diff.words_added, 1)
)
proportion_of_misspellings_removed = (
    french.diff.misspellings_removed / max(french.diff.words_removed, 1)
)

# Badwords and misspellings in the parent revision, divided by its word count.
proportion_of_badwords = (
    french.parent_revision.badwords / max(french.parent_revision.words, 1)
)
proportion_of_misspellings = (
    french.parent_revision.misspellings /
    max(french.parent_revision.words, 1)
)

# ``*_added`` proportion divided by the parent-revision proportion, with the
# denominator floored at 0.01.
added_badwords_ratio = (
    proportion_of_badwords_added / max(proportion_of_badwords, 0.01)
)
added_misspellings_ratio = (
    proportion_of_misspellings_added / max(proportion_of_misspellings, 0.01)
)

damaging = util.no_lang_damaging + enwiki.badwords + enwiki.informals + [
    # Log-scaled counts (shifted by 1).
    log(french.diff.badwords_added + 1),
    log(french.diff.badwords_removed + 1),
    log(french.diff.misspellings_added + 1),
    log(french.diff.misspellings_removed + 1),
    # Proportions and ratios.
    proportion_of_badwords_added,
    proportion_of_badwords_removed,
    proportion_of_misspellings_added,
    proportion_of_misspellings_removed,
    added_badwords_ratio,
    added_misspellings_ratio,
]

# ``goodfaith`` is bound to the very same list object as ``damaging``.
goodfaith = damaging
Exemple #9
0
from revscoring.features import wikibase
from revscoring.features.modifiers import log

# One ``log(feature + 1)`` entry per revision-level wikibase feature.
item = [
    log(feature + 1)
    for feature in (
        wikibase.revision.claims,
        wikibase.revision.properties,
        wikibase.revision.aliases,
        wikibase.revision.sources,
        wikibase.revision.qualifiers,
        wikibase.revision.badges,
        wikibase.revision.labels,
        wikibase.revision.sitelinks,
        wikibase.revision.descriptions,
    )
]
Exemple #10
0
# Informals removed by the edit, divided by the words removed; the denominator
# is clamped with ``max(..., 1)``, presumably to avoid dividing by zero.
proportion_of_informals_removed = (
    estonian.diff.informals_removed / max(estonian.diff.words_removed, 1)
)

# Parent-revision counts of each word class, divided by its word count.
proportion_of_badwords = (
    estonian.parent_revision.badwords /
    max(estonian.parent_revision.words, 1)
)
proportion_of_misspellings = (
    estonian.parent_revision.misspellings /
    max(estonian.parent_revision.words, 1)
)
proportion_of_informals = (
    estonian.parent_revision.informals /
    max(estonian.parent_revision.words, 1)
)

# ``*_added`` proportion divided by the parent-revision proportion, with the
# denominator floored at 0.01.
added_badwords_ratio = (
    proportion_of_badwords_added / max(proportion_of_badwords, 0.01)
)
added_misspellings_ratio = (
    proportion_of_misspellings_added / max(proportion_of_misspellings, 0.01)
)
added_informals_ratio = (
    proportion_of_informals_added / max(proportion_of_informals, 0.01)
)

damaging = (
    util.no_lang_damaging
    + enwiki.badwords
    + enwiki.informals
    + [
        log(estonian.diff.badwords_added + 1),
        log(estonian.diff.badwords_removed + 1),
        log(estonian.diff.informals_added + 1),
        log(estonian.diff.informals_removed + 1),
        log(estonian.diff.misspellings_added + 1),
        log(estonian.diff.misspellings_removed + 1),
        proportion_of_badwords_added,
        proportion_of_badwords_removed,
        proportion_of_informals_added,
        proportion_of_informals_removed,
        proportion_of_misspellings_added,
        proportion_of_misspellings_removed,
        added_badwords_ratio,
        added_informals_ratio,
        added_misspellings_ratio,
    ]
Exemple #11
0
#    log(revscoring.features.diff.numeric_chars_added + 1),
#    log(revscoring.features.diff.numeric_chars_removed + 1),
#    revscoring.features.diff.proportion_of_chars_added,
#    revscoring.features.diff.proportion_of_chars_removed,
#    revscoring.features.diff.proportion_of_numeric_chars_added,
#    revscoring.features.diff.proportion_of_symbolic_chars_added,
#    revscoring.features.diff.proportion_of_uppercase_chars_added,
#    log(revscoring.features.diff.symbolic_chars_added + 1),
#    log(revscoring.features.diff.symbolic_chars_removed + 1),
#    log(revscoring.features.diff.uppercase_chars_added + 1),
#    log(revscoring.features.diff.uppercase_chars_removed + 1),
#    revscoring.features.diff.bytes_changed + 1,
#    revscoring.featuresdiff.bytes_changed_ratio,
#    page.is_content_namespace,
#    parent_revision.was_same_user,
    log(user.age + 1),
    diff.number_added_sitelinks,
    diff.number_removed_sitelinks,
    diff.number_changed_sitelinks,
    diff.number_added_labels,
    diff.number_removed_labels,
    diff.number_changed_labels,
    diff.number_added_descriptions,
    diff.number_removed_descriptions,
    diff.number_changed_descriptions,
    diff.number_added_aliases,
    diff.number_removed_aliases,
    diff.number_added_claims,
    diff.number_removed_claims,
    diff.number_changed_claims,
    diff.number_changed_identifiers,
Exemple #12
0
    revision_oriented.revision.user.in_group({'bot'},
                                             name="revision.user.is_bot"),
    revision_oriented.revision.user.in_group(
        {'checkuser', 'bureaucrat', 'oversight', 'steward'},
        name="revision.user.has_advanced_rights"),
    revision_oriented.revision.user.in_group({'sysop'},
                                             name="revision.user.is_admin"),
    revision_oriented.revision.user.in_group(
        {
            'browsearchive', 'deletedhistory', 'interface-editor',
            'noratelimit', 'accountcreator', 'massmessage-sender',
            'templateeditor', 'autopatrolled', 'propertycreator',
            'centralnoticeadmin'
        },
        name="revision.user.is_trusted"),
    revision_oriented.revision.user.in_group(
        {
            'rollback', 'abusefilter', 'patroller', 'reviewer', 'autoreview',
            'autoreviewer', 'editor', 'autoeditor', 'eliminator'
        },
        name="revision.user.is_patroller"),
    revision_oriented.revision.user.in_group(
        {'import', 'filemover', 'suppressredirect'},
        name="revision.user.is_curator")
]

# Two user features: the ``is_anon`` flag and the log of
# (seconds since registration + 1).
protected_user = [
    revision_oriented.revision.user.is_anon,
    log(temporal.revision.user.seconds_since_registration + 1)
]
Exemple #13
0
]


# Group-membership features for the revision's user: one ``in_group`` feature
# per family of user groups, each given an explicit name.
user_rights = [
    revision_oriented.revision.user.in_group({'bot'},
                                             name="revision.user.is_bot"),
    revision_oriented.revision.user.in_group(
        {'checkuser', 'bureaucrat', 'oversight', 'steward'},
        name="revision.user.has_advanced_rights"),
    revision_oriented.revision.user.in_group({'sysop'},
                                             name="revision.user.is_admin"),
    revision_oriented.revision.user.in_group(
        {
            'browsearchive', 'deletedhistory', 'interface-editor',
            'noratelimit', 'accountcreator', 'massmessage-sender',
            'templateeditor', 'autopatrolled', 'propertycreator',
            'centralnoticeadmin',
        },
        name="revision.user.is_trusted"),
    revision_oriented.revision.user.in_group(
        {
            'rollback', 'abusefilter', 'patroller', 'reviewer', 'autoreview',
            'autoreviewer', 'editor', 'autoeditor', 'eliminator',
        },
        name="revision.user.is_patroller"),
    revision_oriented.revision.user.in_group(
        {'import', 'filemover', 'suppressredirect'},
        name="revision.user.is_curator"),
]

# The ``is_anon`` flag plus the log of (seconds since registration + 1).
protected_user = [
    revision_oriented.revision.user.is_anon,
    log(temporal.revision.user.seconds_since_registration + 1),
]
Exemple #14
0
# Parent-revision counts of each word class, divided by its word count; the
# denominator is clamped with ``max(..., 1)``, presumably to avoid dividing
# by zero.
proportion_of_badwords = (
    hebrew.parent_revision.badwords / max(hebrew.parent_revision.words, 1)
)
proportion_of_misspellings = (
    hebrew.parent_revision.misspellings /
    max(hebrew.parent_revision.words, 1)
)
proportion_of_informals = (
    hebrew.parent_revision.informals / max(hebrew.parent_revision.words, 1)
)

# ``*_added`` proportion divided by the parent-revision proportion, with the
# denominator floored at 0.01.
added_badwords_ratio = (
    proportion_of_badwords_added / max(proportion_of_badwords, 0.01)
)
added_misspellings_ratio = (
    proportion_of_misspellings_added / max(proportion_of_misspellings, 0.01)
)
added_informals_ratio = (
    proportion_of_informals_added / max(proportion_of_informals, 0.01)
)

damaging = util.no_lang_damaging + enwiki.badwords + enwiki.informals + [
    # Log-scaled counts (shifted by 1).
    log(hebrew.diff.badwords_added + 1),
    log(hebrew.diff.badwords_removed + 1),
    log(hebrew.diff.informals_added + 1),
    log(hebrew.diff.informals_removed + 1),
    log(hebrew.diff.misspellings_added + 1),
    log(hebrew.diff.misspellings_removed + 1),
    # Proportions and ratios.
    proportion_of_badwords_added,
    proportion_of_badwords_removed,
    proportion_of_informals_added,
    proportion_of_informals_removed,
    proportion_of_misspellings_added,
    proportion_of_misspellings_removed,
    added_badwords_ratio,
    added_informals_ratio,
    added_misspellings_ratio,
]
Exemple #15
0
from revscoring.features import Feature, wikitext
from revscoring.features.modifiers import div, log, max, sub


def _process_new_longest(p_longest, r_longest):
    if r_longest > p_longest:
        return r_longest
    else:
        return 1

parent = [
    log(wikitext.revision.parent.chars + 1),
    log(wikitext.revision.parent.tokens + 1),
    log(wikitext.revision.parent.words + 1),
    log(wikitext.revision.parent.uppercase_words + 1),
    log(wikitext.revision.parent.headings + 1),
    log(wikitext.revision.parent.wikilinks + 1),
    log(wikitext.revision.parent.external_links + 1),
    log(wikitext.revision.parent.templates + 1),
    log(wikitext.revision.parent.ref_tags + 1),
    div(wikitext.revision.parent.chars,
        max(wikitext.revision.parent.words, 1),
        name="revision.parent.chars_per_word"),
    div(wikitext.revision.parent.words,
        max(wikitext.revision.parent.tokens, 1),
        name="revision.parent.words_per_token"),
    div(wikitext.revision.parent.uppercase_words,
        max(wikitext.revision.parent.words, 1),
        name="revision.parent.uppercase_words_per_word"),
    div(wikitext.revision.parent.markups,
        max(wikitext.revision.parent.tokens, 1),
Exemple #16
0
    "glwiki.revision.revision",
    wikitext.revision.datasources,
)
# ``str`` mapped over ``revision.paragraphs_sentences_and_whitespace``.
paragraphs = mappers.map(str,
                         revision.paragraphs_sentences_and_whitespace,
                         name="glwiki.revision.paragraphs")

# Entries of ``paragraphs`` filtered by a pattern that rejects empty or
# whitespace-only text and any text containing a literal ``<ref>``.
paragraphs_without_refs = filters.regex_matching(
    r"^(?!\s*$)((?!<ref>)(.|\n))*$", paragraphs,
    name="glwiki.revision.paragraphs_without_refs")

# Sum of ``len`` over the entries of ``paragraphs_without_refs``.
paragraphs_without_refs_total_length = aggregators.sum(
    mappers.map(len, paragraphs_without_refs),
    name="glwiki.revision.paragraphs_without_refs_total_length")

# Wiki-specific features; the per-character variants divide by the content
# character count clamped with ``max(..., 1)``.
local_wiki = [
    image_links,
    image_links / max(wikitext.revision.content_chars, 1),
    category_links,
    category_links / max(wikitext.revision.content_chars, 1),
    cn_templates + 1,
    cn_templates / max(wikitext.revision.content_chars, 1),
    log(paragraphs_without_refs_total_length + 1),
    (paragraphs_without_refs_total_length /
     max(wikitext.revision.content_chars, 1)),
]

wp10 = wikipedia.article + local_wiki
Exemple #17
0
# Parent-revision counts of each word class, divided by its word count; the
# denominator is clamped with ``max(..., 1)``, presumably to avoid dividing
# by zero.
proportion_of_badwords = (
    indonesian.parent_revision.badwords /
    max(indonesian.parent_revision.words, 1)
)
proportion_of_misspellings = (
    indonesian.parent_revision.misspellings /
    max(indonesian.parent_revision.words, 1)
)
proportion_of_informals = (
    indonesian.parent_revision.informals /
    max(indonesian.parent_revision.words, 1)
)

# ``*_added`` proportion divided by the parent-revision proportion, with the
# denominator floored at 0.01.
added_badwords_ratio = (
    proportion_of_badwords_added / max(proportion_of_badwords, 0.01)
)
added_misspellings_ratio = (
    proportion_of_misspellings_added / max(proportion_of_misspellings, 0.01)
)
added_informals_ratio = (
    proportion_of_informals_added / max(proportion_of_informals, 0.01)
)

damaging = util.no_lang_damaging + enwiki.badwords + enwiki.informals + [
    # Log-scaled counts (shifted by 1).
    log(indonesian.diff.badwords_added + 1),
    log(indonesian.diff.badwords_removed + 1),
    log(indonesian.diff.informals_added + 1),
    log(indonesian.diff.informals_removed + 1),
    log(indonesian.diff.misspellings_added + 1),
    log(indonesian.diff.misspellings_removed + 1),
    # Proportions and ratios.
    proportion_of_badwords_added,
    proportion_of_badwords_removed,
    proportion_of_informals_added,
    proportion_of_informals_removed,
    proportion_of_misspellings_added,
    proportion_of_misspellings_removed,
    added_badwords_ratio,
    added_informals_ratio,
    added_misspellings_ratio,
]
Exemple #18
0
    r"Cn",
    r"Fact"
]
# Built from the ``CN_TEMPLATES`` alternatives joined with "|" (presumably a
# regex alternation over template names -- confirm against
# ``templates_that_match``).
cn_templates = templates_that_match("|".join(CN_TEMPLATES),
                                    name="enwiki.revision.cn_templates")

# Fixed: this feature previously reused the name
# "enwiki.revision.cn_templates", which collided with ``cn_templates`` above
# and made the two features indistinguishable by name.
# NOTE(review): anything that refers to this feature by its old name (e.g.
# previously serialized models) will need updating.
who_templates = templates_that_match("Who",
                                     name="enwiki.revision.who_templates")

main_article_templates = templates_that_match(
    "Main",
    name="enwiki.main_article_templates")

wp10 = [
    revision.category_links,
    log(revision.content_chars + 1),
    log(revision.image_links + 1),
    revision.image_links / max(revision.content_chars, 1),
    log(cite_templates + 1),
    log((revision.templates - cite_templates) + 1),
    infobox_templates,
    english.revision.infonoise,
    log(revision.internal_links + 1),
    revision.internal_links / max(revision.content_chars, 1),
    revision.level_2_headings,
    revision.level_2_headings / max(revision.content_chars, 1),
    revision.level_3_headings,
    revision.level_3_headings / max(revision.content_chars, 1),
    log(revision.ref_tags + 1),
    revision.ref_tags / max(revision.content_chars, 1),
    log(max((revision.ref_tags - cite_templates) + 1, 1)),
Exemple #19
0
                         max(english.parent_revision.words, 1)
# Parent-revision counts of each word class, divided by its word count; the
# denominator is clamped with ``max(..., 1)``, presumably to avoid dividing
# by zero.
proportion_of_misspellings = (
    english.parent_revision.misspellings /
    max(english.parent_revision.words, 1)
)
proportion_of_informals = (
    english.parent_revision.informals /
    max(english.parent_revision.words, 1)
)

# ``*_added`` proportion divided by the parent-revision proportion, with the
# denominator floored at 0.01.
added_badwords_ratio = (
    proportion_of_badwords_added / max(proportion_of_badwords, 0.01)
)
added_misspellings_ratio = (
    proportion_of_misspellings_added / max(proportion_of_misspellings, 0.01)
)
added_informals_ratio = (
    proportion_of_informals_added / max(proportion_of_informals, 0.01)
)

# Badword features: ratio, log-scaled counts, then proportions.
badwords = [
    added_badwords_ratio,
    log(english.diff.badwords_added + 1),
    log(english.diff.badwords_removed + 1),
    proportion_of_badwords_added,
    proportion_of_badwords_removed,
]

# Informal-word features, in the same layout as ``badwords``.
informals = [
    added_informals_ratio,
    log(english.diff.informals_added + 1),
    log(english.diff.informals_removed + 1),
    proportion_of_informals_added,
    proportion_of_informals_removed,
]

damaging = util.no_lang_damaging + [
    log(english.diff.words_added + 1),
Exemple #20
0
                                 max(turkish.diff.words_removed, 1)
# Informals added/removed by the edit, divided by the words added/removed;
# the denominator is clamped with ``max(..., 1)``, presumably to avoid
# dividing by zero.
proportion_of_informals_added = (
    turkish.diff.informals_added / max(turkish.diff.words_added, 1)
)
proportion_of_informals_removed = (
    turkish.diff.informals_removed / max(turkish.diff.words_removed, 1)
)

# Badwords and informals in the parent revision, divided by its word count.
proportion_of_badwords = (
    turkish.parent_revision.badwords / max(turkish.parent_revision.words, 1)
)
proportion_of_informals = (
    turkish.parent_revision.informals /
    max(turkish.parent_revision.words, 1)
)

# ``*_added`` proportion divided by the parent-revision proportion, with the
# denominator floored at 0.01.
added_badwords_ratio = (
    proportion_of_badwords_added / max(proportion_of_badwords, 0.01)
)
added_informals_ratio = (
    proportion_of_informals_added / max(proportion_of_informals, 0.01)
)

damaging = util.no_lang_damaging + enwiki.badwords + enwiki.informals + [
    # Log-scaled counts (shifted by 1).
    log(turkish.diff.badwords_added + 1),
    log(turkish.diff.badwords_removed + 1),
    log(turkish.diff.informals_added + 1),
    log(turkish.diff.informals_removed + 1),
    # Proportions and ratios.
    proportion_of_badwords_added,
    proportion_of_badwords_removed,
    proportion_of_informals_added,
    proportion_of_informals_removed,
    added_badwords_ratio,
    added_informals_ratio,
]

# ``goodfaith`` is bound to the very same list object as ``damaging``.
goodfaith = damaging
Exemple #21
0
# Parent-revision counts of each word class, divided by its word count; the
# denominator is clamped with ``max(..., 1)``, presumably to avoid dividing
# by zero.
proportion_of_badwords = (
    italian.parent_revision.badwords / max(italian.parent_revision.words, 1)
)
proportion_of_misspellings = (
    italian.parent_revision.misspellings /
    max(italian.parent_revision.words, 1)
)
proportion_of_informals = (
    italian.parent_revision.informals /
    max(italian.parent_revision.words, 1)
)

# ``*_added`` proportion divided by the parent-revision proportion, with the
# denominator floored at 0.01.
added_badwords_ratio = (
    proportion_of_badwords_added / max(proportion_of_badwords, 0.01)
)
added_misspellings_ratio = (
    proportion_of_misspellings_added / max(proportion_of_misspellings, 0.01)
)
added_informals_ratio = (
    proportion_of_informals_added / max(proportion_of_informals, 0.01)
)

damaging = util.no_lang_damaging + english.badwords + [
    log(italian.diff.words_added + 1),
    log(italian.diff.words_removed + 1),
    log(italian.parent_revision.words + 1),
    log(italian.diff.badwords_added + 1),
    log(italian.diff.badwords_removed + 1),
    log(italian.diff.informals_added + 1),
    log(italian.diff.informals_removed + 1),
    log(italian.diff.misspellings_added + 1),
    log(italian.diff.misspellings_removed + 1),
    proportion_of_badwords_added,
    proportion_of_badwords_removed,
    proportion_of_informals_added,
    proportion_of_informals_removed,
    proportion_of_misspellings_added,
    proportion_of_misspellings_removed,
    added_badwords_ratio,
Exemple #22
0
from revscoring.features import (
    diff, page, parent_revision, previous_user_revision, revision, user
)
from revscoring.features.modifiers import log
from . import generic

# Extends ``generic.damaging`` with badword features, each shifted by 1 and
# passed through ``log``.
damaging = generic.damaging + [
    log(feature + 1)
    for feature in (
        diff.added_badwords_ratio,
        diff.badwords_added,
        diff.badwords_removed,
        diff.proportion_of_badwords_added,
        diff.proportion_of_badwords_removed,
        diff.removed_badwords_ratio,
        parent_revision.badwords,
        parent_revision.proportion_of_badwords,
        revision.badwords,
        revision.proportion_of_badwords,
    )
]

# Extends ``generic.good_faith`` with the same set of badword features, built
# as separate feature objects.
good_faith = generic.good_faith + [
    log(feature + 1)
    for feature in (
        diff.added_badwords_ratio,
        diff.badwords_added,
        diff.badwords_removed,
        diff.proportion_of_badwords_added,
        diff.proportion_of_badwords_removed,
        diff.removed_badwords_ratio,
        parent_revision.badwords,
        parent_revision.proportion_of_badwords,
        revision.badwords,
        revision.proportion_of_badwords,
    )
]
Exemple #23
0
    all_images, all_images / max(wikitext.revision.content_chars, 1),
    category_links, category_links / max(wikitext.revision.content_chars, 1),
    all_ref_tags, all_ref_tags / max(wikitext.revision.content_chars, 1),
    all_cite_templates,
    all_cite_templates / max(wikitext.revision.content_chars, 1),
    proportion_of_templated_references, non_templated_references,
    non_templated_references / max(wikitext.revision.content_chars, 1),
    non_cite_templates, non_cite_templates /
    max(wikitext.revision.content_chars, 1), infobox_templates,
    cn_templates + 1, cn_templates / max(wikitext.revision.content_chars, 1),
    who_templates + 1, who_templates / max(wikitext.revision.content_chars, 1),
    main_article_templates,
    main_article_templates / max(wikitext.revision.content_chars, 1),
    (english.stemmed.revision.stem_chars /
     max(wikitext.revision.content_chars, 1)),
    log(paragraphs_without_refs_total_length + 1), words_to_watch_count,
    words_to_watch_count / max(wikitext.revision.words, 1), idioms_count,
    idioms_count /
    max(wikitext.revision.words, 1), words_to_watch_count + idioms_count,
    (words_to_watch_count + idioms_count) / max(wikitext.revision.words, 1)
]

# ``wp10`` is ``wikipedia.article`` with ``local_wiki`` added to it
# (presumably list concatenation, so the local features come last).
wp10 = wikipedia.article + local_wiki
"""
Based largely on work by Morten Warncke-Wang et al.[1] and with a few
improvements and extensions that Morten identified after publication.

1. Warncke-Wang, M., Cosley, D., & Riedl, J. (2013, August). Tell me more: An
   actionable quality model for wikipedia. In Proceedings of the 9th
   International Symposium on Open Collaboration (p. 8). ACM.
   http://opensym.org/wsos2013/proceedings/p0202-warncke.pdf
Exemple #24
0
# Parent-revision counts of each word class, divided by its word count; the
# denominator is clamped with ``max(..., 1)``, presumably to avoid dividing
# by zero.
proportion_of_badwords = (
    portuguese.parent_revision.badwords /
    max(portuguese.parent_revision.words, 1)
)
proportion_of_misspellings = (
    portuguese.parent_revision.misspellings /
    max(portuguese.parent_revision.words, 1)
)
proportion_of_informals = (
    portuguese.parent_revision.informals /
    max(portuguese.parent_revision.words, 1)
)

# ``*_added`` proportion divided by the parent-revision proportion, with the
# denominator floored at 0.01.
added_badwords_ratio = (
    proportion_of_badwords_added / max(proportion_of_badwords, 0.01)
)
added_misspellings_ratio = (
    proportion_of_misspellings_added / max(proportion_of_misspellings, 0.01)
)
added_informals_ratio = (
    proportion_of_informals_added / max(proportion_of_informals, 0.01)
)

damaging = util.no_lang_damaging + enwiki.badwords + enwiki.informals + [
    # Log-scaled counts (shifted by 1).
    log(portuguese.diff.badwords_added + 1),
    log(portuguese.diff.badwords_removed + 1),
    log(portuguese.diff.informals_added + 1),
    log(portuguese.diff.informals_removed + 1),
    log(portuguese.diff.misspellings_added + 1),
    log(portuguese.diff.misspellings_removed + 1),
    # Proportions and ratios.
    proportion_of_badwords_added,
    proportion_of_badwords_removed,
    proportion_of_informals_added,
    proportion_of_informals_removed,
    proportion_of_misspellings_added,
    proportion_of_misspellings_removed,
    added_badwords_ratio,
    added_informals_ratio,
    added_misspellings_ratio,
]
Exemple #25
0
# Informals removed by the edit, divided by the words removed; the denominator
# is clamped with ``max(..., 1)``, presumably to avoid dividing by zero.
proportion_of_informals_removed = (
    spanish.diff.informals_removed / max(spanish.diff.words_removed, 1)
)

# Parent-revision counts of each word class, divided by its word count.
proportion_of_badwords = (
    spanish.parent_revision.badwords / max(spanish.parent_revision.words, 1)
)
proportion_of_misspellings = (
    spanish.parent_revision.misspellings /
    max(spanish.parent_revision.words, 1)
)
proportion_of_informals = (
    spanish.parent_revision.informals /
    max(spanish.parent_revision.words, 1)
)

# ``*_added`` proportion divided by the parent-revision proportion, with the
# denominator floored at 0.01.
added_badwords_ratio = (
    proportion_of_badwords_added / max(proportion_of_badwords, 0.01)
)
added_misspellings_ratio = (
    proportion_of_misspellings_added / max(proportion_of_misspellings, 0.01)
)
added_informals_ratio = (
    proportion_of_informals_added / max(proportion_of_informals, 0.01)
)

damaging = (
    util.no_lang_damaging
    + enwiki.badwords
    + enwiki.informals
    + [
        log(spanish.diff.badwords_added + 1),
        log(spanish.diff.badwords_removed + 1),
        log(spanish.diff.informals_added + 1),
        log(spanish.diff.informals_removed + 1),
        log(spanish.diff.misspellings_added + 1),
        log(spanish.diff.misspellings_removed + 1),
        proportion_of_badwords_added,
        proportion_of_badwords_removed,
        proportion_of_informals_added,
        proportion_of_informals_removed,
        proportion_of_misspellings_added,
        proportion_of_misspellings_removed,
        added_badwords_ratio,
        added_informals_ratio,
        added_misspellings_ratio,
    ]
Exemple #26
0
# Parent-revision counts of each word class, divided by its word count; the
# denominator is clamped with ``max(..., 1)``, presumably to avoid dividing
# by zero.
proportion_of_badwords = (
    vietnamese.parent_revision.badwords /
    max(vietnamese.parent_revision.words, 1)
)
proportion_of_misspellings = (
    vietnamese.parent_revision.misspellings /
    max(vietnamese.parent_revision.words, 1)
)
proportion_of_informals = (
    vietnamese.parent_revision.informals /
    max(vietnamese.parent_revision.words, 1)
)

# ``*_added`` proportion divided by the parent-revision proportion, with the
# denominator floored at 0.01.
added_badwords_ratio = (
    proportion_of_badwords_added / max(proportion_of_badwords, 0.01)
)
added_misspellings_ratio = (
    proportion_of_misspellings_added / max(proportion_of_misspellings, 0.01)
)
added_informals_ratio = (
    proportion_of_informals_added / max(proportion_of_informals, 0.01)
)

damaging = util.no_lang_damaging + enwiki.badwords + enwiki.informals + [
    # Log-scaled counts (shifted by 1).
    log(vietnamese.diff.badwords_added + 1),
    log(vietnamese.diff.badwords_removed + 1),
    log(vietnamese.diff.informals_added + 1),
    log(vietnamese.diff.informals_removed + 1),
    log(vietnamese.diff.misspellings_added + 1),
    log(vietnamese.diff.misspellings_removed + 1),
    # Proportions and ratios.
    proportion_of_badwords_added,
    proportion_of_badwords_removed,
    proportion_of_informals_added,
    proportion_of_informals_removed,
    proportion_of_misspellings_added,
    proportion_of_misspellings_removed,
    added_badwords_ratio,
    added_informals_ratio,
    added_misspellings_ratio,
]