Example #1
0
def test_max():
    """Check that ``modifiers.max`` solves, survives pickling and has a repr."""
    largest = modifiers.max(5, 6, 7)
    assert solve(largest) == 7

    # The feature must solve to the same value after a pickle round trip.
    round_tripped = pickle.loads(pickle.dumps(largest))
    assert solve(round_tripped) == 7

    assert repr(largest) == "<feature.max(5, 6, 7)>"
Example #2
0
def test_trim():
    """Check ``trim`` on one feature, a list of features and an expression."""
    source = Datasource("derp1")
    plain = Feature("foobar1", returns=int)
    sourced = Feature("foobar2", returns=int, depends_on=[source])
    five = Constant(value=5)
    vector = FeatureVector("foobar3", returns=int, depends_on=[five])

    # A single feature trims to itself.
    trimmed_single = list(trim(plain))
    assert trimmed_single == [plain]

    # A list of features is returned unchanged, in order.
    trimmed_many = list(trim([plain, sourced, vector]))
    assert trimmed_many == [plain, sourced, vector]

    # A nested modifier expression trims down to the features it is built on.
    nested = log(max(plain - sourced, 1))
    assert (list(trim(nested)) == [plain, sourced])
Example #3
0
def test_trim():
    """Exercise ``trim`` with a feature, a feature list and a modifier tree."""
    datasource = Datasource("derp1")
    first = Feature("foobar1", returns=int)
    second = Feature("foobar2", returns=int, depends_on=[datasource])
    constant = Constant(value=5)
    feature_vector = FeatureVector("foobar3", returns=int,
                                   depends_on=[constant])

    # One feature in, the same feature out.
    assert list(trim(first)) == [first]

    # Several features in, the same features out in the same order.
    everything = [first, second, feature_vector]
    assert list(trim(everything)) == [first, second, feature_vector]

    # Only the two underlying features remain of log(max(first - second, 1)).
    expression = log(max(first - second, 1))
    assert list(trim(expression)) == [first, second]
Example #4
0
"""
Turkish Wikipedia
+++++++++++++++++
"""

from revscoring.features import wikitext
from revscoring.features.modifiers import log, max, sub

from . import wikipedia

# Templates whose name matches the citation pattern below.
cite_templates = wikitext.revision.template_names_matching(
    r"Kaynak|.*[ _]kaynağı",
    name="trwiki.revision.cite_templates")
# Citation templates per <ref> tag; the denominator is clamped to at least 1.
proportion_of_templated_references = \
    cite_templates / max(wikitext.revision.ref_tags, 1)
# <ref> tags minus citation templates, floored at 0.
non_templated_references = max(wikitext.revision.ref_tags - cite_templates, 0)
# All templates minus the citation templates.
non_cite_templates = sub(
    wikitext.revision.templates, cite_templates,
    name="trwiki.revision.non_cite_templates"
)
# Templates whose name matches the infobox pattern below.
infobox_templates = wikitext.revision.template_names_matching(
    r".*[ _]bilgi[ _]kutusu",
    name="trwiki.revision.infobox_templates")

# Copied (2015-10-29) from:
# https://fr.wikipedia.org/wiki/Wikip%C3%A9dia:Citez_vos_sources#R.C3.A9clamation_et_contestation_de_sources
# NOTE(review): the URL above points at French Wikipedia although the pattern
# below is used for trwiki -- presumably a leftover from a copied module;
# confirm the real source of this template list.
# NOTE(review): the feature is registered as "...lvl1_cn_templates" while the
# variable is called ``cn_templates`` -- confirm the name is intended.
cn_templates = wikitext.revision.template_names_matching(
    r"Kaynak[ _]belirt|Olgu|Fact|Delil",
    name="trwiki.revision.lvl1_cn_templates")

main_article_templates = wikitext.revision.template_names_matching(
Example #5
0
# Copied (2015-10-29) from:
# https://fr.wikipedia.org/wiki/Cat%C3%A9gorie:Mod%C3%A8le_pour_bibliographie
# Name patterns of the templates counted as citations.
CITE_TEMPLATES = [
    r"Article", r"Chapitre", r"Jugement", r"Lien[ _]web", r"Loi", r"Ouvrage"
]
cite_templates = wikitext.revision.template_names_matching(
    "|".join(CITE_TEMPLATES), name="frwiki.revision.cite_templates")

# Citation templates per <ref> tag; the denominator is never below 1.
proportion_of_templated_references = (
    cite_templates / max(wikitext.revision.ref_tags, 1))

# <ref> tags not accounted for by a citation template, floored at 0.
non_templated_references = max(wikitext.revision.ref_tags - cite_templates, 0)

# Every template that is not a citation template.
non_cite_templates = sub(wikitext.revision.templates,
                         cite_templates,
                         name="frwiki.revision.non_cite_templates")

infobox_templates = wikitext.revision.template_names_matching(
    r"^infobox", name="frwiki.revision.infobox_templates")

# Copied (2015-10-29) from:
# https://fr.wikipedia.org/wiki/Wikip%C3%A9dia:Citez_vos_sources#R.C3.A9clamation_et_contestation_de_sources
LVL1_CN_TEMPLATES = [
    r"Référence[ _]souhaitée",
    r"Citation[ _]nécessaire",
    r"Référence[ _]à[ _]confirmer",
    r"Référence[ _]nécessaire",
    r"Inédit",
]
lvl1_cn_templates = wikitext.revision.template_names_matching(
Example #6
0
    r"Harvard citation text", r"harvtxt",
    r"Harvcoltxt",
    r"Harvcol",
    r"Harvcolnb",
    r"Harvard citations", r"harvs",
    r"Harvp",
    r"Citation"
]
# Citation templates and shortened-footnote ("sfn") templates.
cite_templates = wikitext.revision.template_names_matching(
    "|".join(CITE_TEMPLATES),
    name="ukwiki.revision.cite_templates")
shortened_footnote_templates = wikitext.revision.template_names_matching(
    "sfn",
    name="ukwiki.revision.shortened_footnote_templates")

# Shortened footnotes count both as references and as citation templates.
all_ref_tags = shortened_footnote_templates + wikitext.revision.ref_tags
all_cite_templates = cite_templates + shortened_footnote_templates

# Citation templates per reference; the denominator is never below 1.
proportion_of_templates_references = (
    all_cite_templates / max(all_ref_tags, 1))

# References without a citation template, floored at 0.
non_templated_references = max(all_ref_tags - all_cite_templates, 0)

# Every template that is not a citation template.
non_cite_templates = sub(wikitext.revision.templates,
                         all_cite_templates,
                         name="ukwiki.revision.non_cite_templates")

# Links
CATEGORY_LINKS = [r"Категорія", r"Category", r"Категория"]
category_links = wikitext.revision.wikilink_titles_matching(
    "|".join(CATEGORY_LINKS),
    name="ukwiki.revision.category_links")
Example #7
0
    "glwiki.revision.revision",
    wikitext.revision.datasources,
)
# Every paragraph-level segment of the revision, converted with ``str``.
paragraphs = mappers.map(str,
                         revision.paragraphs_sentences_and_whitespace,
                         name="glwiki.revision.paragraphs")

# Paragraphs that are not blank and contain no literal "<ref>" anywhere.
paragraphs_without_refs = filters.regex_matching(
    r"^(?!\s*$)((?!<ref>)(.|\n))*$",
    paragraphs,
    name="glwiki.revision.paragraphs_without_refs")

# Sum of ``len`` over the paragraphs kept by the filter above.
paragraphs_without_refs_total_length = aggregators.sum(
    mappers.map(len, paragraphs_without_refs),
    name="glwiki.revision.paragraphs_without_refs_total_length")

# Wiki-specific features; ratios clamp their denominator to at least 1.
local_wiki = [
    image_links,
    (image_links / max(wikitext.revision.content_chars, 1)),
    category_links,
    (category_links / max(wikitext.revision.content_chars, 1)),
    cn_templates + 1,
    (cn_templates / max(wikitext.revision.content_chars, 1)),
    log(paragraphs_without_refs_total_length + 1),
    (paragraphs_without_refs_total_length /
     max(wikitext.revision.content_chars, 1)),
]

wp10 = wikipedia.article + local_wiki
Example #8
0
    else:
        return 1


# Features of the parent revision: log-scaled counts followed by ratios whose
# denominators are clamped to at least 1.
parent = [
    log(wikitext.revision.parent.chars + 1),
    log(wikitext.revision.parent.tokens + 1),
    log(wikitext.revision.parent.words + 1),
    log(wikitext.revision.parent.uppercase_words + 1),
    log(wikitext.revision.parent.headings + 1),
    log(wikitext.revision.parent.wikilinks + 1),
    log(wikitext.revision.parent.external_links + 1),
    log(wikitext.revision.parent.templates + 1),
    log(wikitext.revision.parent.ref_tags + 1),
    div(
        wikitext.revision.parent.chars,
        max(wikitext.revision.parent.words, 1),
        name="revision.parent.chars_per_word",
    ),
    div(
        wikitext.revision.parent.words,
        max(wikitext.revision.parent.tokens, 1),
        name="revision.parent.words_per_token",
    ),
    div(
        wikitext.revision.parent.uppercase_words,
        max(wikitext.revision.parent.words, 1),
        name="revision.parent.uppercase_words_per_word",
    ),
    div(
        wikitext.revision.parent.markups,
        max(wikitext.revision.parent.tokens, 1),
        name="revision.parent.markups_per_token",
    ),
]

diff = [
    wikitext.revision.diff.markup_delta_sum,
    wikitext.revision.diff.markup_delta_increase,
Example #9
0
    wikitext.revision.datasources,
)
# Every paragraph-level segment of the revision, converted with ``str``.
paragraphs = mappers.map(
    str, revision.paragraphs_sentences_and_whitespace,
    name="euwiki.revision.paragraphs"
)
# Paragraphs that are not blank and contain no literal "<ref>" anywhere.
paragraphs_without_refs = filters.regex_matching(
    r"^(?!\s*$)((?!<ref>)(.|\n))*$",
    paragraphs,
    name="euwiki.revision.paragraphs_without_refs"
)
# Sum of ``len`` over the paragraphs kept by the filter above.
paragraphs_without_refs_total_length = aggregators.sum(
    mappers.map(len, paragraphs_without_refs),
    name="euwiki.revision.paragraphs_without_refs_total_length"
)

# Wiki-specific features; ratios clamp their denominator to at least 1.
local_wiki = [
    image_links,
    (image_links / max(wikitext.revision.content_chars, 1)),
    # category_links,
    # category_links / max(wikitext.revision.content_chars, 1),
    infobox_templates,
    cn_templates + 1,
    (cn_templates / max(wikitext.revision.content_chars, 1)),
    log(paragraphs_without_refs_total_length + 1),
    basque.dictionary.revision.dict_words,
    (basque.dictionary.revision.dict_words /
     max(wikitext.revision.words, 1)),
    english.dictionary.revision.dict_words,
    (english.dictionary.revision.dict_words /
     max(wikitext.revision.words, 1)),
    spanish.dictionary.revision.dict_words,
    (spanish.dictionary.revision.dict_words /
     max(wikitext.revision.words, 1)),
]

wp10 = wikipedia.article + local_wiki
Example #10
0
from revscoring.features import diff, page, parent_revision, revision, user
from revscoring.features.modifiers import log, max
from revscoring.languages import portuguese

from . import enwiki

# Share of the words added (or removed) by the edit that are badwords,
# misspellings or informals.  Each denominator is clamped to at least 1 so an
# edit that adds (or removes) no words cannot divide by zero.
# The "*_removed" proportions are built from the removed-word features; they
# previously reused the "*_added" features, which made every "removed"
# proportion a duplicate of its "added" counterpart.
proportion_of_badwords_added = (
    portuguese.diff.badwords_added /
    max(portuguese.diff.words_added, 1))
proportion_of_badwords_removed = (
    portuguese.diff.badwords_removed /
    max(portuguese.diff.words_removed, 1))
proportion_of_misspellings_added = (
    portuguese.diff.misspellings_added /
    max(portuguese.diff.words_added, 1))
proportion_of_misspellings_removed = (
    portuguese.diff.misspellings_removed /
    max(portuguese.diff.words_removed, 1))
proportion_of_informals_added = (
    portuguese.diff.informals_added /
    max(portuguese.diff.words_added, 1))
proportion_of_informals_removed = (
    portuguese.diff.informals_removed /
    max(portuguese.diff.words_removed, 1))

# The same proportions measured on the parent revision.
proportion_of_badwords = (
    portuguese.parent_revision.badwords /
    max(portuguese.parent_revision.words, 1))
proportion_of_misspellings = (
    portuguese.parent_revision.misspellings /
    max(portuguese.parent_revision.words, 1))
proportion_of_informals = (
    portuguese.parent_revision.informals /
    max(portuguese.parent_revision.words, 1))

# Added proportion relative to the parent's proportion; the parent's
# proportion is floored at 0.01 to keep the ratio finite.
added_badwords_ratio = (
    proportion_of_badwords_added /
    max(proportion_of_badwords, 0.01))
added_misspellings_ratio = (
    proportion_of_misspellings_added /
    max(proportion_of_misspellings, 0.01))
added_informals_ratio = proportion_of_informals_added / \
Example #11
0
"""
French Wikisource
+++++++++++++++++
"""

from revscoring.features import wikitext
from revscoring.features.modifiers import max
from revscoring.languages import french

from . import wikisource

# Wiki-specific features; ratios clamp their denominator to at least 1.
local_wiki = [
    wikitext.revision.chars,
    french.stemmed.revision.stem_chars,
    (french.stemmed.revision.stem_chars /
     max(wikitext.revision.chars, 1)),
    (french.dictionary.revision.dict_words /
     max(wikitext.revision.words, 1)),
    (french.dictionary.revision.dict_words /
     max(french.dictionary.revision.non_dict_words, 1)),
]

pagelevel = local_wiki + wikisource.page
Example #12
0
unique_sources_count = aggregators.len(unique_sources)
"`int` : A count of unique sources in the revision"

# Status
# NOTE(review): ``is_human`` prefixes its feature name with ``name`` while
# ``has_birthday`` and ``dead`` use a bare 'revision.' prefix -- confirm
# whether the differing prefixes are intended.
is_human = wikibase_.revision.has_property_value(
    properties.INSTANCE_OF, items.HUMAN, name=name + '.is_human')
has_birthday = wikibase_.revision.has_property(
    properties.DATE_OF_BIRTH, name='revision.has_birthday')
dead = wikibase_.revision.has_property(
    properties.DATE_OF_DEATH, name='revision.dead')
# True when the item has a date of birth and no date of death
# (presumably "biography of a living person" -- confirm).
is_blp = has_birthday.and_(not_(dead))

# Item-level features; each source count is followed by its ratio to the
# number of source claims, with the denominator clamped to at least 1.
local_wiki = [
    is_human,
    is_blp,
    aggregators.len(complete_translations),
    aggregators.len(important_label_translations),
    aggregators.len(important_description_translations),
    aggregators.len(important_complete_translations),
    source_claims_count,
    wikimedia_sources_count,
    wikimedia_sources_count / modifiers.max(source_claims_count, 1),
    external_sources_count,
    external_sources_count / modifiers.max(source_claims_count, 1),
    unique_sources_count,
    unique_sources_count / modifiers.max(source_claims_count, 1)
]

# Full feature list: the shared wikibase item features plus the local ones.
item_quality = wikibase.item + local_wiki
Example #13
0
        return r_longest
    else:
        return 1

# Parent-revision features: nine log-scaled counts, then four ratios whose
# denominators are clamped to at least 1.
parent = [
    log(wikitext.revision.parent.chars + 1),
    log(wikitext.revision.parent.tokens + 1),
    log(wikitext.revision.parent.words + 1),
    log(wikitext.revision.parent.uppercase_words + 1),
    log(wikitext.revision.parent.headings + 1),
    log(wikitext.revision.parent.wikilinks + 1),
    log(wikitext.revision.parent.external_links + 1),
    log(wikitext.revision.parent.templates + 1),
    log(wikitext.revision.parent.ref_tags + 1),
    div(wikitext.revision.parent.chars,
        max(wikitext.revision.parent.words, 1),
        name="revision.parent.chars_per_word"
        ),
    div(wikitext.revision.parent.words,
        max(wikitext.revision.parent.tokens, 1),
        name="revision.parent.words_per_token"
        ),
    div(wikitext.revision.parent.uppercase_words,
        max(wikitext.revision.parent.words, 1),
        name="revision.parent.uppercase_words_per_word"
        ),
    div(wikitext.revision.parent.markups,
        max(wikitext.revision.parent.tokens, 1),
        name="revision.parent.markups_per_token"
        ),
]

diff = [
    wikitext.revision.diff.markup_delta_sum,
    wikitext.revision.diff.markup_delta_increase,
Example #14
0
]
cite_templates = wikitext.revision.template_names_matching(
    "|".join(CITE_TEMPLATES),
    name="enwiki.revision.cite_templates")

# Name patterns of the shortened-footnote templates.
SFN_TEMPLATES = [
    r"Shortened footnote template", r"sfn", r"Sfnp", r"Sfnm", r"Sfnmp"
]
shortened_footnote_templates = wikitext.revision.template_names_matching(
    "|".join(SFN_TEMPLATES),
    name="enwiki.revision.shortened_footnote_templates")

# Shortened footnotes count both as references and as citation templates.
all_ref_tags = shortened_footnote_templates + wikitext.revision.ref_tags
all_cite_templates = cite_templates + shortened_footnote_templates

# Citation templates per reference; the denominator is never below 1.
proportion_of_templated_references = (
    all_cite_templates / max(all_ref_tags, 1))

# References without a citation template, floored at 0.
non_templated_references = max(all_ref_tags - all_cite_templates, 0)

# Every template that is not a citation template.
non_cite_templates = sub(wikitext.revision.templates,
                         all_cite_templates,
                         name="enwiki.revision.non_cite_templates")

# Links
category_links = wikitext.revision.wikilink_titles_matching(
    r"Category\:",
    name="enwiki.revision.category_links")
image_links = wikitext.revision.wikilink_titles_matching(
    r"File|Image\:",
    name="enwiki.revision.image_links")
# References
revision = Revision(
    "enwiki.revision.revision",
Example #15
0
        revision_oriented_datasources.revision.page.suggested.properties])

# Status
# Boolean status features of the item.
is_human = wikibase_.revision.has_property_value(
    properties.INSTANCE_OF,
    items.HUMAN,
    name=name + '.revision.is_human')
has_birthday = wikibase_.revision.has_property(
    properties.DATE_OF_BIRTH,
    name=name + '.revision.has_birthday')
dead = wikibase_.revision.has_property(
    properties.DATE_OF_DEATH,
    name=name + '.revision.dead')
# Has a date of birth and no date of death.
is_blp = has_birthday.and_(not_(dead))


# Item-level features; each reference count is followed by its ratio to the
# total reference count, with the denominator clamped to at least 1.
local_wiki = [
    is_human, is_blp,
    aggregators.len(complete_translations),
    aggregators.len(important_label_translations),
    aggregators.len(important_description_translations),
    aggregators.len(important_complete_translations),
    references_count,
    wikimedia_references_count,
    (wikimedia_references_count / modifiers.max(references_count, 1)),
    external_references_count,
    (external_references_count / modifiers.max(references_count, 1)),
    unique_references_count,
    (unique_references_count / modifiers.max(references_count, 1)),
    item_completeness,
]

item_quality = wikibase.item + local_wiki
Example #16
0
    r"Harvard citation", r"harv", r"Harvard citation text", r"harvtxt",
    r"Harvcoltxt", r"Harvcol", r"Harvcolnb", r"Harvard citations", r"harvs",
    r"Harvp"
]
cite_templates = wikitext.revision.template_names_matching(
    "|".join(CITE_TEMPLATES),
    name="enwiki.revision.cite_templates")

# Name patterns of the shortened-footnote templates.
SFN_TEMPLATES = [
    r"Shortened footnote template",
    r"sfn",
    r"Sfnp",
    r"Sfnm",
    r"Sfnmp",
]
shortened_footnote_templates = wikitext.revision.template_names_matching(
    "|".join(SFN_TEMPLATES),
    name="enwiki.revision.shortened_footnote_templates")

# Shortened footnotes count both as references and as citation templates.
all_ref_tags = shortened_footnote_templates + wikitext.revision.ref_tags
all_cite_templates = cite_templates + shortened_footnote_templates

# Citation templates per reference; the denominator is never below 1.
proportion_of_templated_references = (
    all_cite_templates / max(all_ref_tags, 1))

# References without a citation template, floored at 0.
non_templated_references = max(all_ref_tags - all_cite_templates, 0)

# Every template that is not a citation template.
non_cite_templates = sub(
    wikitext.revision.templates, all_cite_templates,
    name="enwiki.revision.non_cite_templates"
)

# Links
category_links = wikitext.revision.wikilink_titles_matching(
    r"Category\:",
    name="enwiki.revision.category_links")

image_links = wikitext.revision.wikilink_titles_matching(
    r"File|Image\:",
    name="enwiki.revision.image_links")

image_templates = wikitext.revision.template_names_matching(
    r"((Wide|Tall|scalable) image)|Panorama|Panorama 2",
    name='enwiki.revision.image_template'
)
Example #17
0
from revscoring.features.modifiers import log, max
from revscoring.languages import spanish

from . import enwiki, util

# Share of the words added / removed by the edit that are badwords,
# misspellings or informals; denominators are clamped to at least 1.
proportion_of_badwords_added = (
    spanish.diff.badwords_added / max(spanish.diff.words_added, 1)
)
proportion_of_badwords_removed = (
    spanish.diff.badwords_removed / max(spanish.diff.words_removed, 1)
)
proportion_of_misspellings_added = (
    spanish.diff.misspellings_added / max(spanish.diff.words_added, 1)
)
proportion_of_misspellings_removed = (
    spanish.diff.misspellings_removed / max(spanish.diff.words_removed, 1)
)
proportion_of_informals_added = (
    spanish.diff.informals_added / max(spanish.diff.words_added, 1)
)
proportion_of_informals_removed = (
    spanish.diff.informals_removed / max(spanish.diff.words_removed, 1)
)

# The same proportions measured on the parent revision.
proportion_of_badwords = (
    spanish.parent_revision.badwords / max(spanish.parent_revision.words, 1)
)
proportion_of_misspellings = (
    spanish.parent_revision.misspellings /
    max(spanish.parent_revision.words, 1)
)
proportion_of_informals = (
    spanish.parent_revision.informals / max(spanish.parent_revision.words, 1)
)

# Added proportion relative to the parent's proportion (floored at 0.01).
added_badwords_ratio = (
    proportion_of_badwords_added / max(proportion_of_badwords, 0.01)
)
added_misspellings_ratio = (
    proportion_of_misspellings_added / max(proportion_of_misspellings, 0.01)
)
added_informals_ratio = (
    proportion_of_informals_added / max(proportion_of_informals, 0.01)
)

damaging = (
    util.no_lang_damaging
    + enwiki.badwords
    + enwiki.informals
    + [
        log(spanish.diff.badwords_added + 1),
        log(spanish.diff.badwords_removed + 1),
        log(spanish.diff.informals_added + 1),
        log(spanish.diff.informals_removed + 1),
        log(spanish.diff.misspellings_added + 1),
        log(spanish.diff.misspellings_removed + 1),
Example #18
0
from revscoring.features import wikitext, modifiers

def _per_content_char(feature):
    """Return ``feature`` divided by the content-char count (at least 1)."""
    return feature / modifiers.max(wikitext.revision.content_chars, 1)


# Each structural count is followed by the same count per content character.
article = [
    wikitext.revision.chars,
    wikitext.revision.content_chars,
    wikitext.revision.ref_tags,
    _per_content_char(wikitext.revision.ref_tags),
    wikitext.revision.wikilinks,
    _per_content_char(wikitext.revision.wikilinks),
    wikitext.revision.external_links,
    _per_content_char(wikitext.revision.external_links),
    wikitext.revision.headings_by_level(2),
    _per_content_char(wikitext.revision.headings_by_level(2)),
    wikitext.revision.headings_by_level(3),
    _per_content_char(wikitext.revision.headings_by_level(3)),
]
Example #19
0
# References
revision = Revision("glwiki.revision.revision",
                    wikitext.revision.datasources)

# Every paragraph-level segment of the revision, converted with ``str``.
paragraphs = mappers.map(
    str, revision.paragraphs_sentences_and_whitespace,
    name="glwiki.revision.paragraphs"
)
# Paragraphs that are not blank and contain no literal "<ref>" anywhere.
paragraphs_without_refs = filters.regex_matching(
    r"^(?!\s*$)((?!<ref>)(.|\n))*$",
    paragraphs,
    name="glwiki.revision.paragraphs_without_refs"
)
# Sum of ``len`` over the paragraphs kept by the filter above.
paragraphs_without_refs_total_length = aggregators.sum(
    mappers.map(len, paragraphs_without_refs),
    name="glwiki.revision.paragraphs_without_refs_total_length"
)

# Wiki-specific features; ratios clamp their denominator to at least 1.
local_wiki = [
    image_links,
    (image_links / max(wikitext.revision.content_chars, 1)),
    category_links,
    (category_links / max(wikitext.revision.content_chars, 1)),
    cn_templates + 1,
    (cn_templates / max(wikitext.revision.content_chars, 1)),
    log(paragraphs_without_refs_total_length + 1),
    (paragraphs_without_refs_total_length /
     max(wikitext.revision.content_chars, 1)),
]

wp10 = wikipedia.article + local_wiki
Example #20
0
English Wikipedia
+++++++++++++++++
"""

from revscoring.features import revision
from revscoring.features.modifiers import log, max
from revscoring.languages import english

from ..features.revision import templates_that_match

# Templates whose name matches "cite" / "infobox".
cite_templates = templates_that_match(
    r"cite", name="enwiki.revision.cite_templates")
infobox_templates = templates_that_match(
    r"infobox", name="enwiki.revision.infobox_templates")

# Citation templates per <ref> tag; the denominator is never below 1.
proportion_of_templated_references = cite_templates / max(revision.ref_tags, 1)

# Name patterns of the "citation needed" templates.
CN_TEMPLATES = [
    r"Citation needed",
    r"Cn",
    r"Fact"
]
cn_templates = templates_that_match("|".join(CN_TEMPLATES),
                                    name="enwiki.revision.cn_templates")

# Registered under its own name: it previously reused
# "enwiki.revision.cn_templates", giving two different features one name.
who_templates = templates_that_match("Who",
                                     name="enwiki.revision.who_templates")

main_article_templates = templates_that_match(
    "Main",
    name="enwiki.main_article_templates")
# NOTE(review): the feature names below carry an "enwiki." prefix although
# ``local_wiki`` uses Dutch language features -- confirm the prefix is intended.
# Every paragraph-level segment of the revision, converted with ``str``.
paragraphs = mappers.map(
    str, wikitext.revision.datasources.paragraphs_sentences_and_whitespace,
    name="enwiki.revision.paragraphs"
)
# Paragraphs that are not blank and contain no literal "<ref>" anywhere.
paragraphs_without_refs = filters.regex_matching(
    r"^(?!\s*$)((?!<ref>)(.|\n))*$", paragraphs,
    name="enwiki.revision.paragraphs_without_refs"
)
# Sum of ``len`` over the paragraphs kept by the filter above.
paragraphs_without_refs_total_length = aggregators.sum(
    mappers.map(len, paragraphs_without_refs),
    name="enwiki.revision.paragraphs_without_refs_total_length"
)

# Wiki-specific features; ratios clamp their denominator to at least 1.
local_wiki = [
    dutch.stemmed.revision.stem_chars,
    dutch.stemmed.revision.stem_chars /
    max(wikitext.revision.content_chars, 1),
    image_links,
    (image_links / max(wikitext.revision.content_chars, 1)),
    category_links,
    (category_links / max(wikitext.revision.content_chars, 1)),
    dutch.dictionary.revision.dict_words,
    (dutch.dictionary.revision.dict_words /
     max(wikitext.revision.words, 1)),
    paragraphs_without_refs_total_length,
    (paragraphs_without_refs_total_length /
     max(wikitext.revision.content_chars, 1)),
    cn_templates,
    (cn_templates / max(wikitext.revision.content_chars, 1)),
]

wp10 = local_wiki + wikipedia.article
Example #22
0
from revscoring.features import wikitext
from revscoring.features.modifiers import max, sub
from revscoring.languages import portuguese

# Character-class counts, followed by each count as a share of all characters
# (the denominator is clamped to at least 1).
char_based = [
    wikitext.revision.chars,
    wikitext.revision.whitespace_chars,
    wikitext.revision.markup_chars,
    wikitext.revision.cjk_chars,
    wikitext.revision.entity_chars,
    wikitext.revision.url_chars,
    wikitext.revision.word_chars,
    wikitext.revision.uppercase_word_chars,
    wikitext.revision.punctuation_chars,
    wikitext.revision.break_chars,
    wikitext.revision.longest_repeated_char,
    (wikitext.revision.whitespace_chars /
     max(wikitext.revision.chars, 1)),
    (wikitext.revision.markup_chars /
     max(wikitext.revision.chars, 1)),
    (wikitext.revision.cjk_chars /
     max(wikitext.revision.chars, 1)),
    (wikitext.revision.entity_chars /
     max(wikitext.revision.chars, 1)),
    (wikitext.revision.url_chars /
     max(wikitext.revision.chars, 1)),
    (wikitext.revision.word_chars /
     max(wikitext.revision.chars, 1)),
    (wikitext.revision.uppercase_word_chars /
     max(wikitext.revision.chars, 1)),
    (wikitext.revision.punctuation_chars /
     max(wikitext.revision.chars, 1)),
    (wikitext.revision.break_chars /
     max(wikitext.revision.chars, 1)),
    (wikitext.revision.longest_repeated_char /
     max(wikitext.revision.chars, 1)),
]

token_based = [
    wikitext.revision.tokens, wikitext.revision.numbers,
    wikitext.revision.whitespaces, wikitext.revision.markups,
    wikitext.revision.cjks, wikitext.revision.entities, wikitext.revision.urls,
    wikitext.revision.words, wikitext.revision.uppercase_words,
    wikitext.revision.punctuations, wikitext.revision.breaks,
    wikitext.revision.longest_token, wikitext.revision.longest_word,
    wikitext.revision.numbers / max(wikitext.revision.tokens, 1),
Example #23
0
from revscoring.features import diff, page, parent_revision, revision, user
from revscoring.features.modifiers import log, max
from revscoring.languages import indonesian

from . import enwiki

# Share of the words added (or removed) by the edit that are badwords,
# misspellings or informals.  Each denominator is clamped to at least 1 so an
# edit that adds (or removes) no words cannot divide by zero.
# The "*_removed" proportions are built from the removed-word features; they
# previously reused the "*_added" features, which made every "removed"
# proportion a duplicate of its "added" counterpart.
proportion_of_badwords_added = (
    indonesian.diff.badwords_added /
    max(indonesian.diff.words_added, 1))
proportion_of_badwords_removed = (
    indonesian.diff.badwords_removed /
    max(indonesian.diff.words_removed, 1))
proportion_of_misspellings_added = (
    indonesian.diff.misspellings_added /
    max(indonesian.diff.words_added, 1))
proportion_of_misspellings_removed = (
    indonesian.diff.misspellings_removed /
    max(indonesian.diff.words_removed, 1))
proportion_of_informals_added = (
    indonesian.diff.informals_added /
    max(indonesian.diff.words_added, 1))
proportion_of_informals_removed = (
    indonesian.diff.informals_removed /
    max(indonesian.diff.words_removed, 1))

# The same proportions measured on the parent revision.
proportion_of_badwords = (
    indonesian.parent_revision.badwords /
    max(indonesian.parent_revision.words, 1))
proportion_of_misspellings = (
    indonesian.parent_revision.misspellings /
    max(indonesian.parent_revision.words, 1))
proportion_of_informals = (
    indonesian.parent_revision.informals /
    max(indonesian.parent_revision.words, 1))

# Added proportion relative to the parent's proportion; the parent's
# proportion is floored at 0.01 to keep the ratio finite.
added_badwords_ratio = (
    proportion_of_badwords_added /
    max(proportion_of_badwords, 0.01))
added_misspellings_ratio = (
    proportion_of_misspellings_added /
    max(proportion_of_misspellings, 0.01))
added_informals_ratio = proportion_of_informals_added / \
Example #24
0
from revscoring.features import wikitext
from revscoring.features.modifiers import log, max, sub
from revscoring.languages import french

from . import wikipedia

# Copied (2015-10-29) from:
# https://fr.wikipedia.org/wiki/Cat%C3%A9gorie:Mod%C3%A8le_pour_bibliographie
# Name patterns of the templates counted as citations.
CITE_TEMPLATES = [
    r"Article",
    r"Chapitre",
    r"Jugement",
    r"Lien[ _]web",
    r"Loi",
    r"Ouvrage",
]
cite_templates = wikitext.revision.template_names_matching(
    "|".join(CITE_TEMPLATES),
    name="frwiki.revision.cite_templates")

# Citation templates per <ref> tag; the denominator is never below 1.
proportion_of_templated_references = (
    cite_templates / max(wikitext.revision.ref_tags, 1))

# <ref> tags not accounted for by a citation template, floored at 0.
non_templated_references = max(wikitext.revision.ref_tags - cite_templates, 0)

# Every template that is not a citation template.
non_cite_templates = sub(
    wikitext.revision.templates, cite_templates,
    name="frwiki.revision.non_cite_templates"
)
infobox_templates = wikitext.revision.template_names_matching(
    r"^infobox",
    name="frwiki.revision.infobox_templates")

# Copied (2015-10-29) from:
# https://fr.wikipedia.org/wiki/Wikip%C3%A9dia:Citez_vos_sources#R.C3.A9clamation_et_contestation_de_sources
LVL1_CN_TEMPLATES = [
    r"Référence[ _]souhaitée",
    r"Citation[ _]nécessaire",
    r"Référence[ _]à[ _]confirmer",
    r"Référence[ _]nécessaire",
    r"Inédit",
]
lvl1_cn_templates = wikitext.revision.template_names_matching(
    "|".join(LVL1_CN_TEMPLATES),
    name="frwiki.revision.lvl1_cn_templates")
Example #25
0
"`int` : A count of all sources which do not come from Wikimedia projects"

unique_sources_count = aggregators.len(unique_sources)
"`int` : A count of unique sources in the revision"

# Status
is_human = wikibase_.revision.has_property_value(
    properties.INSTANCE_OF, items.HUMAN, name=name + '.is_human')
has_birthday = wikibase_.revision.has_property(
    properties.DATE_OF_BIRTH, name='revision.has_birthday')
dead = wikibase_.revision.has_property(
    properties.DATE_OF_DEATH, name='revision.dead')
# Has a date of birth and no date of death.
is_blp = has_birthday.and_(not_(dead))

# Item-level features; each source count is followed by its ratio to the
# number of source claims, with the denominator clamped to at least 1.
local_wiki = [
    is_human,
    is_blp,
    aggregators.len(complete_translations),
    aggregators.len(important_label_translations),
    aggregators.len(important_description_translations),
    aggregators.len(important_complete_translations),
    source_claims_count,
    wikimedia_sources_count,
    (wikimedia_sources_count / modifiers.max(source_claims_count, 1)),
    external_sources_count,
    (external_sources_count / modifiers.max(source_claims_count, 1)),
    unique_sources_count,
    (unique_sources_count / modifiers.max(source_claims_count, 1)),
]

item_quality = wikibase.item + local_wiki
Example #26
0
from revscoring.features.modifiers import log, max
from revscoring.languages import turkish

from . import enwiki, util

# Share of the words added / removed by the edit that are badwords or
# informals; denominators are clamped to at least 1.
proportion_of_badwords_added = (
    turkish.diff.badwords_added / max(turkish.diff.words_added, 1))
proportion_of_badwords_removed = (
    turkish.diff.badwords_removed / max(turkish.diff.words_removed, 1))
proportion_of_informals_added = (
    turkish.diff.informals_added / max(turkish.diff.words_added, 1))
proportion_of_informals_removed = (
    turkish.diff.informals_removed / max(turkish.diff.words_removed, 1))

# The same proportions measured on the parent revision.
proportion_of_badwords = (
    turkish.parent_revision.badwords /
    max(turkish.parent_revision.words, 1))
proportion_of_informals = (
    turkish.parent_revision.informals /
    max(turkish.parent_revision.words, 1))

# Added proportion relative to the parent's proportion (floored at 0.01).
added_badwords_ratio = (
    proportion_of_badwords_added / max(proportion_of_badwords, 0.01))
added_informals_ratio = (
    proportion_of_informals_added / max(proportion_of_informals, 0.01))

damaging = util.no_lang_damaging + enwiki.badwords + enwiki.informals + [
    log(turkish.diff.badwords_added + 1),
    log(turkish.diff.badwords_removed + 1),
    log(turkish.diff.informals_added + 1),
    log(turkish.diff.informals_removed + 1),
    proportion_of_badwords_added,
    proportion_of_badwords_removed,
Example #27
0
from revscoring.features.modifiers import log, max
from revscoring.languages import english

from . import util

# Share of the words added / removed by the edit that are badwords,
# misspellings or informals; denominators are clamped to at least 1.
proportion_of_badwords_added = (
    english.diff.badwords_added / max(english.diff.words_added, 1))
proportion_of_badwords_removed = (
    english.diff.badwords_removed / max(english.diff.words_removed, 1))
proportion_of_misspellings_added = (
    english.diff.misspellings_added / max(english.diff.words_added, 1))
proportion_of_misspellings_removed = (
    english.diff.misspellings_removed / max(english.diff.words_removed, 1))
proportion_of_informals_added = (
    english.diff.informals_added / max(english.diff.words_added, 1))
proportion_of_informals_removed = (
    english.diff.informals_removed / max(english.diff.words_removed, 1))

# The same proportions measured on the parent revision.
proportion_of_badwords = (
    english.parent_revision.badwords /
    max(english.parent_revision.words, 1))
proportion_of_misspellings = (
    english.parent_revision.misspellings /
    max(english.parent_revision.words, 1))
proportion_of_informals = (
    english.parent_revision.informals /
    max(english.parent_revision.words, 1))

# Added proportion relative to the parent's proportion (floored at 0.01).
added_badwords_ratio = (
    proportion_of_badwords_added / max(proportion_of_badwords, 0.01))
added_misspellings_ratio = (
    proportion_of_misspellings_added / max(proportion_of_misspellings, 0.01))
added_informals_ratio = (
    proportion_of_informals_added / max(proportion_of_informals, 0.01))
unique_references_count = aggregators.len(unique_references)
"`int` : A count of unique sources in the revision"

# Status
is_human = wikibase_.revision.has_property_value(
    properties.INSTANCE_OF,
    items.HUMAN,
    name=name + '.revision.is_human')
has_birthday = wikibase_.revision.has_property(
    properties.DATE_OF_BIRTH,
    name=name + '.revision.has_birthday')
dead = wikibase_.revision.has_property(
    properties.DATE_OF_DEATH,
    name=name + '.revision.dead')
# Has a date of birth and no date of death.
is_blp = has_birthday.and_(not_(dead))

# Item-level features; each reference count is followed by its ratio to the
# total reference count, with the denominator clamped to at least 1.
local_wiki = [
    is_human, is_blp,
    aggregators.len(complete_translations),
    aggregators.len(important_label_translations),
    aggregators.len(important_description_translations),
    aggregators.len(important_complete_translations),
    references_count,
    wikimedia_references_count,
    (wikimedia_references_count / modifiers.max(references_count, 1)),
    external_references_count,
    (external_references_count / modifiers.max(references_count, 1)),
    unique_references_count,
    (unique_references_count / modifiers.max(references_count, 1)),
]

item_quality = wikibase.item + local_wiki
Example #29
0
"""

from revscoring.features import wikitext
from revscoring.features.modifiers import max
from revscoring.languages import swedish

from . import wikipedia

# Templates whose names match the Swedish "citation needed" patterns.
cn_templates = wikitext.revision.template_names_matching(
    r"Källa[ _]behövs|Kb",
    name="svwiki.revision.cn_templates",
)

# Links
# NOTE(review): in both patterns below the trailing `\:` belongs only to the
# last alternative, so the earlier alternatives match without a colon --
# confirm this is intended.
# NOTE(review): these feature names lack the "svwiki." prefix that
# `cn_templates` uses -- confirm this is intended.
category_links = wikitext.revision.wikilink_titles_matching(
    r"Category|Kategori\:",
    name="revision.category_links",
)
image_links = wikitext.revision.wikilink_titles_matching(
    r"File|Image|Fil\:",
    name="revision.image_links",
)

# Swedish-specific features.  Every raw count is paired with a scaled
# version: per content character for links and templates, per word for
# dictionary words (denominators floored at 1).
local_wiki = [
    image_links,
    (image_links /
     max(wikitext.revision.content_chars, 1)),
    category_links,
    (category_links /
     max(wikitext.revision.content_chars, 1)),
    swedish.dictionary.revision.dict_words,
    (swedish.dictionary.revision.dict_words /
     max(wikitext.revision.words, 1)),
    cn_templates,
    (cn_templates /
     max(wikitext.revision.content_chars, 1)),
]

# Local features first, then the shared `wikipedia.article` list.
wp10 = local_wiki + wikipedia.article
Example #30
0
"""
French Wikisource
+++++++++++++++++
"""

from revscoring.features import wikitext
from revscoring.features.modifiers import max
from revscoring.languages import french

from . import wikisource

# French-specific features: character counts plus stem/dictionary ratios
# (all denominators floored at 1).
local_wiki = [
    wikitext.revision.chars,
    french.stemmed.revision.stem_chars,
    # Stem characters per character
    (french.stemmed.revision.stem_chars /
     max(wikitext.revision.chars, 1)),
    # Dictionary words per word
    (french.dictionary.revision.dict_words /
     max(wikitext.revision.words, 1)),
    # Dictionary words per non-dictionary word
    (french.dictionary.revision.dict_words /
     max(french.dictionary.revision.non_dict_words, 1)),
]

# Local features first, then the shared `wikisource.page` list.
pagelevel = local_wiki + wikisource.page
Example #31
0
from revscoring.features.modifiers import log, max
from revscoring.languages import french

from . import enwiki, util

# Badword / misspelling counts in the diff, as a share of the words added
# or removed (denominator floored at 1).
proportion_of_badwords_added = (
    french.diff.badwords_added /
    max(french.diff.words_added, 1)
)
proportion_of_badwords_removed = (
    french.diff.badwords_removed /
    max(french.diff.words_removed, 1)
)
proportion_of_misspellings_added = (
    french.diff.misspellings_added /
    max(french.diff.words_added, 1)
)
proportion_of_misspellings_removed = (
    french.diff.misspellings_removed /
    max(french.diff.words_removed, 1)
)

# Per-word rates in the parent revision (denominator floored at 1).
proportion_of_badwords = (
    french.parent_revision.badwords /
    max(french.parent_revision.words, 1)
)
proportion_of_misspellings = (
    french.parent_revision.misspellings /
    max(french.parent_revision.words, 1)
)

# Rate among added words relative to the parent revision's rate; the parent
# rate is floored at 0.01 before dividing.
added_badwords_ratio = (
    proportion_of_badwords_added /
    max(proportion_of_badwords, 0.01)
)
added_misspellings_ratio = (
    proportion_of_misspellings_added /
    max(proportion_of_misspellings, 0.01)
)

damaging = util.no_lang_damaging + enwiki.badwords + enwiki.informals + [
    log(french.diff.badwords_added + 1),
    log(french.diff.badwords_removed + 1),
    log(french.diff.misspellings_added + 1),
    log(french.diff.misspellings_removed + 1),
    proportion_of_badwords_added,
    proportion_of_badwords_removed,
Example #32
0
# <ref name="derp">...</ref> (in another page
# <ref following="derp" name="otherderp">...</ref>)
# TODO

# <big>,<small>,<center>,<div>,<span>,<b>,<i>,<poem>,<section>,''',''
# The HTML-style tags are matched by `good_tags`; the ''' and '' tokens are
# counted separately by `expected_markup`.
good_tags = wikitext.revision.tag_names_matching(
    r"big|small|center|div|span|b|i|poem|section",
    name="wikitext.revision.good_tags",
)
# NOTE(review): the feature name below is spelled "wiktext", unlike
# "wikitext" in `good_tags` -- confirm whether this is a typo before
# renaming, since the name is part of the feature's identity.
expected_markup = aggregators.len(
    wikitext.revision.datasources.tokens_matching(
        r"'''|''"
    ),
    name="wiktext.revision.expected_markup",
)

page = [
    wikitext.revision.chars,
    wikitext.revision.content_chars,
    wikitext.revision.content_chars / max(wikitext.revision.chars, 1),
    wikitext.revision.markup_chars,
    wikitext.revision.markup_chars / max(wikitext.revision.chars, 1),
    wikitext.revision.whitespace_chars,
    wikitext.revision.whitespace_chars / max(wikitext.revision.chars, 1),
    wikitext.revision.entity_chars,
    wikitext.revision.entity_chars / max(wikitext.revision.chars, 1),
    wikitext.revision.punctuation_chars,
    wikitext.revision.punctuation_chars / max(wikitext.revision.chars, 1),
    wikitext.revision.longest_repeated_char,
    wikitext.revision.numbers,
    wikitext.revision.numbers / max(wikitext.revision.words, 1),
    wikitext.revision.uppercase_words,
    wikitext.revision.uppercase_words / max(wikitext.revision.words, 1),
    wikitext.revision.longest_token,
    wikitext.revision.longest_word,
Example #33
0
from . import wikipedia

# Templates
# Each feature is built from template names matching the given pattern.
infobox_templates = wikitext.revision.template_names_matching(
    r"infobox",
    name="enwiki.revision.infobox_templates",
)
# Alternative names for "citation needed" templates, OR-ed into one pattern.
CN_TEMPLATES = [
    r"Citation[_ ]needed",
    r"Cn",
    r"Fact",
]
cn_templates = wikitext.revision.template_names_matching(
    "|".join(CN_TEMPLATES),
    name="enwiki.revision.cn_templates",
)
who_templates = wikitext.revision.template_names_matching(
    "Who",
    name="enwiki.revision.who_templates",
)
# NOTE(review): this name has no ".revision." segment, unlike its
# siblings -- confirm this is intended.
main_article_templates = wikitext.revision.template_names_matching(
    "Main",
    name="enwiki.main_article_templates",
)
cite_templates = wikitext.revision.template_names_matching(
    r"cite",
    name="enwiki.revision.cite_templates",
)
# Cite templates per <ref> tag (denominator floored at 1).
proportion_of_templated_references = (
    cite_templates /
    max(wikitext.revision.ref_tags, 1)
)
# <ref> tags minus cite templates, floored at 0.
non_templated_references = max(
    wikitext.revision.ref_tags - cite_templates,
    0
)
# All templates minus cite templates.
non_cite_templates = sub(
    wikitext.revision.templates,
    cite_templates,
    name="enwiki.revision.non_cite_templates",
)

# Links
# Wikilinks whose titles match the given pattern.
category_links = wikitext.revision.wikilink_titles_matching(
    r"Category\:",
    name="enwiki.revision.category_links",
)
# NOTE(review): the trailing `\:` belongs only to the "Image" alternative,
# so "File" matches without a colon -- confirm this is intended.
image_links = wikitext.revision.wikilink_titles_matching(
    r"File|Image\:",
    name="enwiki.revision.image_links",
)

# References
revision = Revision(
    "enwiki.revision.revision",
    wikitext.revision.datasources,
Example #34
0

def vectorize_words(words):
    """Vectorize `words` using the module-level `enwiki_kvs`.

    Delegates to ``vectorizers.word2vec.vectorize_words`` and returns its
    result unchanged.
    """
    vectors = vectorizers.word2vec.vectorize_words(enwiki_kvs, words)
    return vectors


# Vectors for the revision's words, lower-cased before they are passed to
# `vectorize_words`.
revision_text_vectors = vectorizers.word2vec(
    mappers.lower_case(wikitext.revision.datasources.words),
    vectorize_words,
    name="revision.text.en_vectors",
)

# Mean of the vectors above, requested as a vector via `vector=True`.
w2v = aggregators.mean(
    revision_text_vectors,
    vector=True,
    name="revision.text.en_vectors_mean",
)

# Tokens matching the female / male third-person pronoun patterns.
female_pronouns = wikitext.revision.datasources.tokens_matching(
    r"\b(she|her|hers)\b"
)
male_pronouns = wikitext.revision.datasources.tokens_matching(
    r"\b(he|him|his)\b"
)
female_pronouns_count = aggregators.len(
    female_pronouns
)
male_pronouns_count = aggregators.len(
    male_pronouns
)

# Both counts, their sum, and the female share of the sum
# (denominator floored at 1).
pronoun_features = [
    female_pronouns_count,
    male_pronouns_count,
    female_pronouns_count + male_pronouns_count,
    (female_pronouns_count /
     modifiers.max(female_pronouns_count + male_pronouns_count, 1)),
]

# Mean word vector followed by the pronoun features.
drafttopic = [w2v] + pronoun_features
# `articletopic` is bound to the same list object as `drafttopic`.
articletopic = drafttopic
Example #35
0
from revscoring.features import wikitext, modifiers

# Article-level wikitext features.  From `ref_tags` onward, each raw count
# is followed by the same count divided by `content_chars`
# (denominator floored at 1).
article = [
    wikitext.revision.chars,
    wikitext.revision.content_chars,
    wikitext.revision.ref_tags,
    (wikitext.revision.ref_tags /
     modifiers.max(wikitext.revision.content_chars, 1)),
    wikitext.revision.wikilinks,
    (wikitext.revision.wikilinks /
     modifiers.max(wikitext.revision.content_chars, 1)),
    wikitext.revision.external_links,
    (wikitext.revision.external_links /
     modifiers.max(wikitext.revision.content_chars, 1)),
    # Level-2 headings
    wikitext.revision.headings_by_level(2),
    (wikitext.revision.headings_by_level(2) /
     modifiers.max(wikitext.revision.content_chars, 1)),
    # Level-3 headings
    wikitext.revision.headings_by_level(3),
    (wikitext.revision.headings_by_level(3) /
     modifiers.max(wikitext.revision.content_chars, 1)),
]
Example #36
0
                                    _process_all_sources,
                                    depends_on=[item])
all_sources = aggregators.len(
    all_sources_datasource
)
"`int` : A count of all sources in the revision"

# Derived from `all_sources_datasource` through `_process_wikimedia_sources`.
all_wikimedia_sources_datasource = Datasource(
    name + ".all_wikimedia_sources",
    _process_wikimedia_sources,
    depends_on=[all_sources_datasource],
)
all_wikimedia_sources = aggregators.len(
    all_wikimedia_sources_datasource
)
"`int` : A count of all sources which come from Wikimedia projects in the revision"

# External sources = all sources minus the Wikimedia ones.
all_external_sources = modifiers.sub(
    all_sources,
    all_wikimedia_sources,
)
"A count of all sources which do not come from Wikimedia projects in the revision"

# Denominator is floored at 1.
external_sources_ratio = (
    all_external_sources /
    modifiers.max(wikibase_features.revision.sources, 1)
)
"A ratio/division between number of external references and number of claims that have references in the revision"

unique_sources = Feature(
    name + ".unique_sources",
    _process_unique_sources,
    depends_on=[all_sources_datasource],
    returns=int,
)
"`int` : A count of unique sources in the revision"

# Status
# NOTE(review): `is_human` builds its feature name from the `name` variable,
# while `has_birthday` and `dead` use a literal 'revision.' prefix --
# confirm the difference is intended.
is_human = revision.has_property_value(
    properties.INSTANCE_OF,
    items.HUMAN,
    name=name + '.is_human',
)
has_birthday = revision.has_property(
    properties.DATE_OF_BIRTH,
    name='revision.has_birthday',
)
dead = revision.has_property(
    properties.DATE_OF_DEATH,
    name='revision.dead',
)