Esempio n. 1
0
from revscoring import Datasource, Feature
from revscoring.datasources import revision_oriented as ro
from revscoring.datasources.meta import dicts, frequencies, indexable, mappers
from revscoring.features import wikitext as wt
from revscoring.features.meta import aggregators, vectorizers

from . import enwiki, mediawiki, wikitext


def process_is_a_translation_page(namespace_id, title):
    """
    Report whether a page matches the translation-page pattern.

    A page matches when its namespace ID is 8 or greater than 1200 and its
    title contains a "/".  The title is only inspected once the namespace
    check has passed.
    """
    in_matching_namespace = namespace_id == 8 or namespace_id > 1200
    if not in_matching_namespace:
        return False
    return "/" in title


# Boolean feature computed by process_is_a_translation_page from the
# revision's page namespace ID and page title datasources.
is_a_translation_page = Feature(
    "revision.page.is_a_translation_page", process_is_a_translation_page,
    returns=bool,
    depends_on=[ro.revision.page.namespace.id,
                ro.revision.page.title])


def process_is_a_default(text):
    """
    Report whether `text` is exactly the single-hyphen string "-".
    """
    default_marker = "-"
    return text == default_marker


# Boolean feature: process_is_a_default applied to the revision's own text.
revision_is_a_default = Feature(
    "revision.is_a_default", process_is_a_default,
    returns=bool, depends_on=[ro.revision.text])

# Boolean feature: the same check applied to the parent revision's text.
parent_was_a_default = Feature(
    "revision.parent.is_a_default", process_is_a_default,
    returns=bool, depends_on=[ro.revision.parent.text])
Esempio n. 2
0
def get_images(strs):
    """
    Count image-like patterns in a list of tag or template strings.

    The strings are concatenated without a separator and scanned
    case-insensitively; the number of non-overlapping matches is returned.
    """
    image_pattern = r"image[1-9]{1,2}|File:|Image:|photo[1-9][a-z]"
    combined = "".join(strs)
    return sum(1 for _ in re.finditer(image_pattern, combined, re.I))


# Datasource built via templates_str_matching with a pattern for the opening
# of multiple image / image array / gallery / photomontage templates.
image_templates_str = wikitext.revision.datasources.templates_str_matching(
    r"{{(multiple image|image array|gallery|photomontage)",
    name='enwiki.revision.image_templates_str')

# Integer feature: get_images applied to the template strings above.
images_in_templates = Feature("enwiki.revision.images_in_templates",
                              get_images,
                              depends_on=[image_templates_str],
                              returns=int)

# Datasource built via tags_str_matching with a pattern for the opening of
# <gallery> and <imagemap> tags.
image_tags_str = wikitext.revision.datasources.tags_str_matching(
    r"<(gallery|imagemap)", name='enwiki.revision.image_tags_str')

# Integer feature: get_images applied to the tag strings above.
images_in_tags = Feature("enwiki.revision.images_in_tags",
                         get_images,
                         depends_on=[image_tags_str],
                         returns=int)


def get_infobox_images(strs):
    """
    Count media-file extensions appearing in a list of strings.

    The strings are concatenated without a separator and scanned
    case-insensitively for a "." followed by one of the listed extensions;
    the number of non-overlapping matches is returned.
    """
    extension_pattern = r"\.(jpg|jpeg|png|gif|svg|tiff|pdf|ogg|djvu)"
    combined = "".join(strs)
    return sum(1 for _ in re.finditer(extension_pattern, combined, re.I))
Esempio n. 3
0
# Datasource produced by _process_external_identifiers from the revision's
# wikibase entity.
external_identifiers = Datasource(
    name + ".revision.external_identifiers",
    _process_external_identifiers,
    depends_on=[wikibase_.revision.datasources.entity])


def _process_commons_media(entity):
    """
    Report whether any property ID of `entity` is listed in
    `property_datatypes.COMMONS_MEDIA`.
    """
    return any(pid in property_datatypes.COMMONS_MEDIA
               for pid in entity.properties.keys())


# Boolean feature: _process_commons_media applied to the revision's entity.
has_commons_media = Feature(name + ".revision.has_commons_media",
                            _process_commons_media,
                            returns=bool,
                            depends_on=[wikibase_.revision.datasources.entity])


def _process_wikimedia_references(references):
    """
    Return the references whose `property` equals
    `properties.IMPORTED_FROM_WIKIMEDIA`, in their original order.
    """
    imported = []
    for reference in references:
        if reference.property == properties.IMPORTED_FROM_WIKIMEDIA:
            imported.append(reference)
    return imported


# Datasource: the subset of `references` kept by
# _process_wikimedia_references.
wikimedia_references = Datasource(name + ".revision.wikimedia_references",
                                  _process_wikimedia_references,
                                  depends_on=[references])

Esempio n. 4
0
    current_properties = set(current_properties.keys())

    all_prob = 0.0
    present_prob = 0.0
    for statement in properties_suggested:
        all_prob += float(statement['rating'])
        if statement['id'] in current_properties:
            present_prob += float(statement['rating'])

    return present_prob / all_prob if all_prob else 0.0


# Float feature computed by _process_item_completeness from the revision's
# current properties and the properties suggested for the page.
item_completeness = Feature(
    name + '.revision.page.item_completeness',
    _process_item_completeness,
    returns=float,
    depends_on=[
        wikibase_.revision.datasources.properties,
        revision_oriented_datasources.revision.page.suggested.properties
    ])

# Status
# True when the item has INSTANCE_OF set to HUMAN.
is_human = wikibase_.revision.has_property_value(properties.INSTANCE_OF,
                                                 items.HUMAN,
                                                 name=name +
                                                 '.revision.is_human')
# True when the item has a DATE_OF_BIRTH property.
has_birthday = wikibase_.revision.has_property(properties.DATE_OF_BIRTH,
                                               name=name +
                                               '.revision.has_birthday')
# True when the item has a DATE_OF_DEATH property.
dead = wikibase_.revision.has_property(properties.DATE_OF_DEATH,
                                       name=name + '.revision.dead')
# Has a birth date and no death date.
# NOTE(review): "blp" presumably means "biography of a living person";
# confirm.  The check does not involve is_human.
is_blp = has_birthday.and_(not_(dead))
Esempio n. 5
0
import mwparserfromhell as mwp

from wordlist import WordList
from phraselist import PhraseList
from w2w_feature import W2WFeature

# Module-level API session pointed at English Wikipedia, and the extractor
# built on it; both are created at import time.
session = Session("https://en.wikipedia.org/w/api.php", user_agent="joe")
extractor = api.Extractor(session)


def puffery(segmentsAdded):
    """
    Score added segments against the puffery word list.

    The word list is parsed from "pufferyOutput.txt" on every call.  For
    each entry, occurrences of the entry and of its title-cased form in the
    parsed segments are counted; when that count is positive, the count
    multiplied by the value the word list stores for the entry is added to
    the score.
    """
    weights = WordList()
    weights.parse("pufferyOutput.txt")
    parsed = mwp.parse(segmentsAdded)
    score = 0.0
    for word in weights:
        occurrences = parsed.count(word) + parsed.count(word.title())
        if occurrences > 0:
            score += occurrences * weights[word]
    return score


# NOTE: this rebinds the module-level name `puffery` from the scoring
# function to the Feature that wraps it; after this statement the plain
# function is reachable only through the Feature.
puffery = Feature(
    "puffery",
    puffery,
    depends_on=[wikitext.revision.diff.datasources.segments_added],
    returns=float)

# Runs at import time: asks the extractor for the puffery feature of
# 867964616 (presumably a revision ID -- confirm) and prints the result.
print(extractor.extract(867964616, puffery))
Esempio n. 6
0
        return "0" + last_two
    else:
        return last_two


# Datasource produced by process_last_two_in_rev_id from the revision ID.
last_two_in_rev_id = Datasource("revision.last_two_in_rev_id",
                                process_last_two_in_rev_id,
                                depends_on=[revision.id])


def process_reversed_last_two_in_rev_id(last_two):
    """
    Reverse the characters of `last_two` and return the result as an int.
    """
    flipped = ""
    for char in last_two:
        # Prepending each character builds the string back to front.
        flipped = char + flipped
    return int(flipped)


# Integer feature: process_reversed_last_two_in_rev_id applied to the
# last_two_in_rev_id datasource.
reversed_last_two_in_rev_id = Feature("revision.reversed_last_two_in_rev_id",
                                      process_reversed_last_two_in_rev_id,
                                      returns=int,
                                      depends_on=[last_two_in_rev_id])


def process_delay():
    """
    Return the constant delay value, 0.0.
    """
    no_delay = 0.0
    return no_delay


# Float feature with no dependencies; process_delay always yields 0.0.
delay = Feature("delay", process_delay, returns=float)


class RevIdScorer(Model):
    """
    Implements a basic, testing scorer that predicts whether a revision ID's
    reversed last two digits are greater than 50.
Esempio n. 7
0
# Datasource produced by get_polarity_score from the revision's English
# non-stopwords.
sentiment_score = Datasource("english.sentiment.revision.polarity_score",
                             get_polarity_score,
                             depends_on=[english.stopwords.revision.datasources.non_stopwords])  # noqa: E501


def get_positive_score(senti_score):
    """
    Return the element at index 0 of `senti_score`.

    NOTE(review): presumably the positive component of the polarity score;
    confirm against the function that produces it.
    """
    positive = senti_score[0]
    return positive


def get_negative_score(senti_score):
    """
    Return the element at index 1 of `senti_score`.

    NOTE(review): presumably the negative component of the polarity score;
    confirm against the function that produces it.
    """
    negative = senti_score[1]
    return negative


# Float feature: element 0 of the sentiment_score datasource.
positive_polarity = Feature(
    "english.sentiment.revision.positive_polarity",
    get_positive_score,
    depends_on=[sentiment_score],
    returns=float
)

# Float feature: element 1 of the sentiment_score datasource.
negative_polarity = Feature(
    "english.sentiment.revision.negative_polarity",
    get_negative_score,
    depends_on=[sentiment_score],
    returns=float
)

# Feature built by `sub` from the positive and negative polarity features
# (presumably positive minus negative -- confirm against `sub`).
diff_polarity = sub(positive_polarity,
                    negative_polarity,
                    name="english.sentiment.revision.diff_polarity")

char_based = [