from revscoring import Datasource, Feature
from revscoring.datasources import revision_oriented as ro
from revscoring.datasources.meta import dicts, frequencies, indexable, mappers
from revscoring.features import wikitext as wt
from revscoring.features.meta import aggregators, vectorizers

from . import enwiki, mediawiki, wikitext


def process_is_a_translation_page(namespace_id, title):
    """
    True when the page sits in namespace 8 or any namespace above 1200
    and its title contains a subpage separator ("/").
    """
    in_candidate_namespace = namespace_id == 8 or namespace_id > 1200
    return in_candidate_namespace and "/" in title


is_a_translation_page = Feature(
    "revision.page.is_a_translation_page",
    process_is_a_translation_page,
    returns=bool,
    depends_on=[ro.revision.page.namespace.id, ro.revision.page.title])


def process_is_a_default(text):
    """True when the text is exactly the "-" placeholder."""
    is_placeholder = text == "-"
    return is_placeholder


revision_is_a_default = Feature(
    "revision.is_a_default",
    process_is_a_default,
    returns=bool,
    depends_on=[ro.revision.text])

parent_was_a_default = Feature(
    "revision.parent.is_a_default",
    process_is_a_default,
    returns=bool,
    depends_on=[ro.revision.parent.text])
def get_images(strs):
    """
    Count image-like patterns (numbered image/photo parameters and
    File:/Image: prefixes) across the joined input strings.
    """
    combined = "".join(strs)
    hits = re.findall(r"image[1-9]{1,2}|File:|Image:|photo[1-9][a-z]",
                      combined, re.IGNORECASE)
    return len(hits)


image_templates_str = wikitext.revision.datasources.templates_str_matching(
    r"{{(multiple image|image array|gallery|photomontage)",
    name='enwiki.revision.image_templates_str')

images_in_templates = Feature("enwiki.revision.images_in_templates",
                              get_images,
                              depends_on=[image_templates_str],
                              returns=int)

image_tags_str = wikitext.revision.datasources.tags_str_matching(
    r"<(gallery|imagemap)",
    name='enwiki.revision.image_tags_str')

images_in_tags = Feature("enwiki.revision.images_in_tags",
                         get_images,
                         depends_on=[image_tags_str],
                         returns=int)


def get_infobox_images(strs):
    """Count media-file extensions (.jpg, .png, ...) in the joined strings."""
    combined = "".join(strs)
    extension_pattern = r"\.(jpg|jpeg|png|gif|svg|tiff|pdf|ogg|djvu)"
    return len(re.findall(extension_pattern, combined, re.IGNORECASE))
external_identifiers = Datasource(
    name + ".revision.external_identifiers",
    _process_external_identifiers,
    depends_on=[wikibase_.revision.datasources.entity])


def _process_commons_media(entity):
    """Return True if any property on the entity is a Commons-media property."""
    return any(pid in property_datatypes.COMMONS_MEDIA
               for pid in entity.properties.keys())


has_commons_media = Feature(
    name + ".revision.has_commons_media",
    _process_commons_media,
    returns=bool,
    depends_on=[wikibase_.revision.datasources.entity])


def _process_wikimedia_references(references):
    """Filter to references whose property is IMPORTED_FROM_WIKIMEDIA."""
    matching = []
    for reference in references:
        if reference.property == properties.IMPORTED_FROM_WIKIMEDIA:
            matching.append(reference)
    return matching


wikimedia_references = Datasource(
    name + ".revision.wikimedia_references",
    _process_wikimedia_references,
    depends_on=[references])
    # NOTE(review): this chunk opens mid-function -- the `def` line of
    # _process_item_completeness is above this view.  The visible tail
    # computes a rating-weighted completeness ratio.
    # Reduce the dict-like of current properties to a set of IDs for
    # O(1) membership tests below.
    current_properties = set(current_properties.keys())
    all_prob = 0.0
    present_prob = 0.0
    # Sum the recommender ratings over all suggested statements and,
    # separately, over those whose property is already on the item.
    for statement in properties_suggested:
        all_prob += float(statement['rating'])
        if statement['id'] in current_properties:
            present_prob += float(statement['rating'])
    # Weighted fraction of suggestions already satisfied; 0.0 when there
    # were no suggestions (guards against ZeroDivisionError).
    return present_prob / all_prob if all_prob else 0.0


item_completeness = Feature(
    name + '.revision.page.item_completeness',
    _process_item_completeness,
    returns=float,
    depends_on=[
        wikibase_.revision.datasources.properties,
        revision_oriented_datasources.revision.page.suggested.properties
    ])

# Status
is_human = wikibase_.revision.has_property_value(
    properties.INSTANCE_OF, items.HUMAN,
    name=name + '.revision.is_human')
has_birthday = wikibase_.revision.has_property(
    properties.DATE_OF_BIRTH,
    name=name + '.revision.has_birthday')
dead = wikibase_.revision.has_property(
    properties.DATE_OF_DEATH,
    name=name + '.revision.dead')
# Living-person heuristic: has a date of birth and no date of death.
is_blp = has_birthday.and_(not_(dead))
import mwparserfromhell as mwp
from wordlist import WordList
from phraselist import PhraseList
from w2w_feature import W2WFeature

session = Session("https://en.wikipedia.org/w/api.php", user_agent="joe")
extractor = api.Extractor(session)


def puffery(segmentsAdded):
    """
    Weighted count of puffery terms (and their title-cased forms) that
    appear in the parsed added segments.
    """
    weights = WordList()
    weights.parse("pufferyOutput.txt")
    parsed = mwp.parse(segmentsAdded)
    score = 0.0
    for term in weights:
        hits = parsed.count(term) + parsed.count(term.title())
        if hits > 0:
            score += hits * weights[term]
    return score


# Rebinds the name: `puffery` is now the Feature wrapping the function above.
puffery = Feature(
    "puffery",
    puffery,
    depends_on=[wikitext.revision.diff.datasources.segments_added],
    returns=float)

print(extractor.extract(867964616, puffery))
        # NOTE(review): chunk opens mid-function -- the enclosing `def` and
        # the `if` condition are above this view.  This branch left-pads a
        # single-character value to two characters.
        return "0" + last_two
    else:
        return last_two


last_two_in_rev_id = Datasource("revision.last_two_in_rev_id",
                                process_last_two_in_rev_id,
                                depends_on=[revision.id])


def process_reversed_last_two_in_rev_id(last_two):
    # Reverse the two-character string and parse as an int, e.g. "35" -> 53.
    return int("".join(reversed(last_two)))


reversed_last_two_in_rev_id = Feature("revision.reversed_last_two_in_rev_id",
                                      process_reversed_last_two_in_rev_id,
                                      returns=int,
                                      depends_on=[last_two_in_rev_id])


def process_delay():
    # Constant stub: this test scorer has no real delay to report.
    return 0.0


delay = Feature("delay", process_delay, returns=float)


class RevIdScorer(Model):
    """
    Implements a basic, testing scorer that predicts whether a revision
    ID's reversed last two digits are greater than 50.
sentiment_score = Datasource("english.sentiment.revision.polarity_score", get_polarity_score, depends_on=[english.stopwords.revision.datasources.non_stopwords]) # noqa: E501 def get_positive_score(senti_score): return senti_score[0] def get_negative_score(senti_score): return senti_score[1] positive_polarity = Feature( "english.sentiment.revision.positive_polarity", get_positive_score, depends_on=[sentiment_score], returns=float ) negative_polarity = Feature( "english.sentiment.revision.negative_polarity", get_negative_score, depends_on=[sentiment_score], returns=float ) diff_polarity = sub(positive_polarity, negative_polarity, name="english.sentiment.revision.diff_polarity") char_based = [