def test_scoring_context():
    from collections import defaultdict, namedtuple

    from nose.tools import eq_

    from revscoring import dependencies
    from revscoring.datasources import Datasource
    from revscoring.dependencies import Dependent
    from revscoring.features import Feature

    fake_data = Datasource("fake_data", lambda: "fake")
    len_func = Dependent("len_func")
    literal_fake = Dependent("literal_fake")

    characters = Feature("characters", lambda word, len: len(word),
                         returns=int,
                         depends_on=[fake_data, len_func])
    is_fake = Feature("is_fake", lambda word, fake: word == fake,
                      returns=bool,
                      depends_on=[fake_data, literal_fake])

    FakeExtractor = namedtuple("Extractor", ['extract', 'solve', 'language'])

    def fake_extract(rev_ids, dependents, caches=None):
        caches = caches or defaultdict(dict)
        for rev_id in rev_ids:
            cache = caches[rev_id]
            if rev_id % 5 != 0:
                values = dependencies.solve(
                    dependents,
                    context={len_func: lambda: len},
                    cache=cache)
                yield None, list(values)
            else:
                yield RuntimeError("extract"), None

    def fake_solve(dependents, cache=None):
        cache = cache or {}
        cache.update({len_func: len,
                      literal_fake: "fake"})
        return dependencies.solve(dependents, cache=cache)

    extractor = FakeExtractor(fake_extract, fake_solve, None)

    FakeScorerModel = namedtuple("FakeScorerModel",
                                 ['score', 'version', 'language', 'features'])
    scorer_model = FakeScorerModel(lambda fvs: {"prediction": "generated"},
                                   "1", None, [characters, is_fake])

    scoring_context = ScoringContext("fakewiki", {"fake": scorer_model},
                                     extractor)

    rev_ids = [1, 2, 3, 4, 5]
    root_ds_caches = scoring_context.extract_roots("fake", rev_ids)
    eq_(len(root_ds_caches), 5)
    # Successful extractions yield (None, cache) pairs.
    eq_(root_ds_caches[1][1][fake_data], "fake")
    # rev_id 5 triggers the fake extractor's RuntimeError.
    assert root_ds_caches[5][0] is not None

    score, feature_vals = scoring_context.score(
        "fake", {characters: 10, is_fake: False})
    eq_(score['prediction'], "generated")
def test_score():
    skc = FakeIdentityClassifier([Feature("foo")], [True, False],
                                 version="0.0.1")
    docs = skc.score_many([cv_feature_values[0][0]])
    assert len(docs) == 1

    skc = FakeIdentityProbabilityClassifier([Feature("foo")], [True, False],
                                            version="0.0.1")
    docs = skc.score_many([cv_feature_values[0][0]])
    assert len(docs) == 1
    assert 'probability' in docs[0]
def test_score_many():
    skc = FakeIdentityClassifier([Feature("foo")], [True, False],
                                 version="0.0.1")
    features, labels = zip(*cv_feature_values)
    docs = skc.score_many(features)
    assert len(docs) == 10

    skc = FakeIdentityProbabilityClassifier([Feature("foo")], [True, False],
                                            version="0.0.1")
    features, labels = zip(*cv_feature_values)
    docs = skc.score_many(features)
    assert len(docs) == 10
    assert 'probability' in docs[0]
def test_sklearn_classifier():
    skc = FakeIdentityClassifier(
        [Feature("foo", returns=int)],
        [True, False],
        version="0.0.1")

    assert skc.version == "0.0.1"

    stats = skc.cross_validate(cv_feature_values, folds=2)
    assert (stats['counts']['predictions'] ==
            {True: {False: 0, True: 5},
             False: {False: 5, True: 0}})
def test_sklearn_probabilityclassifier():
    skc = FakeIdentityProbabilityClassifier(
        [Feature("foo", returns=int)],
        [True, False],
        version="0.0.1")

    assert skc.version == "0.0.1"

    stats = skc.cross_validate(cv_feature_values, folds=2)
    assert (stats['counts']['predictions'] ==
            {True: {False: 0, True: 5},
             False: {False: 5, True: 0}})

    assert (skc.info['score_schema']['properties']['prediction']['type'] ==
            "boolean")
def test_sklearn_probabilityclassifier_multilabel():
    skc = FakeIdentityProbabilityClassifierMultilabel(
        [Feature("foo", returns=int)],
        ["A", "B"],
        multilabel=True,
        version="0.0.1",
        label_weights={"A": 5, "B": 0.5})

    expected_estimator_params = {
        'class_weight': [{0: 1, 1: 5}, {0: 1, 1: 0.5}]}
    expected_counts = {"A": {True: {True: 3, False: 3},
                             False: {True: 2, False: 2}},
                       "B": {True: {True: 3, False: 3},
                             False: {True: 2, False: 2}}}

    stats = skc.cross_validate(cv_feature_values_multilabel, folds=2)
    assert skc.estimator_params == expected_estimator_params
    assert stats['counts']['predictions'] == expected_counts
def test_sklearn_classifier_multilabel():
    skc = FakeIdentityClassifierMultilabel(
        [Feature("foo")],
        ["A", "B"],
        multilabel=True,
        version="0.0.1",
        label_weights={"A": 5, "B": 0.5})

    expected_estimator_params = {
        'class_weight': [{0: 1, 1: 5}, {0: 1, 1: 0.5}]}
    expected_counts = {"A": {True: {True: 3, False: 3},
                             False: {True: 2, False: 2}},
                       "B": {True: {True: 3, False: 3},
                             False: {True: 2, False: 2}}}

    stats = skc.cross_validate(cv_feature_values_multilabel, folds=2)
    assert skc.estimator_params == expected_estimator_params
    assert stats['counts']['predictions'] == expected_counts
    assert (skc.info['score_schema']['properties']['prediction']['type'] ==
            "array")
def test_sklearn_format_error():
    with raises(ValueError):
        skc = FakeIdentityClassifier([Feature("foo")], [True, False],
                                     version="0.0.1")
        skc.info.format(formatting="foo")
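# Fragment of a diff feature list: each sub() feature takes the difference
# between a count on the current revision and the same count on its parent,
# and the two Feature entries compare longest-token statistics across the
# parent/current pair via _process_new_longest.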
    sub(wikitext.revision.headings,
        wikitext.revision.parent.headings,
        name="revision.diff.headings_change"),
    sub(wikitext.revision.external_links,
        wikitext.revision.parent.external_links,
        name="revision.diff.external_links_change"),
    sub(wikitext.revision.wikilinks,
        wikitext.revision.parent.wikilinks,
        name="revision.diff.wikilinks_change"),
    sub(wikitext.revision.templates,
        wikitext.revision.parent.templates,
        name="revision.diff.templates_change"),
    sub(wikitext.revision.ref_tags,
        wikitext.revision.parent.ref_tags,
        name="revision.diff.ref_tags_change"),
    Feature("revision.diff.longest_new_token",
            _process_new_longest,
            returns=int,
            depends_on=[wikitext.revision.parent.longest_token,
                        wikitext.revision.longest_token]),
    Feature("revision.diff.longest_new_repeated_char",
            _process_new_longest,
            returns=int,
            depends_on=[wikitext.revision.parent.longest_repeated_char,
                        wikitext.revision.longest_repeated_char])
]
def test_scoring_context():
    from collections import namedtuple

    from revscoring import dependencies
    from revscoring.datasources import Datasource
    from revscoring.dependencies import Dependent
    from revscoring.features import Feature

    fake_data = Datasource("fake_data", lambda: "fake")
    len_func = Dependent("len_func")
    literal_fake = Dependent("literal_fake")

    characters = Feature("characters", lambda word, len: len(word),
                         returns=int,
                         depends_on=[fake_data, len_func])
    is_fake = Feature("is_fake", lambda word, fake: word == fake,
                      returns=bool,
                      depends_on=[fake_data, literal_fake])

    FakeExtractor = namedtuple("Extractor", ['extract', 'solve', 'language'])

    def fake_extract(rev_ids, dependents, caches=None):
        caches = caches if caches is not None else {}
        for rev_id in rev_ids:
            if rev_id % 5 != 0:
                cache = caches.get(rev_id, {})
                values = dependencies.solve(
                    dependents,
                    context={len_func: lambda: len},
                    cache=cache)
                values = list(values)
                caches[rev_id] = cache
                yield None, values
            else:
                yield RuntimeError("extract"), None

    def fake_solve(dependents, cache=None):
        cache = cache if cache is not None else {}
        cache.update({len_func: len,
                      literal_fake: "fake"})
        return dependencies.solve(dependents, cache=cache)

    extractor = FakeExtractor(fake_extract, fake_solve, None)

    FakeScorerModel = namedtuple("FakeScorerModel",
                                 ['score', 'version', 'language', 'features'])
    scorer_model = FakeScorerModel(lambda fvs: {"prediction": "generated"},
                                   "1", None, [characters, is_fake])

    scoring_context = ScoringContext("fakewiki", {"fake": scorer_model},
                                     extractor)

    rev_ids = [1, 2, 3, 4, 5]
    root_ds_caches, errors = scoring_context.extract_root_dependency_caches(
        ["fake"], rev_ids)
    # Every fifth rev_id fails in fake_extract, so four caches and one error.
    assert len(root_ds_caches) == 4
    assert len(errors) == 1
    assert root_ds_caches[1][fake_data] == "fake"
    assert 5 in errors

    score = scoring_context.process_model_scores(
        ["fake"], {characters: 10, is_fake: False})
    assert score['fake']['score']['prediction'] == "generated"
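# A fake model whose single feature sleeps for the requested number of
# seconds, handy for exercising timeout handling in the scoring system
# without depending on real extraction work.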
import time

from nose.tools import eq_, nottest

from revscoring import Extractor, Model
from revscoring.features import Feature
from revscoring.scoring import ModelInfo

from ...score_request import ScoreRequest
from ...scoring_context import ScoringContext

wait_time = Feature("wait_time", returns=float)


def process_wait(wait_time):
    time.sleep(wait_time)
    return wait_time


wait = Feature("wait", process=process_wait, returns=float,
               depends_on=[wait_time])


class FakeSM(Model):

    def __init__(self):
        self.features = [wait]
        self.version = "fake version"
        self.info = ModelInfo()
        self.info['version'] = self.version
def test_model():
    m = Model([Feature("foo")], version="0.0.1")
    assert m.info.lookup('version') == "0.0.1"
def test_classifier():
    model = Classifier([Feature("foo")], [True, False])
    assert 'statistics' not in model.info
def test_learned_model():
    model = Learned([Feature("foo")])
    assert model.trained is None
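# Fixtures for model tests: Feature and FeatureVector instances with
# trivial process functions. The BytesIO and json imports below suggest
# these are used to exercise model dump/load round-trips.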
import json
import random
from io import BytesIO
from itertools import chain

from pytest import mark

from revscoring.features import Feature, FeatureVector
from revscoring.scoring.models.model import Model


def process_float():
    return float()


# Pass the process functions themselves, not the result of calling them.
some_float = Feature("some_float", process_float,
                     depends_on=[], returns=float)


def process_other_float():
    return float()


other_float = Feature("other_float", process_other_float,
                      depends_on=[], returns=float)


def process_float_vector():
    return [float(), float(), float()]


float_vector = FeatureVector("float_vector", process_float_vector,
                             depends_on=[], returns=float)
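# A self-contained scorer for testing: its prediction depends only on the
# revision ID, so it needs no trained estimator and scores reproducibly.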
from revscoring import ScorerModel
from revscoring.datasources.revision_oriented import revision
from revscoring.features import Feature


def process_reversed_last_two_in_rev_id(rev_id):
    last_two = str(rev_id)[-2:]
    if len(last_two) == 1:
        return int(last_two + "0")
    else:
        return int("".join(reversed(last_two)))


reversed_last_two_in_rev_id = Feature(
    "revision.reversed_last_two_in_rev_id",
    process_reversed_last_two_in_rev_id,
    returns=int,
    depends_on=[revision.id])


def process_delay():
    return 0.0


delay = Feature("delay", process_delay, returns=float)


class RevIdScorer(ScorerModel):
    """
    Implements a basic, testing scorer that predicts whether a revision
    ID's reversed last two digits are greater than 50.