Esempio n. 1
0
def test_scoring_context():
    """Drive ``ScoringContext`` root extraction and scoring through fakes.

    One namedtuple stands in for the extractor and another for the scorer
    model.  The fake extractor yields an error for every revision ID that is
    a multiple of 5 and solved values for all other IDs.
    """
    from revscoring.datasources import Datasource
    from revscoring.dependencies import Dependent
    from revscoring.features import Feature

    # Two features share one datasource; each also depends on a bare
    # Dependent that is declared with a name only.
    fake_data = Datasource("fake_data", lambda: "fake")
    len_func = Dependent("len_func")
    literal_fake = Dependent("literal_fake")
    characters = Feature(
        "characters",
        lambda word, len: len(word),
        returns=int,
        depends_on=[fake_data, len_func],
    )
    is_fake = Feature(
        "is_fake",
        lambda word, fake: word == fake,
        returns=bool,
        depends_on=[fake_data, literal_fake],
    )

    def fake_extract(rev_ids, dependents, caches=None):
        # NOTE: `or` also swaps out an empty mapping passed by the caller.
        caches = caches or defaultdict(dict)
        for rev_id in rev_ids:
            rev_cache = caches[rev_id]
            if rev_id % 5 == 0:
                # Multiples of 5 produce an (error, None) pair.
                yield RuntimeError("extract"), None
                continue
            solved = dependencies.solve(
                dependents,
                context={len_func: lambda: len},
                cache=rev_cache)
            yield None, list(solved)

    def fake_solve(dependents, cache=None):
        cache = cache or {}
        # Pre-seed the two bare dependents so solving can complete.
        cache[len_func] = len
        cache[literal_fake] = "fake"
        return dependencies.solve(dependents, cache=cache)

    FakeExtractor = namedtuple("Extractor", ['extract', 'solve', 'language'])
    extractor = FakeExtractor(
        extract=fake_extract, solve=fake_solve, language=None)

    FakeScorerModel = namedtuple(
        "FakeScorerModel", ['score', 'version', 'language', 'features'])
    scorer_model = FakeScorerModel(
        score=lambda fvs: {"prediction": "generated"},
        version="1",
        language=None,
        features=[characters, is_fake])

    scoring_context = ScoringContext(
        "fakewiki", {"fake": scorer_model}, extractor)

    root_ds_caches = scoring_context.extract_roots("fake", [1, 2, 3, 4, 5])
    eq_(len(root_ds_caches), 5)
    eq_(root_ds_caches[1][1][fake_data], "fake")
    # Revision 5 is the only multiple of 5, so its first slot holds an error.
    assert root_ds_caches[5][0] is not None

    feature_values = {characters: 10, is_fake: False}
    score, _ = scoring_context.score("fake", feature_values)
    eq_(score['prediction'], "generated")
Esempio n. 2
0
def test_score():
    """Scoring a one-item batch yields exactly one score document.

    ``score_many`` is called with a single entry for both the plain and the
    probability fake classifier; the latter's document must also contain a
    ``'probability'`` key.
    """
    first_features = cv_feature_values[0][0]

    plain = FakeIdentityClassifier(
        [Feature("foo")], [True, False], version="0.0.1")
    plain_docs = plain.score_many([first_features])
    assert len(plain_docs) == 1

    probabilistic = FakeIdentityProbabilityClassifier(
        [Feature("foo")], [True, False], version="0.0.1")
    prob_docs = probabilistic.score_many([first_features])
    assert len(prob_docs) == 1
    assert 'probability' in prob_docs[0]
Esempio n. 3
0
def test_score_many():
    """Scoring all feature rows yields ten documents for each classifier.

    The feature rows are the first column of ``cv_feature_values``; the
    probability classifier's documents must contain a ``'probability'`` key.
    """
    plain = FakeIdentityClassifier(
        [Feature("foo")], [True, False], version="0.0.1")
    features, _ = zip(*cv_feature_values)
    plain_docs = plain.score_many(features)
    assert len(plain_docs) == 10

    probabilistic = FakeIdentityProbabilityClassifier(
        [Feature("foo")], [True, False], version="0.0.1")
    features, _ = zip(*cv_feature_values)
    prob_docs = probabilistic.score_many(features)
    assert len(prob_docs) == 10
    assert 'probability' in prob_docs[0]
Esempio n. 4
0
def test_sklearn_classifier():
    """Check the version and 2-fold cross-validation prediction counts."""
    classifier = FakeIdentityClassifier(
        [Feature("foo", returns=int)], [True, False], version="0.0.1")
    assert classifier.version == "0.0.1"

    stats = classifier.cross_validate(cv_feature_values, folds=2)
    predictions = stats['counts']['predictions']
    # Five observations per label, none counted under the other label.
    expected_predictions = {
        True: {True: 5, False: 0},
        False: {True: 0, False: 5},
    }
    assert predictions == expected_predictions
Esempio n. 5
0
def test_sklearn_probabilityclassifier():
    """Check version, cross-validation counts and the prediction schema type.

    After 2-fold cross-validation the prediction counts must form a perfect
    diagonal, and the score schema must declare the prediction as
    ``"boolean"``.
    """
    classifier = FakeIdentityProbabilityClassifier(
        [Feature("foo", returns=int)], [True, False], version="0.0.1")
    assert classifier.version == "0.0.1"

    stats = classifier.cross_validate(cv_feature_values, folds=2)
    predictions = stats['counts']['predictions']
    expected_predictions = {
        True: {True: 5, False: 0},
        False: {True: 0, False: 5},
    }
    assert predictions == expected_predictions

    prediction_schema = (
        classifier.info['score_schema']['properties']['prediction'])
    assert prediction_schema['type'] == "boolean"
Esempio n. 6
0
def test_sklearn_probabilityclassifier_multilabel():
    """Check estimator params and per-label counts for a multilabel model.

    The classifier is built with label weights ``A=5`` and ``B=0.5``; after
    2-fold cross-validation the estimator's ``class_weight`` list and the
    prediction counts for both labels are compared with fixed expectations.
    """
    skc = FakeIdentityProbabilityClassifierMultilabel(
        [Feature("foo", returns=int)],
        ["A", "B"],
        multilabel=True,
        version="0.0.1",
        label_weights={"A": 5, "B": 0.5},
    )

    # One {0: 1, 1: weight} mapping per label, in label order.
    expected_estimator_params = {
        'class_weight': [{0: 1, 1: weight} for weight in (5, 0.5)],
    }
    # Both labels are expected to show the same count table.
    expected_counts = {
        label: {True: {True: 3, False: 3}, False: {True: 2, False: 2}}
        for label in ("A", "B")
    }

    stats = skc.cross_validate(cv_feature_values_multilabel, folds=2)
    assert expected_estimator_params == skc.estimator_params
    assert expected_counts == stats['counts']['predictions']
    assert skc.estimator_params == expected_estimator_params
Esempio n. 7
0
def test_sklearn_classifier_multilabel():
    """Check params, counts and schema type for a multilabel classifier.

    The classifier is built with label weights ``A=5`` and ``B=0.5``.  After
    2-fold cross-validation the estimator's ``class_weight`` list and the
    per-label prediction counts are compared with fixed expectations, and the
    score schema must declare the prediction as ``"array"``.
    """
    skc = FakeIdentityClassifierMultilabel(
        [Feature("foo")], ["A", "B"],
        multilabel=True, version="0.0.1",
        label_weights={"A": 5, "B": 0.5})

    expected_estimator_params = {
        'class_weight': [{0: 1, 1: 5}, {0: 1, 1: 0.5}],
    }
    expected_counts = {
        "A": {True: {True: 3, False: 3}, False: {True: 2, False: 2}},
        "B": {True: {True: 3, False: 3}, False: {True: 2, False: 2}},
    }

    stats = skc.cross_validate(cv_feature_values_multilabel, folds=2)
    assert expected_estimator_params == skc.estimator_params
    assert expected_counts == stats['counts']['predictions']
    assert skc.estimator_params == expected_estimator_params

    prediction_schema = skc.info['score_schema']['properties']['prediction']
    assert prediction_schema['type'] == "array"
Esempio n. 8
0
def test_sklearn_format_error():
    """``info.format(formatting="foo")`` must raise ``ValueError``."""
    skc = FakeIdentityClassifier([Feature("foo")], [True, False],
                                 version="0.0.1")
    # Only the format call sits inside the context manager, so a ValueError
    # raised while constructing the classifier cannot satisfy the test.
    with raises(ValueError):
        skc.info.format(formatting="foo")
Esempio n. 9
0
    sub(wikitext.revision.headings,
        wikitext.revision.parent.headings,
        name="revision.diff.headings_change"),
    sub(wikitext.revision.external_links,
        wikitext.revision.parent.external_links,
        name="revision.diff.external_links_change"),
    sub(wikitext.revision.wikilinks,
        wikitext.revision.parent.wikilinks,
        name="revision.diff.wikilinks_change"),
    sub(wikitext.revision.templates,
        wikitext.revision.parent.templates,
        name="revision.diff.templates_change"),
    sub(wikitext.revision.ref_tags,
        wikitext.revision.parent.ref_tags,
        name="revision.diff.ref_tags_change"),
    Feature("revision.diff.longest_new_token",
            _process_new_longest,
            returns=int,
            depends_on=[
                wikitext.revision.parent.longest_token,
                wikitext.revision.longest_token
            ]),
    Feature("revision.diff.longest_new_repeated_char",
            _process_new_longest,
            returns=int,
            depends_on=[
                wikitext.revision.parent.longest_repeated_char,
                wikitext.revision.longest_repeated_char
            ])
]
Esempio n. 10
0
def test_scoring_context():
    """Drive ``ScoringContext`` cache extraction and scoring through fakes.

    One namedtuple stands in for the extractor and another for the scorer
    model.  The fake extractor yields an error for every revision ID that is
    a multiple of 5; for all other IDs it solves the dependents and stores
    the per-revision cache back into ``caches``.
    """
    from revscoring.datasources import Datasource
    from revscoring.dependencies import Dependent
    from revscoring.features import Feature

    # Two features share one datasource; each also depends on a bare
    # Dependent that is declared with a name only.
    fake_data = Datasource("fake_data", lambda: "fake")
    len_func = Dependent("len_func")
    literal_fake = Dependent("literal_fake")
    characters = Feature(
        "characters",
        lambda word, len: len(word),
        returns=int,
        depends_on=[fake_data, len_func],
    )
    is_fake = Feature(
        "is_fake",
        lambda word, fake: word == fake,
        returns=bool,
        depends_on=[fake_data, literal_fake],
    )

    def fake_extract(rev_ids, dependents, caches=None):
        if caches is None:
            caches = {}
        for rev_id in rev_ids:
            if rev_id % 5 == 0:
                # Multiples of 5 produce an (error, None) pair.
                yield RuntimeError("extract"), None
                continue
            rev_cache = caches.get(rev_id, {})
            solved = list(dependencies.solve(
                dependents,
                context={len_func: lambda: len},
                cache=rev_cache))
            # Store the cache only after the values have been materialised.
            caches[rev_id] = rev_cache
            yield None, solved

    def fake_solve(dependents, cache=None):
        if cache is None:
            cache = {}
        # Pre-seed the two bare dependents so solving can complete.
        cache[len_func] = len
        cache[literal_fake] = "fake"
        return dependencies.solve(dependents, cache=cache)

    FakeExtractor = namedtuple("Extractor", ['extract', 'solve', 'language'])
    extractor = FakeExtractor(
        extract=fake_extract, solve=fake_solve, language=None)

    FakeScorerModel = namedtuple(
        "FakeScorerModel", ['score', 'version', 'language', 'features'])
    scorer_model = FakeScorerModel(
        score=lambda fvs: {"prediction": "generated"},
        version="1",
        language=None,
        features=[characters, is_fake])

    scoring_context = ScoringContext(
        "fakewiki", {"fake": scorer_model}, extractor)

    root_ds_caches, errors = scoring_context.extract_root_dependency_caches(
        ["fake"], [1, 2, 3, 4, 5])
    print(root_ds_caches)
    print(errors)
    # Four revisions succeed; revision 5 is the single error.
    assert len(root_ds_caches) == 4
    assert len(errors) == 1
    assert root_ds_caches[1][fake_data] == "fake"
    assert 5 in errors

    feature_values = {characters: 10, is_fake: False}
    score = scoring_context.process_model_scores(["fake"], feature_values)
    assert score['fake']['score']['prediction'] == "generated"
Esempio n. 11
0
import time

from nose.tools import eq_, nottest
from revscoring import Extractor, Model
from revscoring.features import Feature
from revscoring.scoring import ModelInfo

from ...score_request import ScoreRequest
from ...scoring_context import ScoringContext

# Feature declared with only a name and a float return type; no process
# function is given here.
wait_time = Feature("wait_time", returns=float)


def process_wait(wait_time):
    """Sleep for ``wait_time`` seconds, then return that same value."""
    seconds = wait_time
    time.sleep(seconds)
    return seconds


# Feature whose process function is `process_wait`; it depends on the
# `wait_time` feature and is declared to return a float.
wait = Feature("wait",
               process=process_wait,
               returns=float,
               depends_on=[wait_time])


class FakeSM(Model):
    """Model stand-in whose only feature is ``wait``.

    ``__init__`` sets ``features``, ``version`` and ``info`` directly and
    does not call the parent constructor.
    """

    def __init__(self):
        version = "fake version"
        info = ModelInfo()
        info['version'] = version

        self.features = [wait]
        self.version = version
        self.info = info
Esempio n. 12
0
def test_model():
    """A model built with ``version="0.0.1"`` reports it via ``info``."""
    model = Model([Feature("foo")], version="0.0.1")
    reported_version = model.info.lookup('version')
    assert reported_version == "0.0.1"
Esempio n. 13
0
def test_classifier():
    """A freshly built classifier's info lacks the checked key."""
    classifier = Classifier([Feature("foo")], [True, False])
    # NOTE(review): 'statustics' looks like a typo of 'statistics'; if so,
    # this assertion passes trivially -- confirm the intended key.
    assert 'statustics' not in classifier.info
Esempio n. 14
0
def test_learned_model():
    """A newly constructed ``Learned`` model has ``trained`` set to None."""
    learned = Learned([Feature("foo")])
    assert learned.trained is None
Esempio n. 15
0
import json
import random
from io import BytesIO
from itertools import chain

from pytest import mark

from revscoring.features import Feature, FeatureVector
from revscoring.scoring.models.model import Model


def process_float():
    """Return the default float value, ``0.0``."""
    return 0.0


# Pass the function itself as the feature's process.  Calling it here would
# hand Feature the float 0.0 instead of a callable.
some_float = Feature("some_float", process_float,
                     depends_on=[], returns=float)


def process_other_float():
    """Return the default float value, ``0.0``."""
    return 0.0


# Pass the function itself as the feature's process.  Calling it here would
# hand Feature the float 0.0 instead of a callable.
other_float = Feature("other_float", process_other_float,
                      depends_on=[], returns=float)


def process_float_vector():
    """Return a new three-element list of ``0.0`` floats."""
    return [0.0] * 3


float_vector = FeatureVector("float_vector", process_float_vector(),
Esempio n. 16
0
from revscoring import ScorerModel
from revscoring.datasources.revision_oriented import revision
from revscoring.features import Feature


def process_reversed_last_two_in_rev_id(rev_id):
    """Return the last two characters of ``rev_id`` reversed, as an int.

    ``rev_id`` is converted with ``str``.  When only one character is
    available, a ``"0"`` is appended instead of reversing (``5`` -> ``50``);
    otherwise the two characters are swapped (``12`` -> ``21``).
    """
    tail = str(rev_id)[-2:]
    if len(tail) != 1:
        return int(tail[::-1])
    return int(tail + "0")


# Integer feature computed by `process_reversed_last_two_in_rev_id` from the
# `revision.id` dependency.
reversed_last_two_in_rev_id = Feature("revision.reversed_last_two_in_rev_id",
                                      process_reversed_last_two_in_rev_id,
                                      returns=int,
                                      depends_on=[revision.id])


def process_delay():
    """Return a delay of ``0.0``."""
    return float()


# Float feature whose process function is `process_delay`; no dependencies
# are declared here.
delay = Feature("delay", process_delay, returns=float)


class RevIdScorer(ScorerModel):
    """
    Implements a basic, testing scorer that predicts whether a revision ID's
    reversed last two digits are greater than 50.