Example #1
import pickle

from revscoring.datasources.datasource import Datasource
from revscoring.dependencies import solve


def test_datasource():
    d = Datasource("d")

    # Datasources must round-trip through pickle so models can be serialized.
    assert pickle.loads(pickle.dumps(d)) == d

    # A bare Datasource has no process function, so solve() reads its value
    # from the cache, keyed either by the object or by its string name.
    assert solve(d, cache={d: "foo"}) == "foo"
    assert solve(d, cache={"datasource.d": "foo"}) == "foo"

    assert str(d) == "datasource.d"
    assert repr(d) == "<datasource.d>"
Example #2
import pickle

from revscoring.datasources.datasource import Datasource
from revscoring.datasources.meta import hashing
from revscoring.dependencies import solve

my_tokens = Datasource("my_tokens")
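# hashing.hash is a meta-datasource that maps every token (or tuple of
# tokens) into one of `n` integer buckets.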
my_hashes = hashing.hash(my_tokens, n=10)


def test_hashing():
    hashes = solve(my_hashes,
                   cache={my_tokens: [("one", "two"), "two", "three", "four"]})

    assert len(hashes) == 4
    assert max(hashes) <= 10, str(max(hashes))

    hashes_again = solve(
        my_hashes, cache={my_tokens: [("one", "two"), "two", "three", "four"]})

    assert hashes == hashes_again

    assert (pickle.loads(pickle.dumps(my_hashes)) == my_hashes)
Example #3
import pickle

from revscoring.datasources.datasource import Datasource
from revscoring.datasources.meta import mappers
from revscoring.dependencies import solve

tokens = Datasource("tokens")
my_ints = Datasource("my_ints")


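# mappers.map applies a plain function to each item produced by the
# wrapped datasource.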
def extract_first_char(token):
    return token[:1]


first_char = mappers.map(extract_first_char, tokens, name="first_char")

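# meta.mappers also ships ready-made token transformations; going by their
# names, these lower-case tokens, collapse repeated characters, and undo
# 1337-speak.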
lower_case_tokens = mappers.lower_case(tokens, name="lower_case_tokens")

derepeat_tokens = mappers.derepeat(tokens, name="derepeat_tokens")

de1337_tokens = mappers.de1337(tokens, name="de1337_tokens")

abs_ints = mappers.abs(my_ints)


def test_item_mapper():
    cache = {tokens: ["alpha", "bravo", "charlie", "delta"]}
    assert (solve(first_char, cache=cache) ==
            ["a", "b", "c", "d"])

    assert pickle.loads(pickle.dumps(first_char)) == first_char
Example #4
    Mapping of English descriptions to item identifiers
    """
    HUMAN = 'Q5'


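# (`name`, `wikibase_`, `pywikibase` and `wikimedia` come from the
# surrounding module, which this snippet only partially shows.)
# Flatten the item's nested claim structure: each claim can carry several
# sources, and each source maps property IDs to lists of source claims.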
def _process_source_claims(item):
    return [
        source_claim
        for pid, claims in item.claims.items()
        for claim in claims
        for source in claim.sources
        for source_pid, source_claims in source.items()
        for source_claim in source_claims
    ]


source_claims = Datasource(name + ".revision.source_claims",
                           _process_source_claims,
                           depends_on=[wikibase_.revision.datasources.item])


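# Keep only source claims whose target is a Wikimedia project item.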
def _process_wikimedia_sources(source_claims):
    return [
        source_claim for source_claim in source_claims
        if isinstance(source_claim.target, pywikibase.ItemPage)
        and source_claim.target.id in wikimedia.PROJECT_QIDS
    ]


wikimedia_sources = Datasource(name + ".revision.wikimedia_sources",
                               _process_wikimedia_sources,
                               depends_on=[source_claims])
Example #5
import pickle

from revscoring.datasources.datasource import Datasource
from revscoring.datasources.meta import extractors
from revscoring.dependencies import solve


def return_foo():
    return "foo"


segments = Datasource("segments")

text = Datasource("text")

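# extractors.regex emits the substrings of the input that match one of the
# given patterns; matches listed in `exclusions` are skipped.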
text_extractor = extractors.regex(["foo bar", "bar foo"],
                                  text,
                                  name="text_extractor")

exclusion_text_extractor = extractors.regex(["foo+"],
                                            text,
                                            name="exclusion_text_extractor",
                                            exclusions=['foooo'])

segment_extractor = extractors.regex(["foo bar", "bar foo"],
                                     segments,
                                     name="segment_extractor")


def test_text_extractor():
    cache = {text: "This is some text foo bar nope bar foo"}
    # The snippet is cut off here; a minimal completion, assuming the
    # extractor returns matches in order of appearance:
    assert solve(text_extractor, cache=cache) == ["foo bar", "bar foo"]
Example #6
import pickle

from revscoring.datasources.datasource import Datasource
from revscoring.datasources.meta import dicts
from revscoring.dependencies import solve

my_dict = Datasource("my_dict")

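# dicts.keys and dicts.values lift a dict-valued datasource into
# datasources over its keys and values; as the tests below show, a None
# dict yields an empty collection.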
my_keys = dicts.keys(my_dict)
my_values = dicts.values(my_dict)


def test_dict_keys():
    cache = {my_dict: {"foo": 1, "bar": 2}}
    assert set(solve(my_keys, cache=cache)) == {"foo", "bar"}
    cache = {my_dict: None}
    assert set(solve(my_keys, cache=cache)) == set()

    assert pickle.loads(pickle.dumps(my_keys)) == my_keys


def test_dict_values():
    cache = {my_dict: {"foo": 1, "bar": 2}}
    assert set(solve(my_values, cache=cache)) == {1, 2}
    cache = {my_dict: None}
    assert set(solve(my_values, cache=cache)) == set()

    assert pickle.loads(pickle.dumps(my_values)) == my_values
    """
    Mapping of english descriptions to item idenifiers
    """
    HUMAN = 'Q5'


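# (`name`, `wikibase_` and `wikimedia` come from the surrounding module.)
# Flatten the entity's statements into a single list of their references.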
def _process_references(entity):
    return [reference
            for pid, statements in entity.properties.items()
            for statement in statements
            for pid, references in statement.references.items()
            for reference in references]


references = Datasource(
    name + ".revision.references",
    _process_references,
    depends_on=[wikibase_.revision.datasources.entity])


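# Keep only references that point at a Wikimedia project entity.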
def _process_wikimedia_references(references):
    return [reference
            for reference in references
            if (reference.datatype == 'wikibase-entityid' and
                reference.datavalue.id in wikimedia.PROJECT_QIDS)]


wikimedia_references = Datasource(
    name + ".revision.wikimedia_references",
    _process_wikimedia_references, depends_on=[references])

Example #8
import pickle

from revscoring.datasources.datasource import Datasource
from revscoring.datasources.meta import frequencies
from revscoring.dependencies import solve

old_tokens = Datasource("old_tokens")
new_tokens = Datasource("new_tokens")

old_ft = frequencies.table(old_tokens, name="old_ft")
new_ft = frequencies.table(new_tokens, name="new_ft")

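# delta holds the per-token difference between the two frequency tables;
# positive/negative filter it by sign, and prop_delta expresses the change
# as a proportion of the old counts.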
delta = frequencies.delta(old_ft, new_ft, name="delta")
pos_delta = frequencies.positive(delta, name="pos_delta")
neg_delta = frequencies.negative(delta, name="neg_delta")
neg_abs_delta = frequencies.negative(
    delta, absolute=True, name="neg_abs_delta")

prop_delta = frequencies.prop_delta(old_ft, delta, name="prop_delta")


def test_table():
    cache = {new_tokens: ["a"] * 3 + ["b"] * 2 + ["c"] * 45}
    assert (solve(new_ft, cache=cache) ==
            {'a': 3, 'b': 2, 'c': 45})

    assert (pickle.loads(pickle.dumps(new_ft)) ==
            new_ft)


def test_delta():
    # The snippet is truncated here. A minimal completion, assuming delta
    # maps each token to its new count minus its old count:
    cache = {old_tokens: ["a"] * 2 + ["b"] * 4,
             new_tokens: ["a"] * 3 + ["b"] * 2 + ["c"] * 45}
    assert (solve(delta, cache=cache) ==
            {'a': 1, 'b': -2, 'c': 45})
Example #9
# The top of this snippet is cut off. (`Datasource`, `Feature`, `name`,
# `IMPORTANT_LANG_CODES` and `revision_oriented` come from the surrounding
# module.) The function header below is reconstructed, with an assumed
# name, from the parallel function that follows.
def _process_important_translations_labels(item_labels):
    result_set = set()

    for label in (item_labels.keys() & IMPORTANT_LANG_CODES):
        result_set.add(label)

    return len(result_set) / 8


def _process_important_translations_descriptions(item_descriptions):
    result_set = set()

    for description in (item_descriptions.keys() & IMPORTANT_LANG_CODES):
        result_set.add(description)

    return len(result_set) / 8
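# (The divisor 8 is presumably len(IMPORTANT_LANG_CODES): each function
# returns the fraction of the eight "important" languages covered.)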


item_doc = Datasource(name + ".item_doc",
                      _process_item_doc,
                      depends_on=[revision_oriented.revision.text])
"""A JSONable `dict` of content for a Wikibase content."""

item = Datasource(name + ".item", _process_item, depends_on=[item_doc])
"""A `~pywikibase.Item` for the Wikibase content"""

item_labels_datasource = Datasource(name + ".labels",
                                    _process_labels,
                                    depends_on=[item])
item_descriptions_datasource = Datasource(name + ".descriptions",
                                          _process_descriptions,
                                          depends_on=[item])
complete_translations = Feature(
    name + ".complete_translations",
    _process_complete_translations,
    # The snippet ends mid-call; `returns` and `depends_on` below are an
    # assumed completion based on the datasources defined above.
    returns=float,
    depends_on=[item_labels_datasource, item_descriptions_datasource])