Example #1
def tokenized(text_datasource, name=None, tok_strategy="Latin"):
    """
    Constructs a :class:`revscoring.Datasource` that generates a list of tokens
    """
    if name is None:
        name = "{0}({1!r}, {2!r})".format("tokenized", text_datasource,
                                          tok_strategy)

    if tok_strategy == "Latin":
        return Datasource(name, _process_tokens, depends_on=[text_datasource])
    elif tok_strategy == "CJK":
        return Datasource(name,
                          _process_tokens_cjk,
                          depends_on=[text_datasource])
    else:
        raise NotImplementedError(
            "Unsupported tok_strategy: {0!r}".format(tok_strategy))
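
A minimal usage sketch (hedged): `text` below is an illustrative root
datasource, and `_process_tokens` is assumed to be defined in the
surrounding module. Root datasources have no process function of their own,
so `solve` reads their values from a cache:

from revscoring.datasources import Datasource
from revscoring.dependencies import solve

text = Datasource("text")
tokens = tokenized(text, tok_strategy="Latin")
# _process_tokens receives the cached text and returns its token list.
token_list = solve(tokens, cache={text: "An example sentence."})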
Example #2
    def __init__(self, name, revision_datasources):
        super().__init__(name, revision_datasources)

        self.sentences = Datasource(
            self._name + ".sentences", psw2sentences,
            depends_on=[self.paragraphs_sentences_and_whitespace]
        )
        """
Example #3
def tokenized(text_datasource, name=None):
    """
    Constructs a :class:`revscoring.Datasource` that generates a list of tokens
    """
    if name is None:
        name = "{0}({1})".format("tokenized", text_datasource)

    return Datasource(name, _process_tokens, depends_on=[text_datasource])
Example #4
    def __init__(self, prefix, revision_datasources):

        self.bytes = Datasource(prefix + ".bytes",
                                _process_bytes,
                                depends_on=[revision_datasources.text])

        if hasattr(revision_datasources, "parent"):
            self.parent = Revision(prefix + ".parent",
                                   revision_datasources.parent)
Example #5
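This test references several names from its surrounding module; a hedged
reconstruction of the imports it needs (`ScoringContext` itself comes from
the package under test and is not reproduced here):

from collections import defaultdict, namedtuple

from nose.tools import eq_

from revscoring import dependencies
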
def test_scoring_context():
    from revscoring.datasources import Datasource
    from revscoring.dependencies import Dependent
    from revscoring.features import Feature

    fake_data = Datasource("fake_data", lambda: "fake")
    len_func = Dependent("len_func")
    literal_fake = Dependent("literal_fake")
    characters = Feature("characters",
                         lambda word, len: len(word),
                         returns=int,
                         depends_on=[fake_data, len_func])
    is_fake = Feature("is_fake",
                      lambda word, fake: word == fake,
                      returns=bool,
                      depends_on=[fake_data, literal_fake])

    FakeExtractor = namedtuple("Extractor", ['extract', 'solve', 'language'])

    def fake_extract(rev_ids, dependents, caches=None):
        caches = caches or defaultdict(dict)
        for rev_id in rev_ids:
            cache = caches[rev_id]
            if rev_id % 5 != 0:
                values = dependencies.solve(dependents,
                                            context={len_func: lambda: len},
                                            cache=cache)
                yield None, list(values)
            else:
                yield RuntimeError("extract"), None

    def fake_solve(dependents, cache=None):
        cache = cache or {}
        cache.update({len_func: len, literal_fake: "fake"})
        return dependencies.solve(dependents, cache=cache)

    extractor = FakeExtractor(fake_extract, fake_solve, None)

    FakeScorerModel = namedtuple("FakeScorerModel",
                                 ['score', 'version', 'language', 'features'])
    scorer_model = FakeScorerModel(lambda fvs: {"prediction": "generated"},
                                   "1", None, [characters, is_fake])

    scoring_context = ScoringContext("fakewiki", {"fake": scorer_model},
                                     extractor)

    rev_ids = [1, 2, 3, 4, 5]
    root_ds_caches = scoring_context.extract_roots("fake", rev_ids)
    eq_(len(root_ds_caches), 5)
    eq_(root_ds_caches[1][1][fake_data], "fake")
    assert root_ds_caches[5][0] is not None

    score, feature_vals = scoring_context.score("fake", {
        characters: 10,
        is_fake: False
    })
    eq_(score['prediction'], "generated")
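
A note on the injection pattern above: a bare `Dependent` has no process
function of its own, so its value must be supplied at solve time, either
through `cache` or through `context` as a zero-argument callable. A minimal
hedged sketch of the same pattern:

from revscoring import dependencies
from revscoring.dependencies import Dependent

greeting = Dependent("greeting")
# The context callable stands in for the dependent's missing process.
assert dependencies.solve(greeting, context={greeting: lambda: "hi"}) == "hi"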
Example #6
def test_max_vectors():
    my_list = Datasource("my_list")
    my_max = aggregators.max(my_list, vector=True)
    cache = {my_list: [[1, 2, 3], [4, 5, 6]]}
    assert all(a == b for a, b in zip(solve(my_max, cache=cache), [4, 5, 6]))
    cache = {my_list: [[]]}
    assert solve(my_max, cache=cache) == [0]
    cache = {my_list: [None]}
    assert solve(my_max, cache=cache) == [0]

    assert pickle.loads(pickle.dumps(my_max)) == my_max
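
With `vector=True`, the aggregator is applied column-wise across a list of
vectors rather than to a flat list, falling back to `[0]` for empty or
missing input. A plain-Python sketch of the column-wise behavior:

rows = [[1, 2, 3], [4, 5, 6]]
print([max(col) for col in zip(*rows)])  # [4, 5, 6]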
Example #7
def test_trim():

    d1 = Datasource("derp1")
    f1 = Feature("foobar1", returns=int)
    f2 = Feature("foobar2", returns=int, depends_on=[d1])
    c = Constant(value=5)
    fv = FeatureVector("foobar3", returns=int, depends_on=[c])

    assert list(trim(f1)) == [f1]
    assert list(trim([f1, f2, fv])) == [f1, f2, fv]
    assert (list(trim(log(max(f1 - f2, 1)))) == [f1, f2])
Example #8
def test_len():
    my_list = Datasource("my_list")
    my_len = aggregators.len(my_list)
    cache = {my_list: [1, 2, 3, 4]}
    assert solve(my_len, cache=cache) == 4
    cache = {my_list: []}
    assert solve(my_len, cache=cache) == 0
    cache = {my_list: None}
    assert solve(my_len, cache=cache) == 0

    assert pickle.loads(pickle.dumps(my_len)) == my_len
Example #9
def test_sum_vectors():
    my_list = Datasource("my_list")
    my_sum = aggregators.sum(my_list, vector=True)
    cache = {my_list: [[1, 2, 3], [4, 5, 6]]}
    assert all(a == b for a, b in zip(solve(my_sum, cache=cache), [5, 7, 9]))
    cache = {my_list: [[]]}
    assert solve(my_sum, cache=cache) == [0]
    cache = {my_list: [None]}
    assert solve(my_sum, cache=cache) == [0]
    assert str(my_sum) == "feature_vector.sum(<datasource.my_list>)"

    assert pickle.loads(pickle.dumps(my_sum)) == my_sum
Example #10
def test_sum():
    my_list = Datasource("my_list")
    my_sum = aggregators.sum(my_list)
    cache = {my_list: [1, 2, 3, 4]}
    assert solve(my_sum, cache=cache) == 10
    cache = {my_list: []}
    assert solve(my_sum, cache=cache) == 0
    cache = {my_list: None}
    assert solve(my_sum, cache=cache) == 0
    assert str(my_sum) == "feature.sum(<datasource.my_list>)"

    assert pickle.loads(pickle.dumps(my_sum)) == my_sum
Example #11
def test_key():
    my_dict = Datasource("my_dict")
    foo = key('foo', my_dict)
    assert solve(foo, cache={my_dict: {'foo': "bar"}}) == 'bar'
    assert repr(foo) == "<datasource.my_dict['foo']>"

    bar = key('bar', my_dict, apply=or_none(int))
    assert solve(bar, cache={my_dict: {'bar': None}}) is None
    assert solve(bar, cache={my_dict: {'bar': "1"}}) == 1

    foobar = key(['foo', 'bar'], my_dict)
    assert solve(foobar, cache={my_dict: {'bar': 1}}) is None
    assert solve(foobar, cache={my_dict: {'foo': {'bar': 1}}}) == 1
    assert repr(foobar) == "<datasource.my_dict[['foo', 'bar']]>"

    assert pickle.loads(pickle.dumps(foo)) == foo
    assert pickle.loads(pickle.dumps(bar)) == bar
    assert pickle.loads(pickle.dumps(foobar)) == foobar
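
Passing a list of keys makes `key` walk nested dicts, yielding `None` when
an intermediate key is absent (contrast `test_missing_key` below, where
`if_missing` turns the miss into an error). A hypothetical plain-Python
sketch of that traversal:

def nested_get(d, keys):
    # Illustrative only: walk the dict one key at a time.
    for k in keys:
        if not isinstance(d, dict) or k not in d:
            return None
        d = d[k]
    return d

assert nested_get({'foo': {'bar': 1}}, ['foo', 'bar']) == 1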
Example #12
def test_offline_extractor():
    last_two_in_id = Datasource("last_two_in_id",
                                get_last_two,
                                depends_on=[revision_oriented.revision.id])

    extractor = OfflineExtractor()

    assert extractor.extract(345678, last_two_in_id) == 78

    assert (list(extractor.extract([345678, 4634800],
                                   last_two_in_id)) == [(None, 78), (None, 0)])

    extraction_profile = {}
    list(
        extractor.extract([345678, 4634800],
                          last_two_in_id,
                          profile=extraction_profile))
    assert len(extraction_profile) == 1
    assert len(extraction_profile[last_two_in_id]) == 2
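
`get_last_two` is not shown above; a minimal definition consistent with the
assertions (345678 -> 78 and 4634800 -> 0) would be:

def get_last_two(rev_id):
    # Keep only the last two decimal digits of the revision ID.
    return rev_id % 100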
Example #13
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.sentences_added_removed = Datasource(
            self._name + ".sentences_added_removed", set_diff,
            depends_on=[self.revision.sentences,
                        self.revision.parent.sentences]
        )

        self.sentences_added = indexable.index(
            0, self.sentences_added_removed,
            name=self._name + ".sentences_added"
        )
        """
        A set of sentences that were added in this edit
        """

        self.sentences_removed = indexable.index(
            1, self.sentences_added_removed,
            name=self._name + ".sentences_removed"
        )
        """
Example #14
wikidata_kvs = vectorizers.word2vec.load_gensim_kv(
    filename="wikidata-20200501-learned_vectors.50_cell.10k.kv", mmap="r")


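# `QID_RE` is assumed to be defined elsewhere in this module, e.g. a
# pattern like re.compile(r"^Q\d+$") that matches Wikidata item IDs.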
def process_claims_to_words(claims):
    words = []
    for pid, value in claims:
        words.append(pid)
        if QID_RE.match(value) is not None:
            words.append(value)
    return words


def vectorize_words(words):
    return vectorizers.word2vec.vectorize_words(wikidata_kvs, words)


claim_words = Datasource("wikidata.revision.claim_words",
                         process_claims_to_words,
                         depends_on=[wikibase.revision.datasources.claims])

revision_claim_words_vectors = vectorizers.word2vec(
    claim_words, vectorize_words, name="revision.text.wikidata_vectors")

w2v = aggregators.mean(revision_claim_words_vectors,
                       vector=True,
                       name="revision.text.wikidata_vectors_mean")

articletopic = [w2v]
Example #15
    def __init__(self, name, revision_datasources):
        super().__init__(name, revision_datasources)

        self.tokens = tokenized(revision_datasources.text)
        """
        A list of all tokens
        """

        self.paragraphs_sentences_and_whitespace = Datasource(
            self._name + ".paragraphs_sentences_and_whitespace",
            paragraphs_sentences_and_whitespace.segment,
            depends_on=[self.tokens])
        """
        A list of paragraphs, sentences, and whitespaces as segments.  See
        :class:`deltas.segmenters.Segment` and
        :class:`deltas.segmenters.MatchableSegment`.
        """

        self.token_frequency = frequencies.table(self.tokens,
                                                 name=self._name +
                                                 ".token_frequency")
        """
        A frequency table of all tokens.
        """

        self.numbers = self.tokens_in_types({'number'},
                                            name=self._name + ".numbers")
        """
        A list of numeric tokens
        """

        self.number_frequency = frequencies.table(self.numbers,
                                                  name=self._name +
                                                  ".number_frequency")
        """
        A frequency table of number tokens.
        """

        self.whitespaces = self.tokens_in_types({'whitespace'},
                                                name=self._name +
                                                ".whitespaces")
        """
        A list of whitespace tokens
        """

        self.whitespace_frequency = frequencies.table(self.whitespaces,
                                                      name=self._name +
                                                      ".whitespace_frequency")
        """
        A frequency table of whitespace tokens.
        """

        self.markups = self.tokens_in_types(
            {
                'dbrack_open', 'dbrack_close', 'brack_open', 'brack_close',
                'tab_open', 'tab_close', 'dcurly_open', 'dcurly_close',
                'curly_open', 'curly_close', 'bold', 'italics', 'equals'
            },
            name=self._name + ".markups")
        """
        A list of markup tokens
        """

        self.markup_frequency = frequencies.table(self.markups,
                                                  name=self._name +
                                                  ".markup_frequency")
        """
        A frequency table of markup tokens.
        """

        self.cjks = self.tokens_in_types({'cjk'}, name=self._name + ".cjks")
        """
        A list of Chinese/Japanese/Korean tokens
        """

        self.cjk_frequency = frequencies.table(self.cjks,
                                               name=self._name +
                                               ".cjk_frequency")
        """
        A frequency table of CJK tokens.
        """

        self.entities = self.tokens_in_types({'entity'},
                                             name=self._name + ".entities")
        """
        A list of HTML entity tokens
        """

        self.entity_frequency = frequencies.table(self.entities,
                                                  name=self._name +
                                                  ".entity_frequency")
        """
        A frequency table of entity tokens.
        """

        self.urls = self.tokens_in_types({'url'}, name=self._name + ".urls")
        """
        A list of URL tokens
        """

        self.url_frequency = frequencies.table(self.urls,
                                               name=self._name +
                                               ".url_frequency")
        """
        A frequency table of URL tokens.
        """

        self.words = self.tokens_in_types({'word'}, name=self._name + ".words")
        """
        A list of word tokens
        """

        self.word_frequency = frequencies.table(mappers.lower_case(self.words),
                                                name=self._name +
                                                ".word_frequency")
        """
        A frequency table of lower-cased word tokens.
        """

        self.uppercase_words = filters.filter(is_uppercase_word,
                                              self.words,
                                              name=self._name +
                                              ".uppercase_words")
        """
        A list of uppercase word tokens that are at least two
        characters long.
        """

        self.uppercase_word_frequency = frequencies.table(
            self.uppercase_words,
            name=self._name + ".uppercase_word_frequency")
        """
        A frequency table of uppercase word tokens that are at least two
        characters long.
        """

        self.punctuations = self.tokens_in_types(
            {
                'period', 'qmark', 'epoint', 'comma', 'colon', 'scolon',
                'japan_punct'
            },
            name=self._name + ".punctuations")
        """
        A list of punctuation tokens
        """

        self.punctuation_frequency = frequencies.table(
            self.punctuations, name=self._name + ".punctuation_frequency")
        """
        A frequency table of punctuation tokens.
        """

        self.breaks = self.tokens_in_types({'break'},
                                           name=self._name + ".breaks")
        """
        A list of break tokens
        """

        self.break_frequency = frequencies.table(self.breaks,
                                                 name=self._name +
                                                 ".break_frequency")
        """
Example #16
def test_missing_key():
    with raises(RuntimeError):
        my_dict = Datasource("my_dict")
        foobar = key(['foo', 'bar'], my_dict, if_missing=(RuntimeError))
        assert solve(foobar, cache={my_dict: {'bar': 1}}) is None
Example #17
def test_key_exists():
    my_dict = Datasource("my_dict")
    foo_exists = key_exists('foo', my_dict)
    assert solve(foo_exists, cache={my_dict: {'foo': "bar"}}) is True
    assert solve(foo_exists, cache={my_dict: {'baz': "bar"}}) is False
    assert pickle.loads(pickle.dumps(foo_exists)) == foo_exists
Example #18
from revscoring.datasources import Datasource

id = Datasource("page.id")

wikiproject_title = Datasource("page.wikiproject_title")

stats = Datasource("page.stats")
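
These are root datasources: they carry no process function, so their values
must be supplied through a solve-time cache. A short hedged sketch:

from revscoring.dependencies import solve

# `id` here is the module's Datasource defined above (not the builtin).
doubled = Datasource("page.id_doubled", lambda i: i * 2, depends_on=[id])
assert solve(doubled, cache={id: 21}) == 42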
Example #19
    def __init__(self, name, revision_datasources):
        super().__init__(name, revision_datasources)

        self.wikicode = Datasource(self._name + ".wikicode",
                                   _process_wikicode,
                                   depends_on=[revision_datasources.text])
        """
        A :class:`mwparserfromhell.wikicode.Wikicode` abstract syntax
        tree representing the structure of the page.
        """

        self.node_class_map = Datasource(self._name + ".node_class_map",
                                         _process_node_class_map,
                                         depends_on=[self.wikicode])
        """
        A map of mwparserfromhell.wikicode.<class> to lists of nodes of
        that type.
        """

        self.content = execute_method("strip_code",
                                      self.wikicode,
                                      name=self._name + ".content")
        """
        The viewable content (no markup or templates) of the revision.
        """

        self.headings = get_key(mwparserfromhell.nodes.Heading,
                                self.node_class_map,
                                default=[],
                                name=self._name + ".headings")
        """
        A list of :class:`mwparserfromhell.nodes.heading.Heading`'s
        """

        self.heading_titles = mappers.map(_extract_heading_title,
                                          self.headings,
                                          name=self._name + ".heading_titles")
        """
        A list of heading titles
        """

        self.external_links = get_key(mwparserfromhell.nodes.ExternalLink,
                                      self.node_class_map,
                                      default=[],
                                      name=self._name + ".external_links")
        """
        A list of :class:`mwparserfromhell.nodes.ExternalLink`'s
        """

        self.external_link_urls = mappers.map(_extract_external_link_url,
                                              self.external_links,
                                              name=self._name +
                                              ".external_link_url")
        """
        A list of external link urls
        """

        self.wikilinks = get_key(mwparserfromhell.nodes.Wikilink,
                                 self.node_class_map,
                                 default=[],
                                 name=self._name + ".wikilinks")
        """
        A list of :class:`mwparserfromhell.nodes.Wikilink`'s
        """

        self.wikilink_titles = mappers.map(_extract_wikilink_title,
                                           self.wikilinks,
                                           name=self._name +
                                           ".wikilink_titles")
        """
        Returns a list of string titles of internal links (aka "targets")
        """

        self.tags = get_key(mwparserfromhell.nodes.Tag,
                            self.node_class_map,
                            default=[],
                            name=self._name + ".tags")
        """
        A list of :class:`mwparserfromhell.nodes.Tag`'s
        """

        self.tag_names = mappers.map(_extract_tag_name,
                                     self.tags,
                                     name=self._name + ".tag_names")
        """
        Returns a list of HTML tag names present in the content of the revision
        """

        self.tags_str = mappers.map(str,
                                    self.tags,
                                    name=self._name + ".tags_str")
        """
        Returns a list of tags present in the content of the revision
        as strings
        """

        self.templates = get_key(mwparserfromhell.nodes.Template,
                                 self.node_class_map,
                                 default=[],
                                 name=self._name + ".templates")
        """
        A list of :class:`mwparserfromhell.nodes.Template`'s
        """

        self.template_names = mappers.map(_extract_template_name,
                                          self.templates,
                                          name=self._name + ".template_names")
        """
        Returns a list of template names present in the content of the revision
        """

        self.templates_str = mappers.map(str,
                                         self.templates,
                                         name=self._name + ".templates_str")
        """
        Returns a list of templates present in the content of the
        revision as strings
        """

        self.sections = Datasource(self._name + ".section",
                                   _extract_sections,
                                   depends_on=[self.wikicode])
        """
        A list of the sections in the revision
        """
Example #20
    def __init__(self, name, revision_datasources):
        super().__init__(name)

        self.revision_entity = revision_datasources.entity
        self.parent_entity = revision_datasources.parent.entity

        # sitelinks
        self.sitelinks_diff = Datasource(
            name + ".sitelinks_diff",
            diff_dicts,
            depends_on=[
                revision_datasources.parent.sitelinks,
                revision_datasources.sitelinks
            ])
        self.sitelinks_added, self.sitelinks_removed, self.sitelinks_changed =\
            diff_parts(name + ".sitelinks", self.sitelinks_diff)

        # labels
        self.labels_diff = Datasource(name + ".labels_diff",
                                      diff_dicts,
                                      depends_on=[
                                          revision_datasources.parent.labels,
                                          revision_datasources.labels
                                      ])
        self.labels_added, self.labels_removed, self.labels_changed = \
            diff_parts(name + ".labels", self.labels_diff)

        # aliases
        self.aliases_diff = Datasource(name + ".aliases_diff",
                                       diff_dicts,
                                       depends_on=[
                                           revision_datasources.parent.aliases,
                                           revision_datasources.aliases
                                       ])
        self.aliases_added, self.aliases_removed, self.aliases_changed = \
            diff_parts(name + ".aliases", self.aliases_diff)

        # descriptions
        self.descriptions_diff = Datasource(
            name + ".descriptions_diff",
            diff_dicts,
            depends_on=[
                revision_datasources.parent.descriptions,
                revision_datasources.descriptions
            ])
        (self.descriptions_added, self.descriptions_removed,
         self.descriptions_changed) = \
            diff_parts(name + ".descriptions", self.descriptions_diff)

        # properties
        self.properties_diff = Datasource(
            name + ".properties_diff",
            diff_dicts,
            depends_on=[
                revision_datasources.parent.properties,
                revision_datasources.properties
            ])
        (self.properties_added, self.properties_removed,
         self.properties_changed) = \
            diff_parts(name + ".properties", self.properties_diff)

        self.statements_added = Datasource(name + ".statements_added",
                                           _process_statements_added,
                                           depends_on=[
                                               self.properties_diff,
                                               self.parent_entity,
                                               self.revision_entity
                                           ])
        self.claims_added = Datasource(  # Backwards compatible
            name + ".claims_added",
            _identity,
            depends_on=[self.statements_added])
        self.statements_removed = Datasource(name + ".statements_removed",
                                             _process_statements_removed,
                                             depends_on=[
                                                 self.properties_diff,
                                                 self.parent_entity,
                                                 self.revision_entity
                                             ])
        self.claims_removed = Datasource(  # Backwards compatible
            name + ".claims_removed",
            _identity,
            depends_on=[self.statements_removed])
        self.statements_changed = Datasource(name + ".statements_changed",
                                             _process_statements_changed,
                                             depends_on=[
                                                 self.properties_diff,
                                                 self.parent_entity,
                                                 self.revision_entity
                                             ])
        self.claims_changed = Datasource(  # Backwards compatible
            name + ".claims_changed",
            _identity,
            depends_on=[self.statements_changed])
        self.sources_added = Datasource(name + ".sources_added",
                                        _process_sources_added,
                                        depends_on=[self.claims_changed])
        self.sources_removed = Datasource(name + ".sources_removed",
                                          _process_sources_removed,
                                          depends_on=[self.claims_changed])
        self.qualifiers_added = Datasource(name + ".qualifiers_added",
                                           _process_qualifiers_added,
                                           depends_on=[self.claims_changed])
        self.qualifiers_removed = Datasource(name + ".qualifiers_removed",
                                             _process_qualifiers_removed,
                                             depends_on=[self.claims_changed])

        # badges
        self.badges_diff = Datasource(name + ".badges_diff",
                                      diff_dicts,
                                      depends_on=[
                                          revision_datasources.parent.badges,
                                          revision_datasources.badges
                                      ])
        self.badges_added, self.badges_removed, self.badges_changed = \
            diff_parts(name + ".badges", self.badges_diff)
Example #21
    def __init__(self, name, revision_datasources):
        super().__init__(name)

        self.entity_doc = Datasource(name + ".entity_doc",
                                     _process_entity_doc,
                                     depends_on=[revision_datasources.text])
        """
        A JSONable `dict` of content for a Wikibase entity.
        """

        self.entity = Datasource(name + ".entity",
                                 _process_entity,
                                 depends_on=[self.entity_doc])
        """
        A `~mwbase.Entity` for the Wikibase content
        """

        self.sitelinks = Datasource(name + ".sitelinks",
                                    _process_sitelinks,
                                    depends_on=[self.entity])
        """
        A `dict` of wiki/sitelink pairs in the revision
        """

        self.labels = Datasource(name + ".labels",
                                 _process_labels,
                                 depends_on=[self.entity])
        """
        A `dict` of lang/label pairs in the revision
        """

        self.aliases = Datasource(name + ".aliases",
                                  _process_aliases,
                                  depends_on=[self.entity])
        """
        A `set` of unique aliases in the revision
        """

        self.descriptions = Datasource(name + ".descriptions",
                                       _process_descriptions,
                                       depends_on=[self.entity])
        """
        A `dict` of lang/description pairs in the revision
        """

        self.properties = Datasource(name + ".properties",
                                     _process_properties,
                                     depends_on=[self.entity])
        """
        A `set` of properties in the revision
        """

        self.claims = Datasource(name + ".claim",
                                 _process_claims,
                                 depends_on=[self.entity])
        """
        A `set` of unique claims in the revision
        """

        self.sources = Datasource(name + ".sources",
                                  _process_sources,
                                  depends_on=[self.entity])
        """
        A `set` of unique sources in the revision
        """

        self.reference_claims = Datasource(name + ".reference_claims",
                                           _process_ref_claims,
                                           depends_on=[self.entity])
        """
        A `set` of unique reference claims in the revision
        """

        self.qualifiers = Datasource(name + ".qualifiers",
                                     _process_qualifiers,
                                     depends_on=[self.entity])
        """
        A `set` of unique qualifiers in the revision
        """

        self.badges = Datasource(name + ".badges",
                                 _process_badges,
                                 depends_on=[self.entity])
        """
        A `set` of unique badges in the revision
        """

        if hasattr(revision_datasources, "parent") and \
           hasattr(revision_datasources.parent, "text"):
            self.parent = Revision(name + ".parent",
                                   revision_datasources.parent)

            if hasattr(revision_datasources, "diff"):
                self.diff = Diff(name + ".diff", self)
Example #22
def test_scoring_context():
    from revscoring.datasources import Datasource
    from revscoring.dependencies import Dependent
    from revscoring.features import Feature

    fake_data = Datasource("fake_data", lambda: "fake")
    len_func = Dependent("len_func")
    literal_fake = Dependent("literal_fake")
    characters = Feature("characters",
                         lambda word, len: len(word),
                         returns=int,
                         depends_on=[fake_data, len_func])
    is_fake = Feature("is_fake",
                      lambda word, fake: word == fake,
                      returns=bool,
                      depends_on=[fake_data, literal_fake])

    FakeExtractor = namedtuple("Extractor", ['extract', 'solve', 'language'])

    def fake_extract(rev_ids, dependents, caches=None):
        caches = caches if caches is not None else {}
        for rev_id in rev_ids:
            if rev_id % 5 != 0:
                cache = caches.get(rev_id, {})
                values = dependencies.solve(dependents,
                                            context={len_func: lambda: len},
                                            cache=cache)
                values = list(values)
                caches[rev_id] = cache
                yield None, values
            else:
                yield RuntimeError("extract"), None

    def fake_solve(dependents, cache=None):
        cache = cache if cache is not None else {}
        cache.update({len_func: len, literal_fake: "fake"})
        return dependencies.solve(dependents, cache=cache)

    extractor = FakeExtractor(fake_extract, fake_solve, None)

    FakeScorerModel = namedtuple("FakeScorerModel",
                                 ['score', 'version', 'language', 'features'])
    scorer_model = FakeScorerModel(lambda fvs: {"prediction": "generated"},
                                   "1", None, [characters, is_fake])

    scoring_context = ScoringContext("fakewiki", {"fake": scorer_model},
                                     extractor)

    rev_ids = [1, 2, 3, 4, 5]
    root_ds_caches, errors = scoring_context.extract_root_dependency_caches(
        ["fake"], rev_ids)
    print(root_ds_caches)
    print(errors)
    assert len(root_ds_caches) == 4
    assert len(errors) == 1
    assert root_ds_caches[1][fake_data] == "fake"
    assert 5 in errors

    score = scoring_context.process_model_scores(["fake"], {
        characters: 10,
        is_fake: False
    })
    assert score['fake']['score']['prediction'] == "generated"
Example #23
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.operations = Datasource(
            self._name + ".operations",
            _process_operations,
            depends_on=[
                self.revision.parent.paragraphs_sentences_and_whitespace,
                self.revision.paragraphs_sentences_and_whitespace,
                self.revision.parent.tokens, self.revision.tokens
            ])
        """
        Returns a tuple that describes the difference between the parent
        revision text and the current revision's text.

        The tuple contains three fields:

        * operations: `list` of :class:`deltas.Operation`
        * A tokens (from the parent revision): `list` of `str`
        * B tokens (from the current revision): `list` of `str`
        """

        self.segments_added = Datasource(self._name + ".segments_added",
                                         _process_segments_added,
                                         depends_on=[self.operations])
        """
        Returns a list of all contiguous segments of tokens added in this
        revision.
        """

        self.segments_removed = Datasource(self._name + ".segments_removed",
                                           _process_segments_removed,
                                           depends_on=[self.operations])
        """
        Returns a list of all contiguous segments of tokens removed in this
        revision.
        """

        self.tokens_added = Datasource(self._name + ".tokens_added",
                                       _process_tokens_added,
                                       depends_on=[self.operations])
        """
        Constructs a :class:`revscoring.Datasource` that returns a list of all
        tokens added in this revision.
        """

        self.tokens_removed = Datasource(self._name + ".tokens_removed",
                                         _process_tokens_removed,
                                         depends_on=[self.operations])
        """
        Constructs a :class:`revscoring.Datasource` that returns a list of all
        tokens removed in this revision.
        """

        self.numbers_added = self.tokens_added_in_types({'number'},
                                                        name=self._name +
                                                        ".numbers_added")
        """
        A list of numeric tokens added in the edit
        """

        self.numbers_removed = self.tokens_removed_in_types({'number'},
                                                            name=self._name +
                                                            ".numbers_removed")
        """
        A list of numeric tokens removed in the edit
        """

        self.whitespaces_added = self.tokens_added_in_types(
            {'whitespace'}, name=self._name + ".whitespaces_added")
        """
        A list of whitespace tokens added in the edit
        """

        self.whitespaces_removed = self.tokens_removed_in_types(
            {'whitespace'}, name=self._name + ".whitespaces_removed")
        """
        A list of whitespace tokens removed in the edit
        """

        self.markups_added = self.tokens_added_in_types(
            {
                'dbrack_open', 'dbrack_close', 'brack_open', 'brack_close',
                'tab_open', 'tab_close', 'dcurly_open', 'dcurly_close',
                'curly_open', 'curly_close', 'bold', 'italics', 'equals'
            },
            name=self._name + ".markups_added")
        """
        A list of markup tokens added in the edit
        """

        self.markups_removed = self.tokens_removed_in_types(
            {
                'dbrack_open', 'dbrack_close', 'brack_open', 'brack_close',
                'tab_open', 'tab_close', 'dcurly_open', 'dcurly_close',
                'curly_open', 'curly_close', 'bold', 'italics', 'equals'
            },
            name=self._name + ".markups_removed")
        """
        A list of markup tokens removed in the edit
        """

        self.cjks_added = self.tokens_added_in_types({'cjk'},
                                                     name=self._name +
                                                     ".cjks_added")
        """
        A list of Chinese/Japanese/Korean tokens added in the edit
        """

        self.cjks_removed = self.tokens_removed_in_types({'cjk'},
                                                         name=self._name +
                                                         ".cjks_removed")
        """
        A list of Chinese/Japanese/Korean tokens removed in the edit
        """

        self.entities_added = self.tokens_added_in_types({'entity'},
                                                         name=self._name +
                                                         ".entities_added")
        """
        A list of HTML entity tokens added in the edit
        """

        self.entities_removed = self.tokens_removed_in_types(
            {'entity'}, name=self._name + ".entities_removed")
        """
        A list of HTML entity tokens removed in the edit
        """

        self.urls_added = self.tokens_added_in_types({'url'},
                                                     name=self._name +
                                                     ".urls_added")
        """
        A list of URL tokens rempved in the edit
        """

        self.urls_removed = self.tokens_removed_in_types({'url'},
                                                         name=self._name +
                                                         ".urls_removed")
        """
        A list of URL tokens added in the edit
        """

        self.words_added = self.tokens_added_in_types({'word'},
                                                      name=self._name +
                                                      ".words_added")
        """
        A list of word tokens added in the edit
        """

        self.words_removed = self.tokens_removed_in_types({'word'},
                                                          name=self._name +
                                                          ".words_removed")
        """
        A list of word tokens removed in the edit
        """

        self.uppercase_words_added = filters.filter(is_uppercase_word,
                                                    self.words_added,
                                                    name=self._name +
                                                    ".uppercase_words_added")
        """
        A list of fully UPPERCASE word tokens added in the edit
        """

        self.uppercase_words_removed = filters.filter(
            is_uppercase_word,
            self.words_removed,
            name=self._name + ".uppercase_words_removed")
        """
        A list of fully UPPERCASE word tokens removed in the edit
        """

        self.punctuations_added = self.tokens_added_in_types(
            {
                'period', 'qmark', 'epoint', 'comma', 'colon', 'scolon',
                'japan_punct'
            },
            name=self._name + ".punctuations_added")
        """
        A list of punctuation tokens added in the edit
        """

        self.punctuations_removed = self.tokens_removed_in_types(
            {
                'period', 'qmark', 'epoint', 'comma', 'colon', 'scolon',
                'japan_punct'
            },
            name=self._name + ".punctuations_removed")
        """
        A list of punctuation tokens removed in the edit
        """

        self.breaks_added = self.tokens_added_in_types({'break'},
                                                       name=self._name +
                                                       ".breaks_added")
        """
        A list of break tokens added in the edit
        """

        self.breaks_removed = self.tokens_removed_in_types({'break'},
                                                           name=self._name +
                                                           ".breaks_removed")
        """
Example #24
import pickle

from revscoring.datasources import Datasource
from revscoring.dependencies import solve
from revscoring.features.meta import vectorizers

my_dict = Datasource("my_dict")


class KeysDict(Datasource):
    def __init__(self, name, keys):
        super().__init__(name)
        self._keys = keys

    def keys(self):
        return self._keys


my_keys_dict = KeysDict("my_keys_dict", ["a", "b", "c"])


def test_vectorize():
    my_vector = vectorizers.vectorize(my_dict, ["a", "b", "c"], returns=int)

    assert (solve(my_vector, cache={my_dict: {"a": 5}}) == [5, 0, 0])
    assert (solve(my_vector, cache={my_dict: {"d": 5}}) == [0, 0, 0])
    assert (solve(my_vector, cache={my_dict: {
        "a": 1,
        "b": 2,
        "c": 3
    }}) == [1, 2, 3])
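
`vectorize` projects a dict onto a fixed key order, filling absent keys with
the zero value of `returns`, as the assertions above pin down. An equivalent
plain-Python sketch:

def vectorize_sketch(d, keys, returns=int):
    # Missing keys fall back to the zero value of `returns`.
    return [d.get(k, returns()) for k in keys]

assert vectorize_sketch({"a": 5}, ["a", "b", "c"]) == [5, 0, 0]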
Example #25
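This module fragment assumes a handful of imports (a hedged reconstruction):
`swn` is NLTK's SentiWordNet corpus reader, while `Datasource`, `Feature`,
and `english` come from revscoring.

from nltk.corpus import sentiwordnet as swn

from revscoring.datasources import Datasource
from revscoring.features import Feature
from revscoring.languages import english
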
def get_polarity_score(non_stop_tokens):
    """
    Gets the positive and negative polarity of the document using SentiWordnet
    takes the most common sense of the word for efficiency
    """
    pos, neg = 0.0, 0.0
    for t in non_stop_tokens:
        synsets = list(swn.senti_synsets(t))
        if synsets:
            pos += synsets[0].pos_score()
            neg += synsets[0].neg_score()
    return [pos, neg]


sentiment_score = Datasource("english.sentiment.revision.polarity_score",
                             get_polarity_score,
                             depends_on=[english.stopwords.revision.datasources.non_stopwords])  # noqa: E501


def get_positive_score(senti_score):
    return senti_score[0]


def get_negative_score(senti_score):
    return senti_score[1]


positive_polarity = Feature(
    "english.sentiment.revision.positive_polarity",
    get_positive_score,
    depends_on=[sentiment_score],