Esempio n. 1
0
def test_sum():
    """Check the scalar form of ``aggregators.sum``.

    Solves the feature against a populated list, an empty list and a
    ``None`` value, then checks its string form and a pickle round trip.
    """
    my_list = Datasource("my_list")
    my_sum = aggregators.sum(my_list)

    # (cached datasource value, expected sum); empty and None both give 0.
    expectations = (
        ([1, 2, 3, 4], 10),
        ([], 0),
        (None, 0),
    )
    for cached_value, expected_total in expectations:
        assert solve(my_sum, cache={my_list: cached_value}) == expected_total

    assert str(my_sum) == "feature.sum(<datasource.my_list>)"

    restored = pickle.loads(pickle.dumps(my_sum))
    assert restored == my_sum
Esempio n. 2
0
def test_sum_vectors():
    """Check the vector form of ``aggregators.sum`` (``vector=True``).

    Covers an element-wise sum over two rows, the degenerate inputs
    ``[[]]`` and ``[None]`` (both expected to yield ``[0]``), the string
    form of the feature, and a pickle round trip.
    """
    my_list = Datasource("my_list")
    my_sum = aggregators.sum(my_list, vector=True)
    cache = {my_list: [[1, 2, 3], [4, 5, 6]]}
    # Compare the whole result as a list. zip() stops at the shorter
    # input, so a pairwise all(...) over zip would also pass for an empty
    # or truncated result and hide a length regression.
    assert list(solve(my_sum, cache=cache)) == [5, 7, 9]
    cache = {my_list: [[]]}
    assert solve(my_sum, cache=cache) == [0]
    cache = {my_list: [None]}
    assert solve(my_sum, cache=cache) == [0]
    assert str(my_sum) == "feature_vector.sum(<datasource.my_list>)"

    assert pickle.loads(pickle.dumps(my_sum)) == my_sum
Esempio n. 3
0
    r"File|Image\:", name="enwiki.revision.image_links")

# References
revision = Revision("enwiki.revision.revision", wikitext.revision.datasources)

# Every item of paragraphs_sentences_and_whitespace, converted with str().
paragraphs = mappers.map(
    str,
    revision.paragraphs_sentences_and_whitespace,
    name="enwiki.revision.paragraphs",
)

# The pattern accepts text that is not whitespace-only and in which the
# literal "<ref>" never occurs (tags with attributes, e.g. <ref name=...>,
# are not excluded by it).
paragraphs_without_refs = filters.regex_matching(
    r"^(?!\s*$)((?!<ref>)(.|\n))*$",
    paragraphs,
    name="enwiki.revision.paragraphs_without_refs",
)

# Sum of len() over the filtered paragraphs.
paragraphs_without_refs_total_length = aggregators.sum(
    mappers.map(len, paragraphs_without_refs),
    name="enwiki.revision.paragraphs_without_refs_total_length",
)

local_wiki = [
    image_links,
    image_links / max(wikitext.revision.content_chars, 1),
    category_links,
    category_links / max(wikitext.revision.content_chars, 1),
    cite_templates,
    cite_templates / max(wikitext.revision.content_chars, 1),
    proportion_of_templated_references,
    non_templated_references,
    non_templated_references / max(wikitext.revision.content_chars, 1),
    non_cite_templates,
    non_cite_templates / max(wikitext.revision.content_chars, 1),
    infobox_templates,
Esempio n. 4
0
# Language maps built by process_normalized_lang_map from the revision text
# and from the parent revision text respectively.
revision_lang_map = Datasource(
    "revision.lang_map",
    process_normalized_lang_map,
    depends_on=[ro.revision.text],
)
parent_lang_map = Datasource(
    "revision.parent.lang_map",
    process_normalized_lang_map,
    depends_on=[ro.revision.parent.text],
)

# Parent map laid out as a float vector keyed by ALL_NORMALIZED_LANGS.
parent_lang_vector = vectorizers.vectorize(
    parent_lang_map,
    keys=ALL_NORMALIZED_LANGS,
    returns=float,
    name="revision.parent.lang_vector",
)

# Delta between the parent map and the revision map, plus its vector form.
lang_delta = frequencies.delta(parent_lang_map, revision_lang_map)
lang_delta_vector = vectorizers.vectorize(
    lang_delta,
    keys=ALL_NORMALIZED_LANGS,
    returns=float,
    name="revision.diff.lang_delta_vector",
)

# Sum of the absolute values held in the delta.
lang_delta_sum_diff = aggregators.sum(
    mappers.abs(dicts.values(lang_delta)),
    name="revision.diff.lang_delta_sum_diff",
)


def process_title_lang_match(title_lang, lang_delta):
    """Return the entry for *title_lang* in *lang_delta*, or 0.0 if absent.

    Despite its name, *lang_delta* only needs to offer a dict-style
    ``get``; the features in this module pass both language maps and
    language deltas through this function.
    """
    fallback = 0.0  # reported when the title language has no entry
    matched_value = lang_delta.get(title_lang, fallback)
    return matched_value


# Float feature computed by process_title_lang_match from the translation
# title language and the parent revision's language map.
parent_lang_match = Feature(
    "revision.parent.lang_match",
    process_title_lang_match,
    depends_on=[translation_title_lang, parent_lang_map],
    returns=float,
)
match_lang_delta = Feature("revision.diff.match_lang_delta",
                           process_title_lang_match,
                           depends_on=[translation_title_lang,
Esempio n. 5
0
# References
revision = Revision("glwiki.revision.revision", wikitext.revision.datasources)

# Every item of paragraphs_sentences_and_whitespace, converted with str().
paragraphs = mappers.map(str,
                         revision.paragraphs_sentences_and_whitespace,
                         name="glwiki.revision.paragraphs")

# The pattern accepts text that is not whitespace-only and in which the
# literal "<ref>" never occurs.
paragraphs_without_refs = filters.regex_matching(
    r"^(?!\s*$)((?!<ref>)(.|\n))*$",
    paragraphs,
    name="glwiki.revision.paragraphs_without_refs")

# Sum of len() over the filtered paragraphs.
paragraphs_without_refs_total_length = aggregators.sum(
    mappers.map(len, paragraphs_without_refs),
    name="glwiki.revision.paragraphs_without_refs_total_length")

# glwiki-specific features. Each count appears next to the same count
# divided by content_chars; max(..., 1) keeps the divisor at 1 or above.
local_wiki = [
    # Image links
    image_links,
    image_links / max(wikitext.revision.content_chars, 1),
    # Category links
    category_links,
    category_links / max(wikitext.revision.content_chars, 1),
    # cn templates (the raw count is offset by one)
    cn_templates + 1,
    cn_templates / max(wikitext.revision.content_chars, 1),
    # Length of paragraphs without references (log of length + 1, and ratio)
    log(paragraphs_without_refs_total_length + 1),
    (paragraphs_without_refs_total_length /
     max(wikitext.revision.content_chars, 1)),
]

# wp10 feature set: wikipedia.article followed by the glwiki features.
wp10 = wikipedia.article + local_wiki