Ejemplo n.º 1
0
def test_len_vectors():
    """aggregators.len with vector=True yields one length per input row.

    Empty rows and None rows both count as length 0, and the feature
    must survive a pickle round-trip.
    """
    my_list = Datasource("my_list")
    my_len = aggregators.len(my_list, vector=True)

    # BUG FIX: the two rows have 3 elements each, so the expected result is
    # [3, 3].  The old `all(a == b for a, b in zip(..., [2, 2, 2]))` both
    # asserted the wrong values and silently truncated on length mismatch;
    # a direct list comparison (as used below) is strict and correct.
    cache = {my_list: [[1, 2, 3], [4, 5, 6]]}
    assert solve(my_len, cache=cache) == [3, 3]

    # An empty row still produces a length (0), not an error.
    cache = {my_list: [[]]}
    assert solve(my_len, cache=cache) == [0]
    # A None row is treated the same as an empty one.
    cache = {my_list: [None]}
    assert solve(my_len, cache=cache) == [0]

    assert pickle.loads(pickle.dumps(my_len)) == my_len
Ejemplo n.º 2
0
def test_len():
    """Scalar aggregators.len: counts items, maps [] and None to 0,
    and survives a pickle round-trip."""
    my_list = Datasource("my_list")
    my_len = aggregators.len(my_list)

    assert solve(my_len, cache={my_list: [1, 2, 3, 4]}) == 4
    assert solve(my_len, cache={my_list: []}) == 0
    assert solve(my_len, cache={my_list: None}) == 0

    assert pickle.loads(pickle.dumps(my_len)) == my_len
Ejemplo n.º 3
0

def vectorize_words(words):
    """Map *words* to vectors via the enwiki word2vec keyed vectors."""
    keyed_vectors = enwiki_kvs
    return vectorizers.word2vec.vectorize_words(keyed_vectors, words)


# Datasource: the revision's words, lower-cased, mapped through the
# word2vec vectorizer (one vector per word).
revision_text_vectors = vectorizers.word2vec(mappers.lower_case(
    wikitext.revision.datasources.words),
                                             vectorize_words,
                                             name="revision.text.en_vectors")

# Mean of all word vectors -- a single embedding for the revision text.
w2v = aggregators.mean(revision_text_vectors,
                       vector=True,
                       name="revision.text.en_vectors_mean")

# Tokens matching English female/male pronouns.
female_pronouns = wikitext.revision.datasources.tokens_matching(
    r"\b(she|her|hers)\b")
male_pronouns = wikitext.revision.datasources.tokens_matching(
    r"\b(he|him|his)\b")
female_pronouns_count = aggregators.len(female_pronouns)
male_pronouns_count = aggregators.len(male_pronouns)

# Per-gender counts, their total, and the female share of all gendered
# pronouns; max(..., 1) guards the division against a zero denominator.
pronoun_features = [
    female_pronouns_count, male_pronouns_count,
    female_pronouns_count + male_pronouns_count, female_pronouns_count /
    modifiers.max(female_pronouns_count + male_pronouns_count, 1)
]

# Both topic models use the same feature set.
drafttopic = [w2v] + pronoun_features
articletopic = drafttopic
Ejemplo n.º 4
0
# Datasource of "important" language codes that have a label on the item.
# NOTE(review): _process_important_label_translations is defined elsewhere
# in this file -- presumably the label analogue of the description
# processor; confirm against its definition.
important_label_translations = Datasource(
    name + ".revision.important_label_translations",
    _process_important_label_translations,
    depends_on=[wikibase_.revision.datasources.labels])


def _process_important_description_translations(item_descriptions):
    """Return the set of important language codes that have a description."""
    return {lang for lang in item_descriptions
            if lang in IMPORTANT_LANG_CODES}


# Datasource of "important" language codes that have a description.
important_description_translations = Datasource(
    name + ".revision.important_description_translations",
    _process_important_description_translations,
    depends_on=[wikibase_.revision.datasources.descriptions])

source_claims_count = aggregators.len(source_claims)
"`int` : A count of all sources in the revision"

wikimedia_sources_count = aggregators.len(wikimedia_sources)
"`int` : A count of all sources which come from Wikimedia projects"

# Derived feature: sources that are not hosted on Wikimedia projects.
external_sources_count = source_claims_count - wikimedia_sources_count
"`int` : A count of all sources which do not come from Wikimedia projects"

unique_sources_count = aggregators.len(unique_sources)
"`int` : A count of unique sources in the revision"

# Status
# Whether the item has an INSTANCE_OF claim with value HUMAN
# (presumably a boolean feature -- confirm against has_property_value).
is_human = wikibase_.revision.has_property_value(properties.INSTANCE_OF,
                                                 items.HUMAN,
                                                 name=name + '.is_human')
Ejemplo n.º 5
0
    name + ".revision.important_label_translations",
    _process_important_label_translations,
    depends_on=[wikibase_.revision.datasources.labels])


def _process_important_description_translations(item_descriptions):
    """Return the set of important language codes that have a description."""
    described_langs = item_descriptions.keys()
    return described_langs & IMPORTANT_LANG_CODES


# Datasource of "important" language codes that have a description.
important_description_translations = Datasource(
    name + ".revision.important_description_translations",
    _process_important_description_translations,
    depends_on=[wikibase_.revision.datasources.descriptions])


references_count = aggregators.len(references)
"`int` : A count of all sources in the revision"

wikimedia_references_count = aggregators.len(wikimedia_references)
"`int` : A count of all sources which come from Wikimedia projects"

# Derived feature: references not hosted on Wikimedia projects.
external_references_count = references_count - wikimedia_references_count
"`int` : A count of all sources which do not come from Wikimedia projects"

unique_references_count = aggregators.len(unique_references)
"`int` : A count of unique sources in the revision"

# Status
# Whether the item has an INSTANCE_OF claim with value HUMAN
# (presumably a boolean feature -- confirm against has_property_value).
is_human = wikibase_.revision.has_property_value(
    properties.INSTANCE_OF, items.HUMAN, name=name + '.revision.is_human')
has_birthday = wikibase_.revision.has_property(
Ejemplo n.º 6
0
# TODO: This ends up being case insensitive even though that doesn't
#       make any sense.
weird_regexes = [
    # capital letters in the middle of a word
    r'\w[^\WA-Z\u00c0-\u00dd]*[A-Z\u00c0-\u00dd][^\WA-Z\u00c0-\u00dd]+',
    # non-text chars in the middle of a word
    r'\w+[^\w\s]\w+',
    # not actually french quotes e.g. "<<" and ">>" as opposed to « or »
    r'<<|>>'
]
weird_word_things = RegexMatches(
    "wikitext.revision.weird_word_things", weird_regexes)

# proportion of brackets and semi-colons
# NOTE(review): despite the comment above, this is a raw count (len),
# not a proportion.
nonsense_markup = aggregators.len(
    wikitext.revision.datasources.tokens_matching(r"[\{\}\[\]\|\;\\\/\:]"),
    name="wikitext.revision.nonsense_markup")

# <ref name="derp">...</ref> (in another page
# <ref following="derp" name="otherderp">...</ref>)
# TODO

# <big>,<small>,<center>,<div>,<span>,<b>,<i>,<poem>,<section>,''',''
good_tags = wikitext.revision.tag_names_matching(
    r"big|small|center|div|span|b|i|poem|section",
    name="wikitext.revision.good_tags")
# Count of bold/italic wiki-markup tokens ("'''" and "''").
# NOTE(review): "wiktext" in the name below looks like a typo for
# "wikitext" -- left unchanged because feature names may be baked into
# trained model files.
expected_markup = aggregators.len(
    wikitext.revision.datasources.tokens_matching(r"'''|''"),
    name="wiktext.revision.expected_markup")

page = [
Ejemplo n.º 7
0

def _process_non_external_id_statements(entity):
    """Collect statements whose property is not an external identifier."""
    collected = []
    for pid, statements in entity.properties.items():
        if pid in property_datatypes.NONEXTERNAL_IDENTIFIERS:
            collected.extend(statements)
    return collected


# Datasource of the entity's statements whose property datatype is not an
# external identifier (see _process_non_external_id_statements above).
non_external_id_statements = Datasource(
    name + ".revision.non_external_id_statements",
    _process_non_external_id_statements,
    depends_on=[wikibase_.revision.datasources.entity])

references_count = aggregators.len(references)
"`int` : A count of all sources in the revision"

wikimedia_references_count = aggregators.len(wikimedia_references)
"`int` : A count of all sources which come from Wikimedia projects"

# Derived feature: references not hosted on Wikimedia projects.
external_references_count = references_count - wikimedia_references_count
"`int` : A count of all sources which do not come from Wikimedia projects"

unique_references_count = aggregators.len(unique_references)
"`int` : A count of unique sources in the revision"

non_external_id_statements_count = aggregators.len(non_external_id_statements)
"`int` : A count of all statements that are not external identifiers"

Ejemplo n.º 8
0
def test_external_identifiers(q7251):
    """The q7251 fixture should yield exactly 79 external identifiers."""
    ext_id_count = aggregators.len(wikidatawiki.external_identifiers)
    assert solve(ext_id_count, cache={entity: q7251}) == 79
Ejemplo n.º 9
0
    name + ".revision.important_label_translations",
    _process_important_label_translations,
    depends_on=[wikibase_.revision.datasources.labels])


def _process_important_description_translations(item_descriptions):
    """Return the set of important language codes that have a description."""
    return {code for code in item_descriptions
            if code in IMPORTANT_LANG_CODES}


# Datasource of "important" language codes that have a description.
important_description_translations = Datasource(
    name + ".revision.important_description_translations",
    _process_important_description_translations,
    depends_on=[wikibase_.revision.datasources.descriptions])


source_claims_count = aggregators.len(source_claims)
"`int` : A count of all sources in the revision"

wikimedia_sources_count = aggregators.len(wikimedia_sources)
"`int` : A count of all sources which come from Wikimedia projects"

# Derived feature: sources that are not hosted on Wikimedia projects.
external_sources_count = source_claims_count - wikimedia_sources_count
"`int` : A count of all sources which do not come from Wikimedia projects"

unique_sources_count = aggregators.len(unique_sources)
"`int` : A count of unique sources in the revision"

# Status
# Whether the item has an INSTANCE_OF claim with value HUMAN
# (presumably a boolean feature -- confirm against has_property_value).
is_human = wikibase_.revision.has_property_value(
    properties.INSTANCE_OF, items.HUMAN, name=name + '.is_human')
has_birthday = wikibase_.revision.has_property(
Ejemplo n.º 10
0
    name + ".revision.important_label_translations",
    _process_important_label_translations,
    depends_on=[wikibase_.revision.datasources.labels])


def _process_important_description_translations(item_descriptions):
    """Return the set of important language codes that have a description."""
    available = item_descriptions.keys()
    return available & IMPORTANT_LANG_CODES


# Datasource of "important" language codes that have a description.
important_description_translations = Datasource(
    name + ".revision.important_description_translations",
    _process_important_description_translations,
    depends_on=[wikibase_.revision.datasources.descriptions])


references_count = aggregators.len(references)
"`int` : A count of all sources in the revision"

wikimedia_references_count = aggregators.len(wikimedia_references)
"`int` : A count of all sources which come from Wikimedia projects"

# Derived feature: references not hosted on Wikimedia projects.
external_references_count = references_count - wikimedia_references_count
"`int` : A count of all sources which do not come from Wikimedia projects"

unique_references_count = aggregators.len(unique_references)
"`int` : A count of unique sources in the revision"


def _process_item_completeness(current_properties, properties_suggested):
    current_properties = set(current_properties.keys())
Ejemplo n.º 11
0
                                        _process_important_translations_labels,
                                        depends_on=[item_labels_datasource],
                                        returns=float)
"`float` : A ratio of important translations labels in the revision"

# Feature: share of "important" translations among the item's descriptions.
important_translations_descriptions = Feature(
    name + ".important_translations_descriptions",
    _process_important_translations_descriptions,
    depends_on=[item_descriptions_datasource],
    returns=float)
"`float` : A ratio of important translations descriptions in the revision"

# Datasource of every source on the item; counted below.
all_sources_datasource = Datasource(name + ".all_sources",
                                    _process_all_sources,
                                    depends_on=[item])
all_sources = aggregators.len(all_sources_datasource)
"`int` : A count of all sources in the revision"

# Filters the sources above down to Wikimedia-hosted ones
# (note it depends on all_sources_datasource, not on item directly).
all_wikimedia_sources_datasource = Datasource(
    name + ".all_wikimedia_sources",
    _process_wikimedia_sources,
    depends_on=[all_sources_datasource])
all_wikimedia_sources = aggregators.len(all_wikimedia_sources_datasource)
"`int` : A count of all sources which come from Wikimedia projects in the revision"

all_external_sources = modifiers.sub(all_sources, all_wikimedia_sources)
"A count of all sources which do not come from Wikimedia projects in the revision"

# max(..., 1) guards the division against a zero denominator.
external_sources_ratio = all_external_sources / modifiers.max(
    wikibase_features.revision.sources, 1)
"A ratio/division between number of external references and number of claims that have references in the revision"