def headings_by_level(self, level, name=None):
    """
    Constructs a :class:`revscoring.Datasource` that generates a `list`
    of all headings of the given level.
    """
    if name is None:
        name = "{0}({1})".format(self._name + ".headings_by_level", level)

    return filters.filter(HeadingOfLevel(level).filter, self.headings,
                          name=name)
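# A minimal usage sketch, not part of the original module: resolve a
# level-2 headings datasource directly from raw text.  This assumes the
# access pattern `wikitext.revision.datasources` used elsewhere in this
# codebase, with the raw text injected through the solve() cache.
from revscoring.dependencies import solve
from revscoring.datasources import revision_oriented
from revscoring.features import wikitext

lvl2_headings = wikitext.revision.datasources.headings_by_level(2)
print(solve(lvl2_headings, cache={
    revision_oriented.revision.text: "== History ==\nSome text."}))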
def tokens_removed_in_types(self, types, name=None):
    """
    Constructs a :class:`revscoring.Datasource` that represents tokens
    removed that are within a set of types.
    """
    types = set(types)
    if name is None:
        name = "{0}({1})".format(self._name + ".tokens_removed_in_types",
                                 types)

    return filters.filter(TokenIsInTypes(types).filter,
                          self.tokens_removed, name=name)
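# For reference, a sketch of the predicate helper used above.  The real
# `TokenIsInTypes` lives alongside these methods; this stand-in is only
# illustrative of the contract: `filter` returns True for tokens whose
# `type` attribute falls within the configured set.
class _TokenIsInTypesSketch:

    def __init__(self, types):
        self.types = set(types)

    def filter(self, token):
        return token.type in self.types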
def tokens_in_types(self, types, name=None):
    """
    Constructs a :class:`revscoring.Datasource` that returns all content
    tokens that are within a set of types.
    """
    token_is_in_types = TokenIsInTypes(types)

    if name is None:
        name = "{0}({1})".format(self._name + ".tokens_in_types", types)

    return filters.filter(token_is_in_types.filter, self.tokens, name=name)
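# Usage sketch (illustrative, not from the original module): the
# *_in_types constructors make it easy to derive new token-class
# datasources beyond the prebuilt ones.  `math_ish_tokens` is a
# hypothetical name; 'number' and 'equals' are token types used below.
from revscoring.features import wikitext

math_ish_tokens = wikitext.revision.datasources.tokens_in_types(
    {'number', 'equals'}, name="math_ish_tokens")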
def __init__(self, name, revision_datasources):
    super().__init__(name, revision_datasources)

    self.tokens = tokenized(revision_datasources.text)
    """
    A list of all tokens
    """

    self.paragraphs_sentences_and_whitespace = Datasource(
        self._name + ".paragraphs_sentences_and_whitespace",
        paragraphs_sentences_and_whitespace.segment,
        depends_on=[self.tokens]
    )
    """
    A list of paragraphs, sentences, and whitespaces as segments.  See
    :class:`deltas.segmenters.Segment` and
    :class:`deltas.segmenters.MatchableSegment`.
    """

    self.token_frequency = frequencies.table(
        self.tokens, name=self._name + ".token_frequency"
    )
    """
    A frequency table of all tokens.
    """

    self.numbers = self.tokens_in_types(
        {'number'}, name=self._name + ".numbers"
    )
    """
    A list of numeric tokens
    """

    self.number_frequency = frequencies.table(
        self.numbers, name=self._name + ".number_frequency"
    )
    """
    A frequency table of number tokens.
    """

    self.whitespaces = self.tokens_in_types(
        {'whitespace'}, name=self._name + ".whitespaces"
    )
    """
    A list of whitespace tokens
    """

    self.whitespace_frequency = frequencies.table(
        self.whitespaces, name=self._name + ".whitespace_frequency"
    )
    """
    A frequency table of whitespace tokens.
    """

    self.markups = self.tokens_in_types(
        {'dbrack_open', 'dbrack_close', 'brack_open', 'brack_close',
         'tab_open', 'tab_close', 'dcurly_open', 'dcurly_close',
         'curly_open', 'curly_close', 'bold', 'italics', 'equals'},
        name=self._name + ".markups"
    )
    """
    A list of markup tokens
    """

    self.markup_frequency = frequencies.table(
        self.markups, name=self._name + ".markup_frequency"
    )
    """
    A frequency table of markup tokens.
    """

    self.cjks = self.tokens_in_types(
        {'cjk'}, name=self._name + ".cjks"
    )
    """
    A list of Chinese/Japanese/Korean tokens
    """

    self.cjk_frequency = frequencies.table(
        self.cjks, name=self._name + ".cjk_frequency"
    )
    """
    A frequency table of CJK tokens.
    """

    self.entities = self.tokens_in_types(
        {'entity'}, name=self._name + ".entities"
    )
    """
    A list of HTML entity tokens
    """

    self.entity_frequency = frequencies.table(
        self.entities, name=self._name + ".entity_frequency"
    )
    """
    A frequency table of entity tokens.
    """

    self.urls = self.tokens_in_types(
        {'url'}, name=self._name + ".urls"
    )
    """
    A list of URL tokens
    """

    self.url_frequency = frequencies.table(
        self.urls, name=self._name + ".url_frequency"
    )
    """
    A frequency table of URL tokens.
    """

    self.words = self.tokens_in_types(
        {'word'}, name=self._name + ".words"
    )
    """
    A list of word tokens
    """

    self.word_frequency = frequencies.table(
        mappers.lower_case(self.words),
        name=self._name + ".word_frequency"
    )
    """
    A frequency table of lower-cased word tokens.
    """

    self.uppercase_words = filters.filter(
        is_uppercase_word, self.words,
        name=self._name + ".uppercase_words"
    )
    """
    A list of uppercase word tokens that are at least two
    characters long.
    """

    self.uppercase_word_frequency = frequencies.table(
        self.uppercase_words,
        name=self._name + ".uppercase_word_frequency"
    )
    """
    A frequency table of uppercase word tokens that are at least two
    characters long.
    """

    self.punctuations = self.tokens_in_types(
        {'period', 'qmark', 'epoint', 'comma', 'colon', 'scolon',
         'japan_punct'},
        name=self._name + ".punctuations"
    )
    """
    A list of punctuation tokens
    """

    self.punctuation_frequency = frequencies.table(
        self.punctuations, name=self._name + ".punctuation_frequency"
    )
    """
    A frequency table of punctuation tokens.
    """

    self.breaks = self.tokens_in_types(
        {'break'}, name=self._name + ".breaks"
    )
    """
    A list of break tokens
    """

    self.break_frequency = frequencies.table(
        self.breaks, name=self._name + ".break_frequency"
    )
    """
    A frequency table of break tokens.
    """
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)

    self.operations = Datasource(
        self._name + ".operations", _process_operations,
        depends_on=[
            self.revision.parent.paragraphs_sentences_and_whitespace,
            self.revision.paragraphs_sentences_and_whitespace,
            self.revision.parent.tokens,
            self.revision.tokens]
    )
    """
    Returns a tuple that describes the difference between the parent
    revision text and the current revision's text.

    The tuple contains three fields:

    * operations: `list` of :class:`deltas.Operation`
    * A tokens: `list` of `str`
    * B tokens: `list` of `str`
    """

    self.segments_added = Datasource(
        self._name + ".segments_added",
        _process_segments_added, depends_on=[self.operations]
    )
    """
    Returns a list of all contiguous segments of tokens added in this
    revision.
    """

    self.segments_removed = Datasource(
        self._name + ".segments_removed",
        _process_segments_removed, depends_on=[self.operations]
    )
    """
    Returns a list of all contiguous segments of tokens removed in this
    revision.
    """

    self.tokens_added = Datasource(
        self._name + ".tokens_added",
        _process_tokens_added, depends_on=[self.operations]
    )
    """
    Constructs a :class:`revscoring.Datasource` that returns a list of
    all tokens added in this revision.
    """

    self.tokens_removed = Datasource(
        self._name + ".tokens_removed",
        _process_tokens_removed, depends_on=[self.operations]
    )
    """
    Constructs a :class:`revscoring.Datasource` that returns a list of
    all tokens removed in this revision.
    """

    self.numbers_added = self.tokens_added_in_types(
        {'number'}, name=self._name + ".numbers_added"
    )
    """
    A list of numeric tokens added in the edit
    """

    self.numbers_removed = self.tokens_removed_in_types(
        {'number'}, name=self._name + ".numbers_removed"
    )
    """
    A list of numeric tokens removed in the edit
    """

    self.whitespaces_added = self.tokens_added_in_types(
        {'whitespace'}, name=self._name + ".whitespaces_added"
    )
    """
    A list of whitespace tokens added in the edit
    """

    self.whitespaces_removed = self.tokens_removed_in_types(
        {'whitespace'}, name=self._name + ".whitespaces_removed"
    )
    """
    A list of whitespace tokens removed in the edit
    """

    self.markups_added = self.tokens_added_in_types(
        {'dbrack_open', 'dbrack_close', 'brack_open', 'brack_close',
         'tab_open', 'tab_close', 'dcurly_open', 'dcurly_close',
         'curly_open', 'curly_close', 'bold', 'italics', 'equals'},
        name=self._name + ".markups_added"
    )
    """
    A list of markup tokens added in the edit
    """

    self.markups_removed = self.tokens_removed_in_types(
        {'dbrack_open', 'dbrack_close', 'brack_open', 'brack_close',
         'tab_open', 'tab_close', 'dcurly_open', 'dcurly_close',
         'curly_open', 'curly_close', 'bold', 'italics', 'equals'},
        name=self._name + ".markups_removed"
    )
    """
    A list of markup tokens removed in the edit
    """

    self.cjks_added = self.tokens_added_in_types(
        {'cjk'}, name=self._name + ".cjks_added"
    )
    """
    A list of Chinese/Japanese/Korean tokens added in the edit
    """

    self.cjks_removed = self.tokens_removed_in_types(
        {'cjk'}, name=self._name + ".cjks_removed"
    )
    """
    A list of Chinese/Japanese/Korean tokens removed in the edit
    """

    self.entities_added = self.tokens_added_in_types(
        {'entity'}, name=self._name + ".entities_added"
    )
    """
    A list of HTML entity tokens added in the edit
    """

    self.entities_removed = self.tokens_removed_in_types(
        {'entity'}, name=self._name + ".entities_removed"
    )
    """
    A list of HTML entity tokens removed in the edit
    """

    self.urls_added = self.tokens_added_in_types(
        {'url'}, name=self._name + ".urls_added"
    )
    """
    A list of URL tokens added in the edit
    """

    self.urls_removed = self.tokens_removed_in_types(
        {'url'}, name=self._name + ".urls_removed"
    )
    """
    A list of URL tokens removed in the edit
    """

    self.words_added = self.tokens_added_in_types(
        {'word'}, name=self._name + ".words_added"
    )
    """
    A list of word tokens added in the edit
    """

    self.words_removed = self.tokens_removed_in_types(
        {'word'}, name=self._name + ".words_removed"
    )
    """
    A list of word tokens removed in the edit
    """

    self.uppercase_words_added = filters.filter(
        is_uppercase_word, self.words_added,
        name=self._name + ".uppercase_words_added"
    )
    """
    A list of fully UPPERCASE word tokens added in the edit
    """

    self.uppercase_words_removed = filters.filter(
        is_uppercase_word, self.words_removed,
        name=self._name + ".uppercase_words_removed"
    )
    """
    A list of fully UPPERCASE word tokens removed in the edit
    """

    self.punctuations_added = self.tokens_added_in_types(
        {'period', 'qmark', 'epoint', 'comma', 'colon', 'scolon',
         'japan_punct'},
        name=self._name + ".punctuations_added"
    )
    """
    A list of punctuation tokens added in the edit
    """

    self.punctuations_removed = self.tokens_removed_in_types(
        {'period', 'qmark', 'epoint', 'comma', 'colon', 'scolon',
         'japan_punct'},
        name=self._name + ".punctuations_removed"
    )
    """
    A list of punctuation tokens removed in the edit
    """

    self.breaks_added = self.tokens_added_in_types(
        {'break'}, name=self._name + ".breaks_added"
    )
    """
    A list of break tokens added in the edit
    """

    self.breaks_removed = self.tokens_removed_in_types(
        {'break'}, name=self._name + ".breaks_removed"
    )
    """
    A list of break tokens removed in the edit
    """
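# A usage sketch for the diff datasources above (assumes the standard
# `wikitext.revision.diff.datasources` access pattern).  Diff
# datasources need both the parent and the current revision text in
# the cache.
from revscoring.dependencies import solve
from revscoring.datasources import revision_oriented
from revscoring.features import wikitext

cache = {
    revision_oriented.revision.parent.text: "This is a sentence.",
    revision_oriented.revision.text: "This is a changed sentence.",
}
print(solve(wikitext.revision.diff.datasources.words_added, cache=cache))
# e.g. ['changed']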
def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.operations = Datasource( self._name + ".operations", _process_operations, depends_on=[ self.revision.parent.paragraphs_sentences_and_whitespace, self.revision.paragraphs_sentences_and_whitespace, self.revision.parent.tokens, self.revision.tokens ]) """ Returns a tuple that describes the difference between the parent revision text and the current revision's text. The tuple contains three fields: * operations: `list` of :class:`deltas.Operation` * A tokens: `list` of `str` * B tokens: `list` of `str` """ self.segments_added = Datasource(self._name + ".segments_added", _process_segments_added, depends_on=[self.operations]) """ Returns a list of all contiguous segments of tokens added in this revision. """ self.segments_removed = Datasource(self._name + ".segments_removed", _process_segments_removed, depends_on=[self.operations]) """ Returns a list of all contiguous segments of tokens removed in this revision. """ self.tokens_added = Datasource(self._name + ".tokens_added", _process_tokens_added, depends_on=[self.operations]) """ Constructs a :class:`revscoring.Datasource` that returns a list of all tokens added in this revision. """ self.tokens_removed = Datasource(self._name + ".tokens_removed", _process_tokens_removed, depends_on=[self.operations]) """ Constructs a :class:`revscoring.Datasource` that returns a list of all tokens removed in this revision. """ self.numbers_added = self.tokens_added_in_types({'number'}, name=self._name + ".numbers_added") """ A list of numeric tokens added in the edit """ self.numbers_removed = self.tokens_removed_in_types({'number'}, name=self._name + ".numbers_removed") """ A list of numeric tokens removed in the edit """ self.whitespaces_added = self.tokens_added_in_types( {'whitespace'}, name=self._name + ".whitespaces_added") """ A list of whitespace tokens added in the edit """ self.whitespaces_removed = self.tokens_removed_in_types( {'whitespace'}, name=self._name + ".whitespaces_removed") """ A list of whitespace tokens removed in the edit """ self.markups_added = self.tokens_added_in_types( { 'dbrack_open', 'dbrack_close', 'brack_open', 'brack_close', 'tab_open', 'tab_close', 'dcurly_open', 'dcurly_close', 'curly_open', 'curly_close', 'bold', 'italics', 'equals' }, name=self._name + ".markups_added") """ A list of markup tokens added in the edit """ self.markups_removed = self.tokens_removed_in_types( { 'dbrack_open', 'dbrack_close', 'brack_open', 'brack_close', 'tab_open', 'tab_close', 'dcurly_open', 'dcurly_close', 'curly_open', 'curly_close', 'bold', 'italics', 'equals' }, name=self._name + ".markups_removed") """ A list of markup tokens removed in the edit """ self.cjks_added = self.tokens_added_in_types({'cjk'}, name=self._name + ".cjks_added") """ A list of Chinese/Japanese/Korean tokens added in the edit """ self.cjks_removed = self.tokens_removed_in_types({'cjk'}, name=self._name + ".cjks_removed") """ A list of Chinese/Japanese/Korean tokens removed in the edit """ self.entities_added = self.tokens_added_in_types({'entity'}, name=self._name + ".entities_added") """ A list of HTML entity tokens added in the edit """ self.entities_removed = self.tokens_removed_in_types( {'entity'}, name=self._name + ".entities_removed") """ A list of HTML entity tokens removed in the edit """ self.urls_added = self.tokens_added_in_types({'url'}, name=self._name + ".urls_added") """ A list of URL tokens rempved in the edit """ self.urls_removed = self.tokens_removed_in_types({'url'}, 
name=self._name + ".urls_removed") """ A list of URL tokens added in the edit """ self.words_added = self.tokens_added_in_types({'word'}, name=self._name + ".words_added") """ A list of word tokens added in the edit """ self.words_removed = self.tokens_removed_in_types({'word'}, name=self._name + ".words_removed") """ A list of word tokens removed in the edit """ self.uppercase_words_added = filters.filter(is_uppercase_word, self.words_added, name=self._name + ".uppercase_words_added") """ A list of fully UPPERCASE word tokens added in the edit """ self.uppercase_words_removed = filters.filter( is_uppercase_word, self.words_removed, name=self._name + ".uppercase_words_removed") """ A list of fully UPPERCASE word tokens removed in the edit """ self.punctuations_added = self.tokens_added_in_types( { 'period', 'qmark', 'epoint', 'comma', 'colon', 'scolon', 'japan_punct' }, name=self._name + ".punctuations_added") """ A list of punctuation tokens added in the edit """ self.punctuations_removed = self.tokens_removed_in_types( { 'period', 'qmark', 'epoint', 'comma', 'colon', 'scolon', 'japan_punct' }, name=self._name + ".punctuations_removed") """ A list of punctuation tokens removed in the edit """ self.breaks_added = self.tokens_added_in_types({'break'}, name=self._name + ".breaks_added") """ A list of break tokens added in the edit """ self.breaks_removed = self.tokens_removed_in_types({'break'}, name=self._name + ".breaks_removed") """
images_in_tags + infobox_images

# References
def filter_paragraphs_without_ref_tags(segment):
    "Check to see if we have more than 10 words and no refs"
    words = 0
    refs = 0
    for t in segment.tokens():
        words += t.type == "word"
        refs += t.type in ("ref_open", "ref_close", "ref_singleton")

    return words > 10 and refs == 0

paragraphs_without_refs = filters.filter(
    filter_paragraphs_without_ref_tags,
    wikitext.revision.datasources.paragraphs_sentences_and_whitespace,
    name="ptwiki.revision.paragraphs_without_refs")

paragraphs_without_refs_total_length = aggregators.sum(
    mappers.map(len, mappers.map(str, paragraphs_without_refs)),
    name="ptwiki.revision.paragraphs_without_refs_total_length")

# Wikipedia:Manual of style/Words to watch
words_to_watch_count = portuguese.words_to_watch.revision.matches

local_wiki = [
    all_images,
    all_images / max(wikitext.revision.content_chars, 1),
    category_links,
    category_links / max(wikitext.revision.content_chars, 1),
    all_ref_tags,
    all_ref_tags / max(wikitext.revision.content_chars, 1),
    all_cite_templates,
    all_cite_templates / max(wikitext.revision.content_chars, 1),
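# A toy check of the paragraph filter defined above, using hypothetical
# stand-ins for deltas tokens and segments (real segments come from the
# paragraphs_sentences_and_whitespace segmenter).  The predicate only
# needs a `tokens()` method yielding objects with a `type` attribute.
from collections import namedtuple

_Tok = namedtuple("_Tok", ["type"])

class _SegmentStub:

    def __init__(self, types):
        self._tokens = [_Tok(t) for t in types]

    def tokens(self):
        return self._tokens

assert filter_paragraphs_without_ref_tags(_SegmentStub(["word"] * 11))
assert not filter_paragraphs_without_ref_tags(
    _SegmentStub(["word"] * 11 + ["ref_singleton"]))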