class TestTextCleaner(unittest.TestCase):
    """Unit tests for TextCleaner's text, HTML and wiki-syntax cleaners."""

    def setUp(self):
        self.textcleaner = TextCleaner()

    def test_clean(self):
        # Emoticons and dotted acronyms (U.S.A., F.B.K.) must be removed;
        # plain words and ordinary punctuation are preserved.
        t = (";D :E born in the U.S.A.! Yeah. A. :-D",
             "I feel sick today :S",
             ":My favourite TV series: The Big Bang Theory",
             "F.B.K.")
        e = ("born in the! Yeah. ",
             "I feel sick today ",
             ":My favourite TV series: The Big Bang Theory",
             "")
        for i, s in enumerate(t):
            # assertEqual: assertEquals is a deprecated alias in unittest
            self.assertEqual(self.textcleaner.clean_text(s), e[i])

    def test_clean_HTML(self):
        # Tags, comments and HTML entities must be stripped, keeping the
        # visible text content.
        t = ("<div><b>42</b> is the <a href='#'>answer</a></div>",
             "<span>Hello World</span>",
             "<!-- I mustn't read this --> Are comments being filtered?",
             "I don't & like HTML entities &dioji; LO&ppp;L")
        e = ("42 is the answer",
             "Hello World",
             " Are comments being filtered?",
             "I don't like HTML entities LOL")
        for i, s in enumerate(t):
            self.assertEqual(self.textcleaner.clean_html_syntax(s), e[i])

    def test_clean_wiki(self):
        # Templates, URLs, wiki links (plain, piped, namespaced, file and
        # interwiki links) and infobox fields must be cleaned up.
        t = ("Less taxes for everyone! {{citation needed}}",
             "look here http://google.it/a/lol.html lol :D http://wiki.com",
             "drink a relaxing [Jack Daniel's]",
             "If you want some [Wikipedia:Help] look here",
             "| name =goofy, |city =New York",
             "[File:Case di Volano.jpg|thumb|250px|Volano vista da un dosso]",
             "vicino a [[Calliano (Trentino-Alto Adige)|Calliano]] c'e' un",
             "[[nap:test:Volano (TN)]]",
             "andare in S.Marco",
             "[[Pagina|link fatto male poiche' manca una parentesi quadra " \
             "e c'e' caratteri strani dentro? ;)]",
             "[http://www.nps.gov/ Oklahoma City National Memorial] National")
        e = ("Less taxes for everyone! ",
             "look here lol :D ",
             "drink a relaxing Jack Daniel's",
             "If you want some Help look here",
             "",
             "Volano vista da un dosso",
             "vicino a Calliano c'e' un",
             "Volano (TN)",
             "andare in S.Marco",
             "link fatto male poiche' manca una parentesi quadra " \
             "e c'e' caratteri strani dentro? ;)",
             " Oklahoma City National Memorial National")
        for i, s in enumerate(t):
            self.assertEqual(self.textcleaner.clean_wiki_syntax(s), e[i])
def __init__(self, **kwargs):
    """Initialize parent state, the text cleaner and the CSV writer.

    Expects kwargs to contain at least "userns" (user namespace name,
    forwarded to TextCleaner) and the attributes consumed by the parent
    class (e.g. the ``output`` file object used below).
    """
    super(HistoryRevisionsPageProcessor, self).__init__(**kwargs)
    self.textcleaner = TextCleaner(kwargs["userns"])
    self.queue = []
    # Column order defines the layout of the output CSV
    columns = ["timestamp", "lang", "title", "type", "text"]
    self.csv_writer = csv.DictWriter(
        self.output,
        fieldnames=columns,
        delimiter='\t',
        quotechar='"',
        quoting=csv.QUOTE_ALL,
    )
def __init__(self, **kwargs):
    """Initialize parent state, the text cleaner and the CSV writer.

    Same as the sibling processor, but the CSV layout also carries the
    revision author in a "user" column.
    """
    super(HistoryRevisionsPageProcessor, self).__init__(**kwargs)
    self.textcleaner = TextCleaner(kwargs["userns"])
    self.queue = []
    # Column order defines the layout of the output CSV
    columns = ["timestamp", "lang", "title", "type", "user", "text"]
    self.csv_writer = csv.DictWriter(
        self.output,
        fieldnames=columns,
        delimiter='\t',
        quotechar='"',
        quoting=csv.QUOTE_ALL,
    )
def main():
    """Command-line entry point: dump the revisions of one page to CSV.

    Usage: prog [options] page_title output_file
    """
    import optparse
    p = optparse.OptionParser(
        usage="usage: %prog [options] page_title output_file")
    p.add_option('-l', '--lang', action="store", dest="lang", default="en",
                 help="Wikipedia language")
    p.add_option('-c', '--clean', action="store_true", dest="clean",
                 help="Clean wiki syntax / HTML")
    opts, files = p.parse_args()
    if len(files) != 2:
        p.error("Wrong parameters")

    logging.basicConfig(stream=sys.stderr,
                        level=logging.DEBUG,
                        format='%(asctime)s %(levelname)s %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S')

    # FIX: keep a handle on the output file and close it explicitly;
    # the original leaked the file object returned by open().
    out = open(files[1], "w")
    try:
        csv_writer = csv.writer(out, delimiter="\t", quotechar='"',
                                quoting=csv.QUOTE_ALL)
        # Only build a TextCleaner when cleaning was requested
        textcleaner = TextCleaner() if opts.clean else None
        get_revisions(files[0], csv_writer, opts.lang, textcleaner)
    finally:
        out.close()
class PyWC:
    """
    PyWC is a python class for word counting and text analisys.
    """
    # Global proprieties (of the whole source file)
    categories = None        # Dictionary's categories (id -> name)
    keywords = None          # Dictionary's keywords/regex (regex -> cat ids)
    delimiter = "\t"         # CSV delimiter
    quotechar = '"'          # CSV quotechar
    csv_out = sys.stdout     # CSV output
    queue = []               # Flushing queue
    max_char_limit = 100000  # Max chars per line
    ignorecols = []          # List of columns of the src file to ignore
    csv_writer = None        # Csv writer handler
    id_col = 0               # Number of the id column
    dic_regex = False        # Use dictionary made of regex
    flush_n = 100            # Number of pieces of text to store
    clean_wiki = None        # Clean wiki syntax
    clean_html = None        # Clean HTML
    percentage = False       # Output as percentage
    tuning = False           # Set tuning mode (no conditional dictionary
                             # but a lot faster!)
    detailed = False         # detailed PYWC output per keyword
    rwords = re.compile(r"[\w']+")
    rqmarks = re.compile(r"\?")
    textcleaner = TextCleaner()
    # Conditional-expression syntaxes supported by LIWC-style dictionaries:
    #   <word>cat/alt   -> category "cat" if next word matches, else "alt"
    #   (cats)cat/alt   -> category "cat" if previous word had one of "cats"
    cond_exp_regex = (re.compile(r"<([\w']+)>(\w+)(\/(\w+)?)?"),
                      re.compile(r"\(([\w\s]+)\)(\w+)(\/(\w+)?)?"))

    # Local proprieties (of every column of the source file)
    _id = None             # Line ID
    _results = None        # Dictionary where keys are cat ids and
                           # values are counters
    _qmarks = None         # Number of question marks
    _unique = None         # Set of unique words, len() of the set is the
                           # number of unique words
    _dic = None            # Number of words in dic
    _sixltr = None         # Number of words > 6 letters
    _total = None          # Number of total words per column
    _text = None           # Current text to analize
    _next_word = None      # Next word that has to be analized
    _prev_cat = None       # Categories of the last word that has been
                           # analized (useful for conditional exps)
    _counter = 0           # Generic counter of how many pieces of
                           # text have been analized
    _keys = None
    _detailed_data = None  # data only for detailed output

    def __init__(self, **kwargs):
        # NOTE: replaces the instance __dict__ wholesale; the class-level
        # defaults above still resolve through the class.
        self.__dict__ = kwargs

    def delattrs(self, attrs):
        """
        Frees memory deleting useless attributes of the object
        """
        for attr in attrs:
            try:
                delattr(self, attr)
            except AttributeError:
                pass

    def _gen_keyword(self, content):
        """
        Generator for self.keywords (dictionary made of of regexps as keys
        and thier categories as values)
        """
        for line in content[2].split("\n")[1:-1]:
            # Comments start with //
            if line and not line.startswith("//"):
                line = line.split("\t")
                # If not using a dictionary made of regexps
                # it fixes the keyword for regexping
                # "\b" is added at the beginning of every keyword
                # If keyword doesn't ends with "*", a "\b" is added
                # bad -> \bbad\b matches "bad" but not "badass"
                # bad* -> \bbad matches "bad" and "badass"
                if not self.dic_regex:
                    line[0] = "".join(["\\b", line[0]])
                    try:
                        if (line[0][-1] == "*"):
                            line[0] = line[0][:-1]
                        else:
                            line[0] = "".join([line[0], "\\b"])
                    except IndexError:
                        continue
                yield (re.compile(line[0], re.IGNORECASE), line[1:])

    def set_dic(self, dic):
        """
        Receives as input the dictionary filename. Reads the dictionary
        file and populates self.categories and self.keywords
        """
        # FIX: close the dictionary file (was left open)
        f = open(dic, 'r')
        try:
            content = f.read()
        finally:
            f.close()
        content = content.split("%")
        if len(content) != 3:
            raise ValueError("Invalid dic file")
        # Creates a dictionary where category ids are the keys
        # and category names are the values.
        # Splits content at first by new line, then by tab
        self.categories = dict(line.split("\t")
                               for line in content[1].split("\n")[1:-1]
                               if line)
        # Creates a dictionary where the compiled regex is the key
        # and category ids are the values
        self.keywords = dict(self._gen_keyword(content))

    def flush(self):
        """
        Writes everything which is in the queue in the csv output file
        """
        self.csv_writer.writerows(self.queue)
        self.queue = []

    def save(self):
        """
        Saves current piece of text that has been analized to the queue
        """
        tmp = {
            "id": self._id,
            "qmarks": perc(self._qmarks, self._total, self.percentage),
            "unique": perc(len(self._unique), self._total, self.percentage),
            "dic": perc(self._dic, self._total, self.percentage),
            "sixltr": perc(self._sixltr, self._total, self.percentage),
            "total": self._total,
            "text": self._text
        }
        # Join of self.categories and self._results values
        for k, v in ((self.categories[x],
                      perc(self._results[x], self._total, self.percentage))
                     for x in self.categories):
            tmp[k] = v
        self.queue.append(tmp)
        del tmp
        self._counter += 1
        if self._counter % self.flush_n == 0:
            logging.info("### Flushing: %d", self._counter)
            self.flush()

    def parse_word(self, word):
        """
        Parses a single word with the dictionary of regexps
        (self.keywords). For every regex that matches, it increments
        every category they belong to in self._result
        """
        if not self.tuning:
            cat = []
            for regex in self.keywords:
                if regex.search(word):
                    if self.detailed:
                        self._detailed_data[regex.pattern] += 1
                    for i in self.keywords[regex]:
                        # Conditional on the *next* word: <word>cat/alt
                        res = self.cond_exp_regex[0].match(i)
                        if res:
                            if self._next_word == res.group(1):
                                cat.append(res.group(2))
                            elif res.group(4):
                                cat.append(res.group(4))
                            continue
                        # Conditional on the *previous* word's categories:
                        # (cats)cat/alt
                        res = self.cond_exp_regex[1].match(i)
                        if res:
                            if True in [c in self._prev_cat
                                        for c in res.group(1).split(" ")]:
                                cat.append(res.group(2))
                            elif res.group(4):
                                cat.append(res.group(4))
                            continue
                        # If dictionary contains trailing tabs,
                        # '' keys are saved. It skips them.
                        if i:
                            cat.append(i)
            for c in cat:
                try:
                    self._results[c] += 1
                except KeyError:
                    logging.warning("Invalid category id %s", c)
            if len(cat) > 0:
                # Increment word in dictionary counter
                self._dic += 1
            self._prev_cat = cat
        if len(word) > 6:
            # Increment word > 6 letters counter
            self._sixltr += 1
        self._total += 1
        self._unique.add(word)

    def parse_col(self, col):
        """
        Reads a single cell of the csv file. It splits it into words and
        gives them to self.parse_word
        """
        # FIX: attribute names were "_prev_word" (nonexistent) and
        # "_prev cat" (typo) — masked by delattrs' except AttributeError.
        self.delattrs(("_results", "_qmarks", "_unique", "_dic", "_sixltr",
                       "_total", "_text", "_next_word", "_prev_cat",
                       "_detailed_data"))
        self._text = col
        if self.clean_wiki or self.clean_html:
            self._text = self.textcleaner.clean_text(self._text)
        if self.clean_wiki:
            self._text = self.textcleaner.clean_wiki_syntax(self._text)
        if self.clean_html:
            self._text = self.textcleaner.clean_html_syntax(self._text)
        self._results = Counter()
        if self.detailed:
            self._detailed_data = Counter()
        self._qmarks = len(self.rqmarks.findall(self._text))
        self._unique = set()
        self._dic = 0
        self._sixltr = 0
        self._total = 0
        # create a list of words (_no_ numbers)
        words = [word for word in self.rwords.findall(self._text)
                 if not word.isdigit()]
        for i, word in enumerate(words):
            try:
                self._next_word = words[i + 1]
            except IndexError:
                self._next_word = ""
            self.parse_word(word)
        if self.tuning:
            # Tuning mode: match each regex against the whole text at once
            # (no conditional expressions, much faster)
            for regex in self.keywords:
                occ = len(regex.findall(self._text))
                if occ:
                    for cat in self.keywords[regex]:
                        if cat:
                            try:
                                self._results[cat] += occ
                            except KeyError:
                                logging.warning("Invalid category id %s",
                                                cat)
                    if self.detailed:
                        self._detailed_data[regex.pattern] += occ
                    self._dic += occ

    def parse_line(self, line):
        """
        Reads a single line of the csv file. Sets self._id and gives the
        cells that are not in the ignore list to self.parse_col
        """
        # FIX: ("_id") is just the string "_id"; delattrs would iterate
        # its characters. A one-element tuple is intended.
        self.delattrs(("_id",))
        self._id = line[self.id_col]
        for i, col in enumerate(line):
            if len(col) <= self.max_char_limit:
                if i != self.id_col and not i in self.ignorecols:
                    self.parse_col(col)
                    self.save()
            else:
                logging.warning(" Line %d:%d skipped "
                                "because longer than %d chars",
                                self._counter, i, self.max_char_limit)

    def start(self, src):
        """
        It starts the file processing. To obtain a sensible output is
        recommended to run self.set_dic() before.

        It writes the output csv header and reads every line, passing it
        to self.parse_line
        """
        # Creates a list of category names sorted by their ID.
        # Useful because Python dictionaries are not sorted objects!
        # Sorting like TAWC
        try:
            cat_names = [x[1] for x in sorted([(int(a), b) for a, b in
                                               self.categories.items()])]
        except ValueError:
            cat_names = [x[1] for x in sorted(self.categories.items())]

        self._keys = ["id"] + cat_names + [
            "qmarks", "unique", "dic", "sixltr", "total", "text"
        ]
        self.csv_writer = csv.DictWriter(self.csv_out,
                                         delimiter=self.delimiter,
                                         fieldnames=self._keys,
                                         quotechar=self.quotechar)
        self.csv_writer.writeheader()

        csv_reader = csv.reader(src, delimiter=self.delimiter)
        for line in csv_reader:
            self.parse_line(line)
def main():
    """Command-line entry point: PyWC analysis of the n-th revision.

    Usage: prog [options] dic input_file output_file

    For every page title listed in input_file, fetches the revision at
    the configured edit count, cleans it, runs the PyWC word count and
    writes one result row (raw + arcsin-transformed percentages) to
    output_file.
    """
    import optparse
    p = optparse.OptionParser(
        usage="usage: %prog [options] dic input_file output_file")
    p.add_option('-l', '--lang', action="store", dest="lang", default="en",
                 help="Wikipedia language")
    p.add_option('-n', '--edits', action="store", dest="edits", type=int,
                 default=500, help="Edit number to consider")
    opts, files = p.parse_args()
    if len(files) != 3:
        p.error("Wrong parameters")

    logging.basicConfig(stream=sys.stderr, level=logging.DEBUG,
                        format='%(asctime)s %(levelname)s %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S')

    textcleaner = TextCleaner()
    pywc = PyWC()
    pywc.set_dic(files[0])

    # Category names sorted by numeric id when possible (TAWC ordering),
    # falling back to plain lexicographic order.
    try:
        cat_names = [str(x[1]) for x in
                     sorted([(int(a), b)
                             for a, b in pywc.categories.items()])]
    except ValueError:
        cat_names = [str(x[1]) for x in sorted(pywc.categories.items())]

    # name -> id map, used to read per-category counters from pywc._results
    reverse_categories = {}
    for key, value in pywc.categories.iteritems():
        reverse_categories[value] = key

    arcsin_fields = ["%s_arcsin" % key for key in cat_names]
    fields = ["title", "total_edits", "unique_editors", "traumatic",
              "non_traumatic", "natural", "human", "len", "len_cleaned"] + \
             cat_names + arcsin_fields + \
             ["qmarks", "unique", "dic", "sixltr", "total"]

    # FIX: keep handles on both files and close them explicitly
    # (the originals were leaked open() calls).
    src = open(files[1], "r")
    dst = open(files[2], "w")
    try:
        csv_reader = csv.reader(src)
        csv_writer = csv.DictWriter(dst, fields)
        csv_writer.writeheader()
        for line in csv_reader:
            # FIX: don't reuse one name for the edit index and the
            # revision text (the original rebound `rev`).
            title, edit_n = line[0], opts.edits - 1
            revision_id = find_revision_id(title, edit_n, opts.lang,
                                           startid=None)
            if revision_id is None:
                continue
            rev = get_revision(revision_id, opts.lang)
            cleaned_rev = textcleaner.clean_all(rev)
            cleaned_rev = textcleaner.clean_wiki_syntax(cleaned_rev, True)
            pywc.parse_col(cleaned_rev)
            result = {
                "title": title,
                "total_edits": line[1],
                "unique_editors": line[2],
                "traumatic": line[3],
                "non_traumatic": line[4],
                "natural": line[5],
                "human": line[6],
                "len": len(rev.split()),
                "len_cleaned": len(cleaned_rev.split()),
                "qmarks": pywc._qmarks,
                "unique": len(pywc._unique),
                "dic": pywc._dic,
                "sixltr": pywc._sixltr,
                "total": pywc._total,
            }
            for key, val in reverse_categories.iteritems():
                score = perc(pywc._results[val], pywc._total) * 100
                arcsin = calc_arcsin(score)
                result[key] = score                  # percentage results
                result["%s_arcsin" % key] = arcsin   # arcsin results
            csv_writer.writerow(result)
    finally:
        src.close()
        dst.close()
class HistoryRevisionsPageProcessor(HistoryPageProcessor):
    """Dumps per-revision text diffs of desired pages to a TSV file."""
    output = None          # output CSV file name
    queue = None           # pages waiting to be flushed to disk
    _skip = None           # True while the current page must be ignored
    _prev_text = ""        # previous revision's text, diff baseline
    _text = None           # current revision's text
    get_talks = True       # process talk pages
    get_articles = True    # process article (main ns) pages
    diff_timeout = 0.5     # timeout passed to _diff_text
    clean = None           # run TextCleaner.clean_all on revision text
    textcleaner = None

    def __init__(self, **kwargs):
        super(HistoryRevisionsPageProcessor, self).__init__(**kwargs)
        # FIX: `userns` was an undefined bare name here (NameError at
        # runtime); sibling processors read it from kwargs.
        self.textcleaner = TextCleaner(kwargs["userns"])
        self.queue = []
        f = open(self.output, 'w')
        self._keys = ["timestamp", "lang", "title", "type", "text"]
        self.csv_writer = csv.DictWriter(f, fieldnames=self._keys,
                                         delimiter='\t', quotechar='"',
                                         quoting=csv.QUOTE_ALL)

    def flush(self):
        """
        Flushes queue in the CSV output
        """
        pages = [{'title': page['title'],
                  'lang': self.lang,
                  'timestamp': page['timestamp'],
                  'text': page['text'],
                  'type': page['type']} for page in self.queue]
        self.csv_writer.writerows(pages)
        self.queue = []

    def save(self):
        """
        Saves data to the queue. The queue is stored using self.flush()
        """
        if self._text is None:  # difflib doesn't like NoneType
            self._text = ""
        if self.clean:
            self._text = self.textcleaner.clean_all(self._text)
        page = {'title': smart_str(self._title),
                'lang': self.lang,
                'timestamp': self._date,
                'text': smart_str(_diff_text(self._prev_text, self._text,
                                             timeout=self.diff_timeout)[0]),
                'type': self._type}
        self.queue.append(page)
        self._prev_text = self._text

    def process_title(self, elem):
        # FIX: the attribute is "_text", not "text" (the old name made
        # the delete a silent no-op)
        self.delattr(("_counter", "_type", "_title", "_skip", "_date",
                      "_text"))
        self._skip = False
        a_title = elem.text.split(':')
        if len(a_title) == 1 and self.get_articles:
            self._type = 'normal'
            self._title = a_title[0]
        elif len(a_title) == 2 and a_title[0] == self.talkns and \
                self.get_talks:
            self._type = 'talk'
            self._title = a_title[1]
        else:
            self._skip = True
        if not self._skip:
            self._desired = self.is_desired(self._title)
            if not self._desired:
                self._skip = True
            else:
                logging.info('Start processing desired page %s (%s)',
                             self._title, self._type)

    def process_timestamp(self, elem):
        if self._skip:
            return
        self._date = elem.text

    def process_text(self, elem):
        if self._skip:
            return
        self._text = elem.text
        self.save()

    def process_page(self, _):
        self.count += 1
        if not self.count % 1000:
            logging.info(' ### Processed %d pages', self.count)
        # FIX: ("text") is the string "text", iterated char by char by
        # delattr; a one-element tuple with the real attribute is intended
        self.delattr(("_text",))
        if not self._skip:
            with Timr('Flushing %s' % self._title):
                self.flush()
        self._skip = False

    def process_redirect(self, _):
        # This class only considers pages that are in the desired file,
        # these pages must not be redirects
        self._skip = True
        raise ValueError("The page %s is a redirect. " % self._title +
                         "Pages in the desired list must not be redirects.")
class HistoryRevisionsPageProcessor(HistoryPageProcessor):
    """Dumps per-revision diffs of desired pages to TSV, skipping reverts."""
    output = None          # output file object for the CSV writer
    queue = None           # pages waiting to be flushed to disk
    _skip = None           # True while the current page must be ignored
    _prev_text = ""        # previous revision's text, diff baseline
    _text = None           # current revision's text
    get_talks = True       # process talk pages
    get_articles = True    # process article (main ns) pages
    diff_timeout = 0.5     # timeout passed to _diff_text
    clean = None           # run TextCleaner.clean_all on revision text
    textcleaner = None
    rwords = re.compile(r"[\w']+")

    def __init__(self, **kwargs):
        super(HistoryRevisionsPageProcessor, self).__init__(**kwargs)
        self.textcleaner = TextCleaner(kwargs["userns"])
        self.queue = []
        fields = ["timestamp", "lang", "title", "type", "text"]
        self.csv_writer = csv.DictWriter(self.output, fieldnames=fields,
                                         delimiter='\t', quotechar='"',
                                         quoting=csv.QUOTE_ALL)

    def flush(self):
        """
        Flushes queue in the CSV output
        """
        pages = [{'title': page['title'],
                  'lang': self.lang,
                  'timestamp': page['timestamp'],
                  'text': page['text'],
                  'type': page['type']} for page in self.queue]
        self.csv_writer.writerows(pages)
        self.queue = []

    def save(self):
        """
        Saves data to the queue. The queue is stored using self.flush()
        """
        if self._text is None:  # difflib doesn't like NoneType
            self._text = ""
        if self.clean:
            self._text = self.textcleaner.clean_all(self._text)
        # Heuristic revert detection: a long revision that more than
        # doubles the previous word count is treated as a revert/paste
        text_words = len(self.rwords.findall(self._text))
        prev_words = len(self.rwords.findall(self._prev_text))
        if text_words < 1000 or text_words <= 2 * prev_words:
            diff = _diff_text(self._prev_text, self._text,
                              timeout=self.diff_timeout)[0]
            page = {'title': smart_str(self._title),
                    'lang': self.lang,
                    'timestamp': self._date,
                    'text': smart_str(diff),
                    'type': self._type}
            self.queue.append(page)
        else:
            logging.warning("Revert detected: skipping... (%s)", self._date)
        self._prev_text = self._text

    def process_title(self, elem):
        # FIX: the attribute is "_text", not "text" (the old name made
        # the delete a silent no-op)
        self.delattr(("_counter", "_type", "_title", "_skip", "_date",
                      "_text"))
        self._skip = False
        a_title = elem.text.split(':')
        if len(a_title) == 1 and self.get_articles:
            self._type = 'normal'
            self._title = a_title[0]
        elif len(a_title) == 2 and a_title[0] == self.talkns and \
                self.get_talks:
            self._type = 'talk'
            self._title = a_title[1]
        else:
            self._skip = True
        if not self._skip:
            self._desired = self.is_desired(self._title)
            if not self._desired:
                self._skip = True
            else:
                logging.info('Start processing desired page %s (%s)',
                             self._title, self._type)

    def process_timestamp(self, elem):
        if self._skip:
            return
        self._date = elem.text

    def process_text(self, elem):
        if self._skip:
            return
        self._text = elem.text
        self.save()

    def process_page(self, _):
        self.count += 1
        if not self.count % 1000:
            logging.info(' ### Processed %d pages', self.count)
        # FIX: ("text") is the string "text", iterated char by char by
        # delattr; a one-element tuple with the real attribute is intended
        self.delattr(("_text",))
        if not self._skip:
            with Timr('Flushing %s' % self._title):
                self.flush()
        self._skip = False

    def process_redirect(self, _):
        # This class only considers pages that are in the desired file,
        # these pages must not be redirects
        self._skip = True
        raise ValueError("The page %s is a redirect. " % self._title +
                         "Pages in the desired list must not be redirects.")
class HistoryRevisionsPageProcessor(HistoryPageProcessor):
    """Dumps talk-page revision diffs (with author) to TSV, skipping reverts."""
    output = None            # output file object for the CSV writer
    queue = None             # pages waiting to be flushed to disk
    _skip = None             # True while the current page must be ignored
    _prev_text = ""          # previous revision's text, diff baseline
    _text = None             # current revision's text
    _username = None         # registered author of the current revision
    _ip = None               # IP of the current (anonymous) author
    _sender = None           # resolved author: username or IP
    _skip_revision = False   # True while the current revision is ignored
    diff_timeout = 0.5       # timeout passed to _diff_text
    clean = None             # run TextCleaner.clean_all on revision text
    textcleaner = None
    rwords = re.compile(r"[\w']+")

    def __init__(self, **kwargs):
        super(HistoryRevisionsPageProcessor, self).__init__(**kwargs)
        self.textcleaner = TextCleaner(kwargs["userns"])
        self.queue = []
        fields = ["timestamp", "lang", "title", "type", "user", "text"]
        self.csv_writer = csv.DictWriter(self.output, fieldnames=fields,
                                         delimiter='\t', quotechar='"',
                                         quoting=csv.QUOTE_ALL)

    def flush(self):
        """
        Flushes queue in the CSV output
        """
        pages = [{'title': page['title'],
                  'lang': self.lang,
                  'timestamp': page['timestamp'],
                  'text': page['text'],
                  'user': page['user'],
                  'type': page['type']} for page in self.queue]
        self.csv_writer.writerows(pages)
        self.queue = []

    def save(self):
        """
        Saves data to the queue. The queue is stored using self.flush()
        """
        if self._text is None:  # difflib doesn't like NoneType
            self._text = ""
        if self.clean:
            self._text = self.textcleaner.clean_all(self._text)
        # Heuristic revert detection: a long revision that more than
        # doubles the previous word count is treated as a revert/paste
        text_words = len(self.rwords.findall(self._text))
        prev_words = len(self.rwords.findall(self._prev_text))
        if text_words < 1000 or text_words <= 2 * prev_words:
            diff = _diff_text(self._prev_text, self._text,
                              timeout=self.diff_timeout)[0]
            page = {'title': smart_str(self._title),
                    'lang': self.lang,
                    'timestamp': self._date,
                    'text': smart_str(diff),
                    'user': smart_str(self._sender),
                    'type': self._type}
            self.queue.append(page)
        else:
            logging.warning("Revert detected: skipping... (%s)", self._date)
        self._prev_text = self._text

    def process_username(self, elem):
        if self._skip_revision:
            return
        self._username = elem.text

    def process_ip(self, elem):
        if self._skip_revision:
            return
        self._ip = elem.text

    def process_contributor(self, contributor):
        if self._skip_revision:
            return
        if contributor is None:
            self._skip_revision = True
        self._sender = self._username or self._ip
        self.delattr(("_username", "_ip"))
        if not self._sender:
            # deleted/suppressed contributor: count it and skip
            self.counter_deleted += 1
            self._skip_revision = True

    def process_title(self, elem):
        # FIX: the attribute is "_text", not "text" (the old name made
        # the delete a silent no-op)
        self.delattr(("_counter", "_type", "_title", "_skip", "_date",
                      "_text", "_username", "_ip", "_sender"))
        self._skip = False
        a_title = elem.text.split(':')
        if len(a_title) == 2 and a_title[0] == self.talkns:
            self._type = 'talk'
            self._title = a_title[1]
        elif len(a_title) == 2 and a_title[0] == self.usertalkns:
            self._type = 'user talk'
            self._title = a_title[1]
        elif len(a_title) == 1:
            self._type = 'normal'
            self._title = a_title[0]
            if not self._title:
                self._skip = True
        if not self._skip:
            # Only talk and user-talk pages are of interest here
            if not ("talk" in self._type):
                self._skip = True
            else:
                logging.info('Start processing page %s (%s)',
                             self._title, self._type)

    def process_timestamp(self, elem):
        if self._skip:
            return
        self._date = elem.text

    def process_text(self, elem):
        if self._skip:
            return
        self._text = elem.text
        self.save()

    def process_page(self, _):
        self.count += 1
        if not self.count % 1000:
            logging.info(' ### Processed %d pages', self.count)
        if not self._skip:
            with Timr('Flushing %s' % self._title):
                self.flush()
        # FIX: ("text") is the string "text", iterated char by char by
        # delattr; a one-element tuple with the real attribute is intended
        self.delattr(("_text",))
        self._skip = False

    def process_redirect(self, _):
        self._skip = True
def main():
    """Command-line entry point: PyWC analysis of the n-th revision.

    Usage: prog [options] dic input_file output_file

    For every page title listed in input_file, fetches the revision at
    the configured edit count, cleans it, runs the PyWC word count and
    writes one result row (raw + arcsin-transformed percentages) to
    output_file.
    """
    import optparse
    p = optparse.OptionParser(
        usage="usage: %prog [options] dic input_file output_file")
    p.add_option('-l', '--lang', action="store", dest="lang", default="en",
                 help="Wikipedia language")
    p.add_option('-n', '--edits', action="store", dest="edits", type=int,
                 default=500, help="Edit number to consider")
    opts, files = p.parse_args()
    if len(files) != 3:
        p.error("Wrong parameters")

    logging.basicConfig(stream=sys.stderr, level=logging.DEBUG,
                        format='%(asctime)s %(levelname)s %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S')

    textcleaner = TextCleaner()
    pywc = PyWC()
    pywc.set_dic(files[0])

    # Category names sorted by numeric id when possible (TAWC ordering),
    # falling back to plain lexicographic order.
    try:
        cat_names = [str(x[1]) for x in
                     sorted([(int(a), b)
                             for a, b in pywc.categories.items()])]
    except ValueError:
        cat_names = [str(x[1]) for x in sorted(pywc.categories.items())]

    # name -> id map, used to read per-category counters from pywc._results
    reverse_categories = {}
    for key, value in pywc.categories.iteritems():
        reverse_categories[value] = key

    arcsin_fields = ["%s_arcsin" % key for key in cat_names]
    fields = ["title", "total_edits", "unique_editors", "traumatic",
              "non_traumatic", "natural", "human", "len", "len_cleaned"] + \
             cat_names + arcsin_fields + \
             ["qmarks", "unique", "dic", "sixltr", "total"]

    # FIX: keep handles on both files and close them explicitly
    # (the originals were leaked open() calls).
    src = open(files[1], "r")
    dst = open(files[2], "w")
    try:
        csv_reader = csv.reader(src)
        csv_writer = csv.DictWriter(dst, fields)
        csv_writer.writeheader()
        for line in csv_reader:
            # FIX: don't reuse one name for the edit index and the
            # revision text (the original rebound `rev`).
            title, edit_n = line[0], opts.edits - 1
            revision_id = find_revision_id(title, edit_n, opts.lang,
                                           startid=None)
            if revision_id is None:
                continue
            rev = get_revision(revision_id, opts.lang)
            cleaned_rev = textcleaner.clean_all(rev)
            cleaned_rev = textcleaner.clean_wiki_syntax(cleaned_rev, True)
            pywc.parse_col(cleaned_rev)
            result = {
                "title": title,
                "total_edits": line[1],
                "unique_editors": line[2],
                "traumatic": line[3],
                "non_traumatic": line[4],
                "natural": line[5],
                "human": line[6],
                "len": len(rev.split()),
                "len_cleaned": len(cleaned_rev.split()),
                "qmarks": pywc._qmarks,
                "unique": len(pywc._unique),
                "dic": pywc._dic,
                "sixltr": pywc._sixltr,
                "total": pywc._total,
            }
            for key, val in reverse_categories.iteritems():
                score = perc(pywc._results[val], pywc._total) * 100
                arcsin = calc_arcsin(score)
                result[key] = score                  # percentage results
                result["%s_arcsin" % key] = arcsin   # arcsin results
            csv_writer.writerow(result)
    finally:
        src.close()
        dst.close()
def setUp(self):
    """Create a fresh TextCleaner instance before each test."""
    self.textcleaner = TextCleaner()
class HistoryRevisionsPageProcessor(HistoryPageProcessor):
    """Dumps talk-page revision diffs (with author) to TSV, skipping reverts."""
    output = None            # output file object for the CSV writer
    queue = None             # pages waiting to be flushed to disk
    _skip = None             # True while the current page must be ignored
    _prev_text = ""          # previous revision's text, diff baseline
    _text = None             # current revision's text
    _username = None         # registered author of the current revision
    _ip = None               # IP of the current (anonymous) author
    _sender = None           # resolved author: username or IP
    _skip_revision = False   # True while the current revision is ignored
    diff_timeout = 0.5       # timeout passed to _diff_text
    clean = None             # run TextCleaner.clean_all on revision text
    textcleaner = None
    rwords = re.compile(r"[\w']+")

    def __init__(self, **kwargs):
        super(HistoryRevisionsPageProcessor, self).__init__(**kwargs)
        self.textcleaner = TextCleaner(kwargs["userns"])
        self.queue = []
        fields = ["timestamp", "lang", "title", "type", "user", "text"]
        self.csv_writer = csv.DictWriter(self.output, fieldnames=fields,
                                         delimiter='\t', quotechar='"',
                                         quoting=csv.QUOTE_ALL)

    def flush(self):
        """
        Flushes queue in the CSV output
        """
        pages = [{'title': page['title'],
                  'lang': self.lang,
                  'timestamp': page['timestamp'],
                  'text': page['text'],
                  'user': page['user'],
                  'type': page['type']} for page in self.queue]
        self.csv_writer.writerows(pages)
        self.queue = []

    def save(self):
        """
        Saves data to the queue. The queue is stored using self.flush()
        """
        if self._text is None:  # difflib doesn't like NoneType
            self._text = ""
        if self.clean:
            self._text = self.textcleaner.clean_all(self._text)
        # Heuristic revert detection: a long revision that more than
        # doubles the previous word count is treated as a revert/paste
        text_words = len(self.rwords.findall(self._text))
        prev_words = len(self.rwords.findall(self._prev_text))
        if text_words < 1000 or text_words <= 2 * prev_words:
            diff = _diff_text(self._prev_text, self._text,
                              timeout=self.diff_timeout)[0]
            page = {'title': smart_str(self._title),
                    'lang': self.lang,
                    'timestamp': self._date,
                    'text': smart_str(diff),
                    'user': smart_str(self._sender),
                    'type': self._type}
            self.queue.append(page)
        else:
            logging.warning("Revert detected: skipping... (%s)", self._date)
        self._prev_text = self._text

    def process_username(self, elem):
        if self._skip_revision:
            return
        self._username = elem.text

    def process_ip(self, elem):
        if self._skip_revision:
            return
        self._ip = elem.text

    def process_contributor(self, contributor):
        if self._skip_revision:
            return
        if contributor is None:
            self._skip_revision = True
        self._sender = self._username or self._ip
        self.delattr(("_username", "_ip"))
        if not self._sender:
            # deleted/suppressed contributor: count it and skip
            self.counter_deleted += 1
            self._skip_revision = True

    def process_title(self, elem):
        # FIX: the attribute is "_text", not "text" (the old name made
        # the delete a silent no-op)
        self.delattr(("_counter", "_type", "_title", "_skip", "_date",
                      "_text", "_username", "_ip", "_sender"))
        self._skip = False
        a_title = elem.text.split(':')
        if len(a_title) == 2 and a_title[0] == self.talkns:
            self._type = 'talk'
            self._title = a_title[1]
        elif len(a_title) == 2 and a_title[0] == self.usertalkns:
            self._type = 'user talk'
            self._title = a_title[1]
        elif len(a_title) == 1:
            self._type = 'normal'
            self._title = a_title[0]
            if not self._title:
                self._skip = True
        if not self._skip:
            # Only talk and user-talk pages are of interest here
            if not ("talk" in self._type):
                self._skip = True
            else:
                logging.info('Start processing page %s (%s)',
                             self._title, self._type)

    def process_timestamp(self, elem):
        if self._skip:
            return
        self._date = elem.text

    def process_text(self, elem):
        if self._skip:
            return
        self._text = elem.text
        self.save()

    def process_page(self, _):
        self.count += 1
        if not self.count % 1000:
            logging.info(' ### Processed %d pages', self.count)
        if not self._skip:
            with Timr('Flushing %s' % self._title):
                self.flush()
        # FIX: ("text") is the string "text", iterated char by char by
        # delattr; a one-element tuple with the real attribute is intended
        self.delattr(("_text",))
        self._skip = False

    def process_redirect(self, _):
        self._skip = True