def main():
    """Command-line entry point.

    usage: %prog [options] dic input_file output_file

    Reads per-page statistics from ``input_file`` (CSV), fetches the text
    of each page's n-th revision, cleans it, runs the PyWC dictionary
    analysis on it and writes one row of category scores per page to
    ``output_file``.
    """
    import optparse
    p = optparse.OptionParser(
        usage="usage: %prog [options] dic input_file output_file")
    p.add_option('-l', '--lang', action="store", dest="lang", default="en",
                 help="Wikipedia language")
    p.add_option('-n', '--edits', action="store", dest="edits", type=int,
                 default=500, help="Edit number to consider")
    opts, files = p.parse_args()
    if len(files) != 3:
        p.error("Wrong parameters")

    logging.basicConfig(stream=sys.stderr,
                        level=logging.DEBUG,
                        format='%(asctime)s %(levelname)s %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S')

    textcleaner = TextCleaner()
    pywc = PyWC()
    pywc.set_dic(files[0])

    # Category ids are usually numeric: sort them numerically when
    # possible, otherwise fall back to plain lexicographic sorting.
    try:
        cat_names = [str(pair[1]) for pair in
                     sorted((int(a), b) for a, b in pywc.categories.items())]
    except ValueError:
        cat_names = [str(pair[1]) for pair in sorted(pywc.categories.items())]

    # Map category name -> category id for direct score lookup below.
    reverse_categories = dict((value, key)
                              for key, value in pywc.categories.items())

    arcsin_fields = ["%s_arcsin" % key for key in cat_names]
    fields = ["title", "total_edits", "unique_editors", "traumatic",
              "non_traumatic", "natural", "human", "len", "len_cleaned"] + \
        cat_names + arcsin_fields + \
        ["qmarks", "unique", "dic", "sixltr", "total"]

    # Context managers guarantee both files are closed on every exit path
    # (the original leaked the two file objects opened inline).
    with open(files[1], "r") as input_file:
        with open(files[2], "w") as output_file:
            csv_reader = csv.reader(input_file)
            csv_writer = csv.DictWriter(output_file, fields)
            csv_writer.writeheader()

            for line in csv_reader:
                title = line[0]
                # The revision index is 0-based: the n-th edit is n - 1.
                revision_id = find_revision_id(title, opts.edits - 1,
                                               opts.lang, startid=None)
                if revision_id is None:
                    continue
                rev = get_revision(revision_id, opts.lang)
                cleaned_rev = textcleaner.clean_all(rev)
                cleaned_rev = textcleaner.clean_wiki_syntax(cleaned_rev, True)
                pywc.parse_col(cleaned_rev)
                result = {
                    "title": title,
                    "total_edits": line[1],
                    "unique_editors": line[2],
                    "traumatic": line[3],
                    "non_traumatic": line[4],
                    "natural": line[5],
                    "human": line[6],
                    "len": len(rev.split()),
                    "len_cleaned": len(cleaned_rev.split()),
                    "qmarks": pywc._qmarks,
                    "unique": len(pywc._unique),
                    "dic": pywc._dic,
                    "sixltr": pywc._sixltr,
                    "total": pywc._total,
                }
                for key, val in reverse_categories.items():
                    score = perc(pywc._results[val], pywc._total) * 100
                    result[key] = score  # percentage results
                    result["%s_arcsin" % key] = calc_arcsin(score)  # arcsin
                csv_writer.writerow(result)
class HistoryRevisionsPageProcessor(HistoryPageProcessor):
    """Streams page-history revisions into a tab-separated CSV file.

    Only pages listed in the "desired" file are processed (articles and/or
    talk pages); for each revision the diff against the previous revision
    text is written out. A redirect among the desired pages is a hard
    error (see process_redirect).
    """
    # Path of the output CSV file (expected to be set through kwargs
    # handled by the base class).
    output = None
    # Buffer of per-revision dicts waiting to be written by flush().
    queue = None
    # True while the current page must be ignored.
    _skip = None
    # Text of the previously seen revision, diff baseline.
    _prev_text = ""
    _text = None
    get_talks = True
    get_articles = True
    # Seconds granted to _diff_text() before it gives up.
    diff_timeout = 0.5
    # When truthy, revision text is run through TextCleaner first.
    clean = None
    textcleaner = None

    def __init__(self, **kwargs):
        super(HistoryRevisionsPageProcessor, self).__init__(**kwargs)
        # Fixed: the original passed the undefined name `userns`; the
        # sibling processors in this file pass kwargs["userns"].
        self.textcleaner = TextCleaner(kwargs["userns"])
        self.queue = []
        f = open(self.output, 'w')
        self._keys = ["timestamp", "lang", "title", "type", "text"]
        self.csv_writer = csv.DictWriter(f, fieldnames=self._keys,
                                         delimiter='\t', quotechar='"',
                                         quoting=csv.QUOTE_ALL)

    def flush(self):
        """
        Flushes queue in the CSV output
        """
        pages = [{'title': page['title'],
                  'lang': self.lang,
                  'timestamp': page['timestamp'],
                  'text': page['text'],
                  'type': page['type']} for page in self.queue]
        self.csv_writer.writerows(pages)
        self.queue = []

    def save(self):
        """
        Saves data to the queue. The queue is stored using self.flush()
        """
        if self._text is None:  # difflib doesn't like NoneType
            self._text = ""
        if self.clean:
            self._text = self.textcleaner.clean_all(self._text)
        page = {'title': smart_str(self._title),
                'lang': self.lang,
                'timestamp': self._date,
                'text': smart_str(_diff_text(self._prev_text,
                                             self._text,
                                             timeout=self.diff_timeout)[0]),
                'type': self._type}
        self.queue.append(page)
        self._prev_text = self._text

    def process_title(self, elem):
        """Classifies the page (article/talk), skipping undesired ones."""
        self.delattr(("_counter", "_type", "_title", "_skip", "_date",
                      "text"))
        self._skip = False
        a_title = elem.text.split(':')
        if len(a_title) == 1 and self.get_articles:
            self._type = 'normal'
            self._title = a_title[0]
        elif len(a_title) == 2 and a_title[0] == self.talkns and \
                self.get_talks:
            self._type = 'talk'
            self._title = a_title[1]
        else:
            self._skip = True
        if not self._skip:
            self._desired = self.is_desired(self._title)
            if not self._desired:
                self._skip = True
            else:
                logging.info('Start processing desired page %s (%s)',
                             self._title, self._type)

    def process_timestamp(self, elem):
        if self._skip:
            return
        self._date = elem.text

    def process_text(self, elem):
        if self._skip:
            return
        self._text = elem.text
        self.save()

    def process_page(self, _):
        self.count += 1
        if not self.count % 1000:
            logging.info(' ### Processed %d pages', self.count)
        # NOTE(review): ("text") is a plain string, not a 1-tuple — if
        # delattr() iterates its argument this passes single characters;
        # confirm against HistoryPageProcessor.delattr.
        self.delattr(("text"))
        if not self._skip:
            with Timr('Flushing %s' % self._title):
                self.flush()
        self._skip = False

    def process_redirect(self, _):
        # This class only considers pages that are in the desired file,
        # these pages must not be redirects
        self._skip = True
        raise ValueError("The page %s is a redirect. " % self._title +
                         "Pages in the desired list must not be redirects.")
class HistoryRevisionsPageProcessor(HistoryPageProcessor):
    """Streams desired pages' revision diffs into a tab-separated CSV.

    Like the plain history processor, but with a simple revert filter:
    a revision that suddenly more than doubles a large page is assumed
    to be a revert and is not diffed.
    """
    # Output CSV destination; passed straight to csv.DictWriter, so it
    # is expected to be a writable file-like object here.
    output = None
    # Buffer of per-revision dicts waiting to be written by flush().
    queue = None
    # True while the current page must be ignored.
    _skip = None
    # Text of the previously seen revision, diff baseline.
    _prev_text = ""
    _text = None
    get_talks = True
    get_articles = True
    # Seconds granted to _diff_text() before it gives up.
    diff_timeout = 0.5
    # When truthy, revision text is run through TextCleaner first.
    clean = None
    textcleaner = None
    # Word tokenizer used by the revert heuristic in save().
    rwords = re.compile(r"[\w']+")

    def __init__(self, **kwargs):
        super(HistoryRevisionsPageProcessor, self).__init__(**kwargs)
        self.textcleaner = TextCleaner(kwargs["userns"])
        self.queue = []
        fields = ["timestamp", "lang", "title", "type", "text"]
        self.csv_writer = csv.DictWriter(self.output,
                                         fieldnames=fields,
                                         delimiter='\t',
                                         quotechar='"',
                                         quoting=csv.QUOTE_ALL)

    def flush(self):
        """
        Flushes queue in the CSV output
        """
        pages = [{'title': page['title'],
                  'lang': self.lang,
                  'timestamp': page['timestamp'],
                  'text': page['text'],
                  'type': page['type']} for page in self.queue]
        self.csv_writer.writerows(pages)
        self.queue = []

    def save(self):
        """
        Saves data to the queue. The queue is stored using self.flush()
        """
        if self._text is None:  # difflib doesn't like NoneType
            self._text = ""
        if self.clean:
            self._text = self.textcleaner.clean_all(self._text)
        text_words = len(self.rwords.findall(self._text))
        prev_words = len(self.rwords.findall(self._prev_text))
        # Revert heuristic: enqueue the diff unless a large revision
        # (>= 1000 words) more than doubled the previous word count.
        if text_words < 1000 or text_words <= 2 * prev_words:
            diff = _diff_text(self._prev_text, self._text,
                              timeout=self.diff_timeout)[0]
            page = {'title': smart_str(self._title),
                    'lang': self.lang,
                    'timestamp': self._date,
                    'text': smart_str(diff),
                    'type': self._type}
            self.queue.append(page)
        else:
            # logging.warning: `warn` is a deprecated alias.
            logging.warning("Revert detected: skipping... (%s)", self._date)
        self._prev_text = self._text

    def process_title(self, elem):
        """Classifies the page (article/talk), skipping undesired ones."""
        self.delattr(("_counter", "_type", "_title", "_skip", "_date",
                      "text"))
        self._skip = False
        a_title = elem.text.split(':')
        if len(a_title) == 1 and self.get_articles:
            self._type = 'normal'
            self._title = a_title[0]
        elif len(a_title) == 2 and a_title[0] == self.talkns and \
                self.get_talks:
            self._type = 'talk'
            self._title = a_title[1]
        else:
            self._skip = True
        if not self._skip:
            self._desired = self.is_desired(self._title)
            if not self._desired:
                self._skip = True
            else:
                logging.info('Start processing desired page %s (%s)',
                             self._title, self._type)

    def process_timestamp(self, elem):
        if self._skip:
            return
        self._date = elem.text

    def process_text(self, elem):
        if self._skip:
            return
        self._text = elem.text
        self.save()

    def process_page(self, _):
        self.count += 1
        if not self.count % 1000:
            logging.info(' ### Processed %d pages', self.count)
        # NOTE(review): ("text") is a plain string, not a 1-tuple — if
        # delattr() iterates its argument this passes single characters;
        # confirm against HistoryPageProcessor.delattr.
        self.delattr(("text"))
        if not self._skip:
            with Timr('Flushing %s' % self._title):
                self.flush()
        self._skip = False

    def process_redirect(self, _):
        # This class only considers pages that are in the desired file,
        # these pages must not be redirects
        self._skip = True
        raise ValueError("The page %s is a redirect. " % self._title +
                         "Pages in the desired list must not be redirects.")
class HistoryRevisionsPageProcessor(HistoryPageProcessor):
    """Streams talk/user-talk revision diffs, with contributor, to CSV.

    Extends the history processing with per-revision contributor
    resolution (username or IP) and the revert-skipping heuristic.
    Redirects are silently skipped rather than raising.
    """
    # Output CSV destination; passed straight to csv.DictWriter, so it
    # is expected to be a writable file-like object here.
    output = None
    # Buffer of per-revision dicts waiting to be written by flush().
    queue = None
    # True while the current page must be ignored.
    _skip = None
    # Text of the previously seen revision, diff baseline.
    _prev_text = ""
    _text = None
    # Contributor fields of the revision currently being parsed.
    _username = None
    _ip = None
    _sender = None
    # True while the current revision must be ignored (e.g. deleted
    # contributor).
    _skip_revision = False
    # Seconds granted to _diff_text() before it gives up.
    diff_timeout = 0.5
    # When truthy, revision text is run through TextCleaner first.
    clean = None
    textcleaner = None
    # Word tokenizer used by the revert heuristic in save().
    rwords = re.compile(r"[\w']+")

    def __init__(self, **kwargs):
        super(HistoryRevisionsPageProcessor, self).__init__(**kwargs)
        self.textcleaner = TextCleaner(kwargs["userns"])
        self.queue = []
        fields = ["timestamp", "lang", "title", "type", "user", "text"]
        self.csv_writer = csv.DictWriter(self.output,
                                         fieldnames=fields,
                                         delimiter='\t',
                                         quotechar='"',
                                         quoting=csv.QUOTE_ALL)

    def flush(self):
        """
        Flushes queue in the CSV output
        """
        pages = [{'title': page['title'],
                  'lang': self.lang,
                  'timestamp': page['timestamp'],
                  'text': page['text'],
                  'user': page['user'],
                  'type': page['type']} for page in self.queue]
        self.csv_writer.writerows(pages)
        self.queue = []

    def save(self):
        """
        Saves data to the queue. The queue is stored using self.flush()
        """
        if self._text is None:  # difflib doesn't like NoneType
            self._text = ""
        if self.clean:
            self._text = self.textcleaner.clean_all(self._text)
        text_words = len(self.rwords.findall(self._text))
        prev_words = len(self.rwords.findall(self._prev_text))
        # Revert heuristic: enqueue the diff unless a large revision
        # (>= 1000 words) more than doubled the previous word count.
        if text_words < 1000 or text_words <= 2 * prev_words:
            diff = _diff_text(self._prev_text, self._text,
                              timeout=self.diff_timeout)[0]
            page = {'title': smart_str(self._title),
                    'lang': self.lang,
                    'timestamp': self._date,
                    'text': smart_str(diff),
                    'user': smart_str(self._sender),
                    'type': self._type}
            self.queue.append(page)
        else:
            # logging.warning: `warn` is a deprecated alias.
            logging.warning("Revert detected: skipping... (%s)", self._date)
        self._prev_text = self._text

    def process_username(self, elem):
        if self._skip_revision:
            return
        self._username = elem.text

    def process_ip(self, elem):
        if self._skip_revision:
            return
        self._ip = elem.text

    def process_contributor(self, contributor):
        """Resolves the revision's sender from username or IP."""
        if self._skip_revision:
            return
        # NOTE(review): a None contributor marks the revision as skipped,
        # yet the sender is still resolved below — presumably harmless;
        # confirm intended.
        if contributor is None:
            self._skip_revision = True
        self._sender = self._username or self._ip
        self.delattr(("_username", "_ip"))
        if not self._sender:
            self.counter_deleted += 1
            self._skip_revision = True

    def process_title(self, elem):
        """Classifies the page; only talk and user-talk pages are kept."""
        self.delattr(("_counter", "_type", "_title", "_skip", "_date",
                      "text", "_username", "_ip", "_sender"))
        self._skip = False
        a_title = elem.text.split(':')
        if len(a_title) == 2 and a_title[0] == self.talkns:
            self._type = 'talk'
            self._title = a_title[1]
        elif len(a_title) == 2 and a_title[0] == self.usertalkns:
            self._type = 'user talk'
            self._title = a_title[1]
        elif len(a_title) == 1:
            self._type = 'normal'
            self._title = a_title[0]
        # NOTE(review): if no branch above matched, _title/_type are unset
        # here and the checks below may raise — confirm delattr semantics.
        if not self._title:
            self._skip = True
        if not self._skip:
            if not ("talk" in self._type):
                self._skip = True
            else:
                logging.info('Start processing page %s (%s)',
                             self._title, self._type)

    def process_timestamp(self, elem):
        if self._skip:
            return
        self._date = elem.text

    def process_text(self, elem):
        if self._skip:
            return
        self._text = elem.text
        self.save()

    def process_page(self, _):
        self.count += 1
        if not self.count % 1000:
            logging.info(' ### Processed %d pages', self.count)
        if not self._skip:
            with Timr('Flushing %s' % self._title):
                self.flush()
        # NOTE(review): ("text") is a plain string, not a 1-tuple — if
        # delattr() iterates its argument this passes single characters;
        # confirm against HistoryPageProcessor.delattr.
        self.delattr(("text"))
        self._skip = False

    def process_redirect(self, _):
        self._skip = True
def main():
    """Command-line entry point.

    usage: %prog [options] dic input_file output_file

    Reads per-page statistics from ``input_file`` (CSV), fetches the text
    of each page's n-th revision, cleans it, runs the PyWC dictionary
    analysis on it and writes one row of category scores per page to
    ``output_file``.
    """
    import optparse
    p = optparse.OptionParser(
        usage="usage: %prog [options] dic input_file output_file")
    p.add_option('-l', '--lang', action="store", dest="lang", default="en",
                 help="Wikipedia language")
    p.add_option('-n', '--edits', action="store", dest="edits", type=int,
                 default=500, help="Edit number to consider")
    opts, files = p.parse_args()
    if len(files) != 3:
        p.error("Wrong parameters")

    logging.basicConfig(stream=sys.stderr,
                        level=logging.DEBUG,
                        format='%(asctime)s %(levelname)s %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S')

    textcleaner = TextCleaner()
    pywc = PyWC()
    pywc.set_dic(files[0])

    # Category ids are usually numeric: sort them numerically when
    # possible, otherwise fall back to plain lexicographic sorting.
    try:
        cat_names = [str(pair[1]) for pair in
                     sorted((int(a), b) for a, b in pywc.categories.items())]
    except ValueError:
        cat_names = [str(pair[1]) for pair in sorted(pywc.categories.items())]

    # Map category name -> category id for direct score lookup below.
    reverse_categories = dict((value, key)
                              for key, value in pywc.categories.items())

    arcsin_fields = ["%s_arcsin" % key for key in cat_names]
    fields = ["title", "total_edits", "unique_editors", "traumatic",
              "non_traumatic", "natural", "human", "len", "len_cleaned"] + \
        cat_names + arcsin_fields + \
        ["qmarks", "unique", "dic", "sixltr", "total"]

    # Context managers guarantee both files are closed on every exit path
    # (the original leaked the two file objects opened inline).
    with open(files[1], "r") as input_file:
        with open(files[2], "w") as output_file:
            csv_reader = csv.reader(input_file)
            csv_writer = csv.DictWriter(output_file, fields)
            csv_writer.writeheader()

            for line in csv_reader:
                title = line[0]
                # The revision index is 0-based: the n-th edit is n - 1.
                revision_id = find_revision_id(title, opts.edits - 1,
                                               opts.lang, startid=None)
                if revision_id is None:
                    continue
                rev = get_revision(revision_id, opts.lang)
                cleaned_rev = textcleaner.clean_all(rev)
                cleaned_rev = textcleaner.clean_wiki_syntax(cleaned_rev, True)
                pywc.parse_col(cleaned_rev)
                result = {
                    "title": title,
                    "total_edits": line[1],
                    "unique_editors": line[2],
                    "traumatic": line[3],
                    "non_traumatic": line[4],
                    "natural": line[5],
                    "human": line[6],
                    "len": len(rev.split()),
                    "len_cleaned": len(cleaned_rev.split()),
                    "qmarks": pywc._qmarks,
                    "unique": len(pywc._unique),
                    "dic": pywc._dic,
                    "sixltr": pywc._sixltr,
                    "total": pywc._total,
                }
                for key, val in reverse_categories.items():
                    score = perc(pywc._results[val], pywc._total) * 100
                    result[key] = score  # percentage results
                    result["%s_arcsin" % key] = calc_arcsin(score)  # arcsin
                csv_writer.writerow(result)
class HistoryRevisionsPageProcessor(HistoryPageProcessor):
    """Streams talk/user-talk revision diffs, with contributor, to CSV.

    Extends the history processing with per-revision contributor
    resolution (username or IP) and the revert-skipping heuristic.
    Redirects are silently skipped rather than raising.
    """
    # Output CSV destination; passed straight to csv.DictWriter, so it
    # is expected to be a writable file-like object here.
    output = None
    # Buffer of per-revision dicts waiting to be written by flush().
    queue = None
    # True while the current page must be ignored.
    _skip = None
    # Text of the previously seen revision, diff baseline.
    _prev_text = ""
    _text = None
    # Contributor fields of the revision currently being parsed.
    _username = None
    _ip = None
    _sender = None
    # True while the current revision must be ignored (e.g. deleted
    # contributor).
    _skip_revision = False
    # Seconds granted to _diff_text() before it gives up.
    diff_timeout = 0.5
    # When truthy, revision text is run through TextCleaner first.
    clean = None
    textcleaner = None
    # Word tokenizer used by the revert heuristic in save().
    rwords = re.compile(r"[\w']+")

    def __init__(self, **kwargs):
        super(HistoryRevisionsPageProcessor, self).__init__(**kwargs)
        self.textcleaner = TextCleaner(kwargs["userns"])
        self.queue = []
        fields = ["timestamp", "lang", "title", "type", "user", "text"]
        self.csv_writer = csv.DictWriter(self.output,
                                         fieldnames=fields,
                                         delimiter='\t',
                                         quotechar='"',
                                         quoting=csv.QUOTE_ALL)

    def flush(self):
        """
        Flushes queue in the CSV output
        """
        pages = [{'title': page['title'],
                  'lang': self.lang,
                  'timestamp': page['timestamp'],
                  'text': page['text'],
                  'user': page['user'],
                  'type': page['type']} for page in self.queue]
        self.csv_writer.writerows(pages)
        self.queue = []

    def save(self):
        """
        Saves data to the queue. The queue is stored using self.flush()
        """
        if self._text is None:  # difflib doesn't like NoneType
            self._text = ""
        if self.clean:
            self._text = self.textcleaner.clean_all(self._text)
        text_words = len(self.rwords.findall(self._text))
        prev_words = len(self.rwords.findall(self._prev_text))
        # Revert heuristic: enqueue the diff unless a large revision
        # (>= 1000 words) more than doubled the previous word count.
        if text_words < 1000 or text_words <= 2 * prev_words:
            diff = _diff_text(self._prev_text, self._text,
                              timeout=self.diff_timeout)[0]
            page = {'title': smart_str(self._title),
                    'lang': self.lang,
                    'timestamp': self._date,
                    'text': smart_str(diff),
                    'user': smart_str(self._sender),
                    'type': self._type}
            self.queue.append(page)
        else:
            # logging.warning: `warn` is a deprecated alias.
            logging.warning("Revert detected: skipping... (%s)", self._date)
        self._prev_text = self._text

    def process_username(self, elem):
        if self._skip_revision:
            return
        self._username = elem.text

    def process_ip(self, elem):
        if self._skip_revision:
            return
        self._ip = elem.text

    def process_contributor(self, contributor):
        """Resolves the revision's sender from username or IP."""
        if self._skip_revision:
            return
        # NOTE(review): a None contributor marks the revision as skipped,
        # yet the sender is still resolved below — presumably harmless;
        # confirm intended.
        if contributor is None:
            self._skip_revision = True
        self._sender = self._username or self._ip
        self.delattr(("_username", "_ip"))
        if not self._sender:
            self.counter_deleted += 1
            self._skip_revision = True

    def process_title(self, elem):
        """Classifies the page; only talk and user-talk pages are kept."""
        self.delattr(("_counter", "_type", "_title", "_skip", "_date",
                      "text", "_username", "_ip", "_sender"))
        self._skip = False
        a_title = elem.text.split(':')
        if len(a_title) == 2 and a_title[0] == self.talkns:
            self._type = 'talk'
            self._title = a_title[1]
        elif len(a_title) == 2 and a_title[0] == self.usertalkns:
            self._type = 'user talk'
            self._title = a_title[1]
        elif len(a_title) == 1:
            self._type = 'normal'
            self._title = a_title[0]
        # NOTE(review): if no branch above matched, _title/_type are unset
        # here and the checks below may raise — confirm delattr semantics.
        if not self._title:
            self._skip = True
        if not self._skip:
            if not ("talk" in self._type):
                self._skip = True
            else:
                logging.info('Start processing page %s (%s)',
                             self._title, self._type)

    def process_timestamp(self, elem):
        if self._skip:
            return
        self._date = elem.text

    def process_text(self, elem):
        if self._skip:
            return
        self._text = elem.text
        self.save()

    def process_page(self, _):
        self.count += 1
        if not self.count % 1000:
            logging.info(' ### Processed %d pages', self.count)
        if not self._skip:
            with Timr('Flushing %s' % self._title):
                self.flush()
        # NOTE(review): ("text") is a plain string, not a 1-tuple — if
        # delattr() iterates its argument this passes single characters;
        # confirm against HistoryPageProcessor.delattr.
        self.delattr(("text"))
        self._skip = False

    def process_redirect(self, _):
        self._skip = True