def __init__(self, **kwargs):
    """Set up the word counter and the empty result containers.

    All keyword arguments are forwarded to the parent constructor first.
    A ``dic`` keyword argument is required; a KeyError is raised without it.
    """
    super(PyWCProcessor, self).__init__(**kwargs)
    self.dic = kwargs["dic"]
    # self.output is not assigned in this method -- presumably it is set by
    # the parent constructor; confirm.
    counter = PyWC(self.dic, self.output)
    counter.tuning = True
    self.pywc = counter
    self.data = {}
    self.detailed_data = {}
class TestPyWC(unittest.TestCase):
    """Tests for PyWC driven by the fixture files under ``tests/pywc``."""

    def setUp(self):
        """Create a PyWC instance loaded with the small test dictionary."""
        self.pywc = PyWC()
        self.pywc.set_dic("tests/pywc/simple-dic.dic")

    def test_read_dic(self):
        """The test dictionary yields 4 categories and 9 keywords."""
        self.assertEqual(len(self.pywc.categories), 4)
        self.assertEqual(len(self.pywc.keywords), 9)

    def test_output(self):
        """Processing the input fixture reproduces the expected CSV."""
        with open("tests/pywc/pywc_expected.csv") as expected_file:
            expected = expected_file.read()

        self.pywc.csv_out = open("tests/pywc/pywc_result.csv", "w")
        try:
            with open("tests/pywc/pywc_input.csv") as src:
                self.pywc.start(src)
                self.pywc.flush()
        finally:
            # Close the result file even when processing fails, so the
            # handle is never leaked and buffered rows reach the disk.
            self.pywc.csv_out.close()

        with open("tests/pywc/pywc_result.csv") as result_file:
            result = result_file.read()
        self.assertEqual(expected, result)
def main():
    """Score one revision of each listed page with PyWC and write a CSV.

    Command line: ``[options] dic input_file output_file``.

    Each input row supplies the page title in column 0 and six values
    (columns 1-6) that are copied to the output unchanged.  For every title
    the revision selected by ``--edits`` is looked up, cleaned and parsed
    with PyWC.  Rows whose revision id cannot be found are skipped.

    The output holds, per page, the copied columns, the raw and cleaned
    word counts, one percentage and one ``<category>_arcsin`` column per
    dictionary category, and PyWC's summary counters.

    NOTE(review): find_revision_id, get_revision, perc, calc_arcsin,
    TextCleaner and PyWC are defined outside this block; their exact
    semantics are not visible here.
    """
    import optparse

    p = optparse.OptionParser(
        usage="usage: %prog [options] dic input_file output_file")
    p.add_option('-l', '--lang', action="store", dest="lang", default="en",
                 help="Wikipedia language")
    p.add_option('-n', '--edits', action="store", dest="edits", type=int,
                 default=500, help="Edit number to consider")
    opts, files = p.parse_args()
    if len(files) != 3:
        p.error("Wrong parameters")

    logging.basicConfig(stream=sys.stderr,
                        level=logging.DEBUG,
                        format='%(asctime)s %(levelname)s %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S')

    textcleaner = TextCleaner()
    pywc = PyWC()
    pywc.set_dic(files[0])

    # Order category names by numeric id when every id parses as an
    # integer; otherwise fall back to the plain ordering of the ids.
    try:
        cat_names = [str(name) for _, name in
                     sorted((int(cat_id), name)
                            for cat_id, name in pywc.categories.items())]
    except ValueError:
        cat_names = [str(name) for _, name in
                     sorted(pywc.categories.items())]

    # Map category name -> category id, to look results up by name.
    reverse_categories = dict(
        (name, cat_id) for cat_id, name in pywc.categories.items())

    arcsin_fields = ["%s_arcsin" % name for name in cat_names]
    fields = ["title", "total_edits", "unique_editors", "traumatic",
              "non_traumatic", "natural", "human", "len", "len_cleaned"] + \
             cat_names + arcsin_fields + \
             ["qmarks", "unique", "dic", "sixltr", "total"]

    # Context managers guarantee both files are closed (and the output
    # flushed) even if fetching or parsing a revision raises.
    with open(files[1], "r") as src, open(files[2], "w") as dst:
        csv_reader = csv.reader(src)
        csv_writer = csv.DictWriter(dst, fields)
        csv_writer.writeheader()

        rev_index = opts.edits - 1
        for line in csv_reader:
            title = line[0]
            revision_id = find_revision_id(title, rev_index, opts.lang,
                                           startid=None)
            if revision_id is None:
                continue

            rev_text = get_revision(revision_id, opts.lang)
            cleaned_rev = textcleaner.clean_all(rev_text)
            cleaned_rev = textcleaner.clean_wiki_syntax(cleaned_rev, True)
            pywc.parse_col(cleaned_rev)

            result = {
                "title": title,
                "total_edits": line[1],
                "unique_editors": line[2],
                "traumatic": line[3],
                "non_traumatic": line[4],
                "natural": line[5],
                "human": line[6],
                "len": len(rev_text.split()),
                "len_cleaned": len(cleaned_rev.split()),
                "qmarks": pywc._qmarks,
                "unique": len(pywc._unique),
                "dic": pywc._dic,
                "sixltr": pywc._sixltr,
                "total": pywc._total,
            }

            for name, cat_id in reverse_categories.items():
                score = perc(pywc._results[cat_id], pywc._total) * 100
                result[name] = score  # percentage results
                result["%s_arcsin" % name] = calc_arcsin(score)  # arcsin results

            csv_writer.writerow(result)
class PyWCProcessor(HistoryRevisionsPageProcessor):
    """Feed the text added by each revision to PyWC and aggregate the counts.

    Counts are accumulated per namespace and per day in ``self.data``.  When
    ``self.pywc.detailed`` is set, per-keyword page and user counters for
    the ``detailed_ns`` namespace are kept in ``self.detailed_data``.

    NOTE(review): ``output``, ``clean``, ``textcleaner``, ``rwords``,
    ``diff_timeout``, ``count``, ``_text``, ``_prev_text`` and ``delattr``
    are not defined in this class; presumably they come from
    HistoryRevisionsPageProcessor -- confirm.  ``namespaces`` must be set by
    the caller before titles are processed (it defaults to None).
    """
    pywc = None
    namespaces = None
    data = None
    dic = None
    detailed_start = None
    detailed_end = None
    detailed_ns = None

    # revision related variables
    _username = None
    _ip = None
    _sender = None
    _skip_revision = False

    def __init__(self, **kwargs):
        """Build the PyWC instance from the required ``dic`` keyword."""
        super(PyWCProcessor, self).__init__(**kwargs)
        self.dic = kwargs["dic"]
        self.pywc = PyWC(self.dic, self.output)
        self.pywc.tuning = True
        self.data = {}
        self.detailed_data = {}

    def save(self):
        """Count the words added by the current revision and aggregate them.

        Nothing is recorded when the revision is flagged to be skipped, when
        the text grew suspiciously (treated as a revert), or when the added
        text has very few unique words for its size.
        """
        if self._skip_revision:
            return
        if self._text is None:  # difflib doesn't like NoneType
            self._text = ""
        if self.clean:
            self._text = self.textcleaner.clean_all(self._text)

        text_words = len(self.rwords.findall(self._text))
        prev_words = len(self.rwords.findall(self._prev_text))
        # A long text that more than doubled in size is treated as a revert.
        if text_words >= 1000 and text_words > 2 * prev_words:
            logging.warning("Revert detected: skipping... (%s)", self._date)
            self._prev_text = self._text
            return

        # Only the first element of _diff_text's result is parsed
        # (presumably the added text -- confirm against _diff_text).
        diff = _diff_text(self._prev_text, self._text,
                          timeout=self.diff_timeout)[0]
        self.pywc.parse_col(diff)
        unique = len(self.pywc._unique)
        inserted_words = self.pywc._total

        # The original used ``continue`` here, which is a SyntaxError outside
        # a loop.  Skip the revision instead, still remembering its text so
        # the next diff is computed against it.
        # NOTE(review): with integer counts under Python 2 the division
        # below floors.
        if 100 <= inserted_words < 1000 and unique < inserted_words / 10:
            logging.warning("Vandalism detected! (%s)", self._date)
            self._prev_text = self._text
            return

        date_str = self._date.strftime("%Y/%m/%d")
        current = self.data.setdefault(self._type, {})
        counts = {"date": date_str,
                  "qmarks": self.pywc._qmarks,
                  "unique": unique,
                  "dic": self.pywc._dic,
                  "sixltr": self.pywc._sixltr,
                  "total": self.pywc._total}
        for category in self.pywc.categories:
            counts[category] = self.pywc._results[category]

        if date_str not in current:
            counts["edits"] = 1
            current[date_str] = counts
        else:
            day = current[date_str]
            for field, value in counts.items():
                if field != "date":
                    day[field] += value
            day["edits"] += 1

        if self.pywc.detailed and self._type == self.detailed_ns:
            if date_str not in self.detailed_data:
                self.detailed_data[date_str] = defaultdict(dict)
            per_keyword = self.detailed_data[date_str]
            for keyword in self.pywc._detailed_data:
                occ = self.pywc._detailed_data[keyword]
                # The defaultdict hands back a fresh empty dict for a new
                # keyword; initialise it in place on first use.
                entry = per_keyword[keyword]
                if not entry:
                    entry["total"] = 0
                    entry["pages"] = Counter()
                    entry["users"] = Counter()
                entry["total"] += occ
                entry["pages"][self._title] += occ
                entry["users"][self._sender] += occ

        self._prev_text = self._text

    def flush(self):
        """Write the aggregated rows and one detailed file per day.

        Aggregated rows go to ``self.pywc.csv_writer``, ordered by date
        within each namespace.  Each day in ``self.detailed_data`` is written
        to a tab-separated file named ``<output name>_detailed_<YYYYMMDD>``.
        """
        for ns in self.data:
            for date in sorted(self.data[ns]):
                row = {"ns": ns, "date": date}
                row.update(self.data[ns][date])
                self.pywc.csv_writer.writerow(row)

        for date in self.detailed_data:
            filename = "%s_detailed_%s" % (self.output.name,
                                           date.replace("/", ""))
            with open(filename, "w") as f:
                detailed_csv = csv.writer(f, delimiter="\t")
                for keyword in self.detailed_data[date]:
                    current = self.detailed_data[date][keyword]
                    # most_common returns the highest counts first; the
                    # previous ascending sort + slice yielded the 20 *least*
                    # frequent pages and users.
                    top_pages = current["pages"].most_common(20)
                    top_users = current["users"].most_common(20)
                    detailed_csv.writerow([keyword, current["total"],
                                           top_pages, len(current["pages"]),
                                           top_users, len(current["users"])])

    def process_page(self, _):
        """Count a finished page and log progress every 1000 pages."""
        self.count += 1
        if not self.count % 1000:
            logging.info(' ### Processed %d pages', self.count)
        # NOTE(review): ("text") is a plain string, not a 1-tuple; left
        # unchanged because delattr's handling of strings is not visible here.
        self.delattr(("text"))
        self._skip = False

    def process_redirect(self, _):
        """Mark the current page as a redirect to be skipped."""
        self._skip = True

    def process_title(self, elem):
        """Reset per-page state and derive the namespace from the title."""
        self.delattr(("_counter", "_type", "_title", "_skip", "_date",
                      "text", "_username", "_ip"))
        if self._skip_revision:
            return
        self._skip = False
        print(elem.text)
        self._title = smart_str(elem.text)
        a_title = self._title.split(':')
        # A prefix only counts as a namespace when it is a known one.
        if len(a_title) > 1 and a_title[0] in self.namespaces:
            self._type = a_title[0]
        else:
            self._type = "Normal"

    def process_timestamp(self, elem):
        """Store the revision date, or flag revisions outside the window."""
        if self._skip_revision:
            return
        revision_time = mwlib.ts2dt(elem.text)
        too_late = self.detailed_end and revision_time > self.detailed_end
        too_early = (self.detailed_start
                     and revision_time < self.detailed_start)
        if too_late or too_early:
            self._skip_revision = True
        else:
            self._date = revision_time

    def process_contributor(self, contributor):
        """Resolve the sender (username, else IP); skip if there is none."""
        if self._skip_revision:
            return
        if contributor is None:
            self._skip_revision = True
        self._sender = self._username or self._ip
        self.delattr(("_username", "_ip"))
        if not self._sender:
            self._skip_revision = True

    def process_revision(self, _):
        """Clear the skip flag and, unless skipped, the revision state."""
        skip = self._skip_revision
        self._skip_revision = False
        if skip:
            return
        self.delattr(("_username", "_ip", "_date"))

    def process_username(self, elem):
        """Remember the contributor's username."""
        if self._skip_revision:
            return
        self._username = elem.text

    def process_ip(self, elem):
        """Remember the contributor's IP address."""
        if self._skip_revision:
            return
        self._ip = elem.text
def setUp(self):
    """Give each test a PyWC instance loaded with the small test dictionary."""
    counter = PyWC()
    counter.set_dic("tests/pywc/simple-dic.dic")
    self.pywc = counter
class PyWCProcessor(HistoryRevisionsPageProcessor):
    """Feed the text added by each revision to PyWC and aggregate the counts.

    Counts are accumulated per namespace and per day in ``self.data``.  When
    ``self.pywc.detailed`` is set, per-keyword page and user counters for
    the ``detailed_ns`` namespace are kept in ``self.detailed_data``.

    NOTE(review): ``output``, ``clean``, ``textcleaner``, ``rwords``,
    ``diff_timeout``, ``count``, ``_text``, ``_prev_text`` and ``delattr``
    are not defined in this class; presumably they come from
    HistoryRevisionsPageProcessor -- confirm.  ``namespaces`` must be set by
    the caller before titles are processed (it defaults to None).
    """
    pywc = None
    namespaces = None
    data = None
    dic = None
    detailed_start = None
    detailed_end = None
    detailed_ns = None

    # revision related variables
    _username = None
    _ip = None
    _sender = None
    _skip_revision = False

    def __init__(self, **kwargs):
        """Build the PyWC instance from the required ``dic`` keyword."""
        super(PyWCProcessor, self).__init__(**kwargs)
        self.dic = kwargs["dic"]
        self.pywc = PyWC(self.dic, self.output)
        self.pywc.tuning = True
        self.data = {}
        self.detailed_data = {}

    def save(self):
        """Count the words added by the current revision and aggregate them.

        Nothing is recorded when the revision is flagged to be skipped, when
        the text grew suspiciously (treated as a revert), or when the added
        text looks like vandalism (more than 1000 words, or very few unique
        words for its size).
        """
        if self._skip_revision:
            return
        if self._text is None:  # difflib doesn't like NoneType
            self._text = ""
        if self.clean:
            self._text = self.textcleaner.clean_all(self._text)

        text_words = len(self.rwords.findall(self._text))
        prev_words = len(self.rwords.findall(self._prev_text))
        # A long text that more than doubled in size is treated as a revert.
        if text_words >= 1000 and text_words > 2 * prev_words:
            logging.warning("Revert detected: skipping... (%s)", self._date)
            self._prev_text = self._text
            return

        # Only the first element of _diff_text's result is parsed
        # (presumably the added text -- confirm against _diff_text).
        diff = _diff_text(self._prev_text, self._text,
                          timeout=self.diff_timeout)[0]
        self.pywc.parse_col(diff)
        unique = len(self.pywc._unique)
        inserted_words = self.pywc._total

        # NOTE(review): with integer counts under Python 2 the division
        # below floors.
        if inserted_words > 1000 or unique < inserted_words / 10:
            logging.warning("Vandalism detected! (%s)", self._date)
            self._prev_text = self._text
            return

        date_str = self._date.strftime("%Y/%m/%d")
        current = self.data.setdefault(self._type, {})
        counts = {"date": date_str,
                  "qmarks": self.pywc._qmarks,
                  "unique": unique,
                  "dic": self.pywc._dic,
                  "sixltr": self.pywc._sixltr,
                  "total": self.pywc._total}
        for category in self.pywc.categories:
            counts[category] = self.pywc._results[category]

        if date_str not in current:
            counts["edits"] = 1
            current[date_str] = counts
        else:
            day = current[date_str]
            for field, value in counts.items():
                if field != "date":
                    day[field] += value
            day["edits"] += 1

        if self.pywc.detailed and self._type == self.detailed_ns:
            if date_str not in self.detailed_data:
                self.detailed_data[date_str] = defaultdict(dict)
            per_keyword = self.detailed_data[date_str]
            for keyword in self.pywc._detailed_data:
                occ = self.pywc._detailed_data[keyword]
                # The defaultdict hands back a fresh empty dict for a new
                # keyword; initialise it in place on first use.
                entry = per_keyword[keyword]
                if not entry:
                    entry["total"] = 0
                    entry["pages"] = Counter()
                    entry["users"] = Counter()
                entry["total"] += occ
                entry["pages"][self._title] += occ
                entry["users"][self._sender] += occ

        self._prev_text = self._text

    def flush(self):
        """Write the aggregated rows and one detailed file per day.

        Aggregated rows go to ``self.pywc.csv_writer``, ordered by date
        within each namespace.  Each day in ``self.detailed_data`` is written
        to a tab-separated file named ``<output name>_detailed_<YYYYMMDD>``.
        """
        for ns in self.data:
            for date in sorted(self.data[ns]):
                row = {"ns": ns, "date": date}
                row.update(self.data[ns][date])
                self.pywc.csv_writer.writerow(row)

        for date in self.detailed_data:
            filename = "%s_detailed_%s" % (self.output.name,
                                           date.replace("/", ""))
            with open(filename, "w") as f:
                detailed_csv = csv.writer(f, delimiter="\t")
                for keyword in self.detailed_data[date]:
                    current = self.detailed_data[date][keyword]
                    # most_common returns the highest counts first; the
                    # previous ascending sort + slice yielded the 20 *least*
                    # frequent pages and users.
                    top_pages = current["pages"].most_common(20)
                    top_users = current["users"].most_common(20)
                    detailed_csv.writerow([keyword, current["total"],
                                           top_pages, len(current["pages"]),
                                           top_users, len(current["users"])])

    def process_page(self, _):
        """Count a finished page and log progress every 1000 pages."""
        self.count += 1
        if not self.count % 1000:
            logging.info(' ### Processed %d pages', self.count)
        # NOTE(review): ("text") is a plain string, not a 1-tuple; left
        # unchanged because delattr's handling of strings is not visible here.
        self.delattr(("text"))
        self._skip = False

    def process_redirect(self, _):
        """Mark the current page as a redirect to be skipped."""
        self._skip = True

    def process_title(self, elem):
        """Reset per-page state and derive the namespace from the title."""
        self.delattr(("_counter", "_type", "_title", "_skip", "_date",
                      "text", "_username", "_ip"))
        if self._skip_revision:
            return
        self._skip = False
        print(elem.text)
        self._title = smart_str(elem.text)
        a_title = self._title.split(':')
        # A prefix only counts as a namespace when it is a known one.
        if len(a_title) > 1 and a_title[0] in self.namespaces:
            self._type = a_title[0]
        else:
            self._type = "Normal"

    def process_timestamp(self, elem):
        """Store the revision date, or flag revisions outside the window."""
        if self._skip_revision:
            return
        revision_time = mwlib.ts2dt(elem.text)
        too_late = self.detailed_end and revision_time > self.detailed_end
        too_early = (self.detailed_start
                     and revision_time < self.detailed_start)
        if too_late or too_early:
            self._skip_revision = True
        else:
            self._date = revision_time

    def process_contributor(self, contributor):
        """Resolve the sender (username, else IP); skip if there is none."""
        if self._skip_revision:
            return
        if contributor is None:
            self._skip_revision = True
        self._sender = self._username or self._ip
        self.delattr(("_username", "_ip"))
        if not self._sender:
            self._skip_revision = True

    def process_revision(self, _):
        """Clear the skip flag and, unless skipped, the revision state."""
        skip = self._skip_revision
        self._skip_revision = False
        if skip:
            return
        self.delattr(("_username", "_ip", "_date"))

    def process_username(self, elem):
        """Remember the contributor's username."""
        if self._skip_revision:
            return
        self._username = elem.text

    def process_ip(self, elem):
        """Remember the contributor's IP address."""
        if self._skip_revision:
            return
        self._ip = elem.text
def main():
    """Score one revision of each listed page with PyWC and write a CSV.

    Command line: ``[options] dic input_file output_file``.

    Each input row supplies the page title in column 0 and six values
    (columns 1-6) that are copied to the output unchanged.  For every title
    the revision selected by ``--edits`` is looked up, cleaned and parsed
    with PyWC.  Rows whose revision id cannot be found are skipped.

    The output holds, per page, the copied columns, the raw and cleaned
    word counts, one percentage and one ``<category>_arcsin`` column per
    dictionary category, and PyWC's summary counters.

    NOTE(review): find_revision_id, get_revision, perc, calc_arcsin,
    TextCleaner and PyWC are defined outside this block; their exact
    semantics are not visible here.
    """
    import optparse

    p = optparse.OptionParser(
        usage="usage: %prog [options] dic input_file output_file")
    p.add_option('-l', '--lang', action="store", dest="lang", default="en",
                 help="Wikipedia language")
    p.add_option('-n', '--edits', action="store", dest="edits", type=int,
                 default=500, help="Edit number to consider")
    opts, files = p.parse_args()
    if len(files) != 3:
        p.error("Wrong parameters")

    logging.basicConfig(stream=sys.stderr,
                        level=logging.DEBUG,
                        format='%(asctime)s %(levelname)s %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S')

    textcleaner = TextCleaner()
    pywc = PyWC()
    pywc.set_dic(files[0])

    # Order category names by numeric id when every id parses as an
    # integer; otherwise fall back to the plain ordering of the ids.
    try:
        cat_names = [str(name) for _, name in
                     sorted((int(cat_id), name)
                            for cat_id, name in pywc.categories.items())]
    except ValueError:
        cat_names = [str(name) for _, name in
                     sorted(pywc.categories.items())]

    # Map category name -> category id, to look results up by name.
    reverse_categories = dict(
        (name, cat_id) for cat_id, name in pywc.categories.items())

    arcsin_fields = ["%s_arcsin" % name for name in cat_names]
    fields = ["title", "total_edits", "unique_editors", "traumatic",
              "non_traumatic", "natural", "human", "len", "len_cleaned"] + \
             cat_names + arcsin_fields + \
             ["qmarks", "unique", "dic", "sixltr", "total"]

    # Context managers guarantee both files are closed (and the output
    # flushed) even if fetching or parsing a revision raises.
    with open(files[1], "r") as src, open(files[2], "w") as dst:
        csv_reader = csv.reader(src)
        csv_writer = csv.DictWriter(dst, fields)
        csv_writer.writeheader()

        rev_index = opts.edits - 1
        for line in csv_reader:
            title = line[0]
            revision_id = find_revision_id(title, rev_index, opts.lang,
                                           startid=None)
            if revision_id is None:
                continue

            rev_text = get_revision(revision_id, opts.lang)
            cleaned_rev = textcleaner.clean_all(rev_text)
            cleaned_rev = textcleaner.clean_wiki_syntax(cleaned_rev, True)
            pywc.parse_col(cleaned_rev)

            result = {
                "title": title,
                "total_edits": line[1],
                "unique_editors": line[2],
                "traumatic": line[3],
                "non_traumatic": line[4],
                "natural": line[5],
                "human": line[6],
                "len": len(rev_text.split()),
                "len_cleaned": len(cleaned_rev.split()),
                "qmarks": pywc._qmarks,
                "unique": len(pywc._unique),
                "dic": pywc._dic,
                "sixltr": pywc._sixltr,
                "total": pywc._total,
            }

            for name, cat_id in reverse_categories.items():
                score = perc(pywc._results[cat_id], pywc._total) * 100
                result[name] = score  # percentage results
                result["%s_arcsin" % name] = calc_arcsin(score)  # arcsin results

            csv_writer.writerow(result)
class PyWCProcessor(HistoryRevisionsPageProcessor):
    """Feed the text added by each revision to PyWC and aggregate the counts.

    Counts are accumulated per namespace and per day in ``self.data``.

    NOTE(review): ``output``, ``clean``, ``textcleaner``, ``rwords``,
    ``diff_timeout``, ``count``, ``_text``, ``_prev_text``, ``_date`` and
    ``delattr`` are not defined in this class; presumably they come from
    HistoryRevisionsPageProcessor -- confirm.  ``namespaces`` must be set by
    the caller before titles are processed (it defaults to None).
    """
    pywc = None
    namespaces = None
    data = None
    dic = None

    def __init__(self, **kwargs):
        """Build the PyWC instance from the required ``dic`` keyword."""
        super(PyWCProcessor, self).__init__(**kwargs)
        self.dic = kwargs["dic"]
        self.pywc = PyWC(self.dic, self.output)
        self.pywc.tuning = True
        self.data = {}

    def save(self):
        """Count the words added by the current revision and aggregate them.

        Nothing is recorded when the text grew suspiciously (a long text
        that more than doubled in size), which is treated as a revert.
        """
        if self._text is None:  # difflib doesn't like NoneType
            self._text = ""
        if self.clean:
            self._text = self.textcleaner.clean_all(self._text)

        text_words = len(self.rwords.findall(self._text))
        prev_words = len(self.rwords.findall(self._prev_text))
        if text_words >= 1000 and text_words > 2 * prev_words:
            logging.warning("Revert detected: skipping... (%s)", self._date)
            self._prev_text = self._text
            return

        # Only the first element of _diff_text's result is parsed
        # (presumably the added text -- confirm against _diff_text).
        diff = _diff_text(self._prev_text, self._text,
                          timeout=self.diff_timeout)[0]
        self.pywc.parse_col(diff)

        # ``in``-style lookups replace dict.has_key, which no longer exists
        # in Python 3 and is discouraged in Python 2.
        current = self.data.setdefault(self._type, {})
        date_str = mwlib.ts2dt(self._date).strftime("%Y/%m/%d")
        counts = {"date": date_str,
                  "qmarks": self.pywc._qmarks,
                  "unique": len(self.pywc._unique),
                  "dic": self.pywc._dic,
                  "sixltr": self.pywc._sixltr,
                  "total": self.pywc._total}
        for category in self.pywc.categories:
            counts[category] = self.pywc._results[category]

        if date_str not in current:
            counts["edits"] = 1
            current[date_str] = counts
        else:
            day = current[date_str]
            for field, value in counts.items():
                if field != "date":
                    day[field] += value
            day["edits"] += 1

        self._prev_text = self._text

    def flush(self):
        """Write one row per namespace and day, ordered by date."""
        for ns in self.data:
            for date in sorted(self.data[ns]):
                row = {"ns": ns, "date": date}
                row.update(self.data[ns][date])
                self.pywc.csv_writer.writerow(row)

    def process_page(self, _):
        """Count a finished page and log progress every 1000 pages."""
        self.count += 1
        if not self.count % 1000:
            logging.info(' ### Processed %d pages', self.count)
        # NOTE(review): ("text") is a plain string, not a 1-tuple; left
        # unchanged because delattr's handling of strings is not visible here.
        self.delattr(("text"))
        self._skip = False

    def process_redirect(self, _):
        """Mark the current page as a redirect to be skipped."""
        self._skip = True

    def process_title(self, elem):
        """Reset per-page state and derive the namespace from the title.

        The element's text is replaced in place by its ``smart_str`` form.
        """
        self.delattr(("_counter", "_type", "_title", "_skip", "_date",
                      "text"))
        self._skip = False
        print(elem.text)
        elem.text = smart_str(elem.text)
        a_title = elem.text.split(':')
        # A prefix only counts as a namespace when it is a known one.
        if len(a_title) > 1 and a_title[0] in self.namespaces:
            self._type = a_title[0]
        else:
            self._type = "Normal"