Beispiel #1
0
def main():
    """Run PyWC over one revision of every page listed in a CSV file.

    Command line: ``[options] dic input_file output_file``.

    * ``dic`` is handed to ``PyWC.set_dic``.
    * ``input_file`` is read as CSV; column 0 is used as the page title and
      columns 1-6 are copied verbatim into the output fields
      ``total_edits``, ``unique_editors``, ``traumatic``, ``non_traumatic``,
      ``natural`` and ``human``.
    * ``output_file`` receives one CSV row per page for which
      ``find_revision_id`` returned a revision id; pages for which it
      returned ``None`` are skipped.

    For each category the output holds ``perc(count, total) * 100`` under
    the category name and ``calc_arcsin`` of that score under
    ``<name>_arcsin``.

    Both files are opened in a ``with`` block so they are closed (and the
    output flushed) even if processing a row raises.
    """
    import optparse
    p = optparse.OptionParser(
        usage="usage: %prog [options] dic input_file output_file")
    p.add_option('-l',
                 '--lang',
                 action="store",
                 dest="lang",
                 default="en",
                 help="Wikipedia language")
    p.add_option('-n',
                 '--edits',
                 action="store",
                 dest="edits",
                 type=int,
                 default=500,
                 help="Edit number to consider")
    opts, files = p.parse_args()
    if len(files) != 3:
        p.error("Wrong parameters")
    logging.basicConfig(stream=sys.stderr,
                        level=logging.DEBUG,
                        format='%(asctime)s %(levelname)s %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S')

    textcleaner = TextCleaner()
    pywc = PyWC()
    pywc.set_dic(files[0])

    categories = pywc.categories
    # Order the category names by numeric id when every id parses as an
    # int; otherwise fall back to the plain ordering of the (id, name) pairs.
    try:
        ordered = sorted((int(cat_id), name)
                         for cat_id, name in categories.items())
    except ValueError:
        ordered = sorted(categories.items())
    cat_names = [str(name) for _, name in ordered]

    # category name -> category id (the key used in pywc._results)
    reverse_categories = {}
    for cat_id, name in categories.items():
        reverse_categories[name] = cat_id

    arcsin_fields = ["%s_arcsin" % name for name in cat_names]

    fields = ["title", "total_edits", "unique_editors", "traumatic",
              "non_traumatic", "natural", "human", "len", "len_cleaned"] + \
             cat_names + arcsin_fields + \
             ["qmarks", "unique", "dic", "sixltr", "total"]

    rev_index = opts.edits - 1

    with open(files[1], "r") as in_file, open(files[2], "w") as out_file:
        csv_reader = csv.reader(in_file)
        csv_writer = csv.DictWriter(out_file, fields)
        csv_writer.writeheader()

        for line in csv_reader:
            title = line[0]
            revision_id = find_revision_id(title, rev_index, opts.lang,
                                           startid=None)
            if revision_id is None:
                continue
            rev_text = get_revision(revision_id, opts.lang)

            cleaned_rev = textcleaner.clean_all(rev_text)
            cleaned_rev = textcleaner.clean_wiki_syntax(cleaned_rev, True)

            pywc.parse_col(cleaned_rev)

            result = {
                "title": title,
                "total_edits": line[1],
                "unique_editors": line[2],
                "traumatic": line[3],
                "non_traumatic": line[4],
                "natural": line[5],
                "human": line[6],
                "len": len(rev_text.split()),
                "len_cleaned": len(cleaned_rev.split()),
                "qmarks": pywc._qmarks,
                "unique": len(pywc._unique),
                "dic": pywc._dic,
                "sixltr": pywc._sixltr,
                "total": pywc._total,
            }

            for name, cat_id in reverse_categories.items():
                score = perc(pywc._results[cat_id], pywc._total) * 100
                result[name] = score  # percentage results
                result["%s_arcsin" % name] = calc_arcsin(score)  # arcsin

            csv_writer.writerow(result)
Beispiel #2
0
class PyWCProcessor(HistoryRevisionsPageProcessor):
    """Accumulate PyWC counters over page revisions, grouped by day.

    For every accepted revision the text is diffed against the previous
    revision, the diff is fed to ``PyWC.parse_col`` and the resulting
    counters are summed into ``self.data[namespace][YYYY/MM/DD]``.  When
    ``self.pywc.detailed`` is set, per-keyword totals are additionally
    collected for revisions whose namespace equals ``detailed_ns``.
    """
    pywc = None
    namespaces = None
    data = None
    dic = None
    # Optional datetime bounds; revisions outside them are skipped.
    detailed_start = None
    detailed_end = None
    # Namespace for which per-keyword details are collected.
    detailed_ns = None
    # revision related variables
    _username = None
    _ip = None
    _sender = None
    _skip_revision = False

    def __init__(self, **kwargs):
        """Create the processor; requires a ``dic`` keyword argument."""
        super(PyWCProcessor, self).__init__(**kwargs)
        self.dic = kwargs["dic"]
        self.pywc = PyWC(self.dic, self.output)
        self.pywc.tuning = True
        self.data = {}
        self.detailed_data = {}

    def save(self):
        """Analyse the current revision and fold its counters into the totals.

        Does nothing when the revision is flagged to be skipped.  A revision
        with 1000 words or more that is also more than twice as long as the
        previous one is logged as a revert and not analysed.  A diff with
        100-999 words of which fewer than a tenth are unique is logged as
        vandalism and not counted.
        """
        if self._skip_revision:
            return
        if self._text is None:  # difflib doesn't like NoneType
            self._text = ""
        if self.clean:
            self._text = self.textcleaner.clean_all(self._text)
        text_words = len(self.rwords.findall(self._text))
        prev_words = len(self.rwords.findall(self._prev_text))
        if text_words < 1000 or text_words <= 2 * prev_words:
            # presumably element 0 of the diff result is the inserted text
            # (its word count is called "inserted_words" below) -- verify
            # against _diff_text
            diff = _diff_text(self._prev_text,
                              self._text,
                              timeout=self.diff_timeout)[0]
            self.pywc.parse_col(diff)

            unique = len(self.pywc._unique)
            inserted_words = self.pywc._total
            if 100 <= inserted_words < 1000 and unique < inserted_words / 10:
                # The original used `continue` here, which is a SyntaxError
                # outside a loop.  Skip this revision instead, but still
                # remember its text as the base for the next diff.
                logging.warning("Vandalism detected! (%s)", self._date)
                self._prev_text = self._text
                return

            if self._type not in self.data:
                self.data[self._type] = {}
            current = self.data[self._type]
            date_str = self._date.strftime("%Y/%m/%d")
            tmp = {"date": date_str,
                   "qmarks": self.pywc._qmarks,
                   "unique": unique,
                   "dic": self.pywc._dic,
                   "sixltr": self.pywc._sixltr,
                   "total": self.pywc._total}
            for x in self.pywc.categories:
                tmp[x] = self.pywc._results[x]

            if date_str not in current:
                current[date_str] = tmp
                current[date_str]["edits"] = 1
            else:
                # Sum every counter of this revision into the day's totals.
                for elem in tmp:
                    if elem != "date":
                        current[date_str][elem] += tmp[elem]
                current[date_str]["edits"] += 1

            if self.pywc.detailed and self._type == self.detailed_ns:
                if date_str not in self.detailed_data:
                    self.detailed_data[date_str] = defaultdict(dict)
                day_details = self.detailed_data[date_str]
                for keyword in self.pywc._detailed_data:
                    occ = self.pywc._detailed_data[keyword]
                    entry = day_details[keyword]
                    if not entry:
                        # first occurrence of this keyword on this day
                        entry["total"] = 0
                        entry["pages"] = Counter()
                        entry["users"] = Counter()
                    entry["total"] += occ
                    entry["pages"][self._title] += occ
                    entry["users"][self._sender] += occ
        else:
            logging.warning("Revert detected: skipping... (%s)", self._date)
        self._prev_text = self._text

    def flush(self):
        """Write the accumulated statistics.

        One row per (namespace, day) goes to ``self.pywc.csv_writer``.  For
        every day with detailed data a tab-separated file named
        ``<output name>_detailed_<YYYYMMDD>`` is written with, per keyword:
        total occurrences, the 20 pages with most occurrences, the number of
        pages, the 20 users with most occurrences and the number of users.
        """
        for ns in self.data:
            for date in sorted(self.data[ns]):
                tmp = {"ns": ns, "date": date}
                tmp.update(self.data[ns][date])
                self.pywc.csv_writer.writerow(tmp)
        for date in self.detailed_data:
            filename = "%s_detailed_%s" % (self.output.name,
                                           date.replace("/", ""))
            with open(filename, "w") as f:
                detailed_csv = csv.writer(f, delimiter="\t")
                for keyword in self.detailed_data[date]:
                    current = self.detailed_data[date][keyword]
                    # most_common() orders by descending count; the previous
                    # ascending sort + [:20] returned the *least* frequent.
                    top_pages = current["pages"].most_common(20)
                    top_users = current["users"].most_common(20)
                    tmp = [keyword, current["total"], top_pages,
                           len(current["pages"]), top_users,
                           len(current["users"])]
                    detailed_csv.writerow(tmp)

    def process_page(self, _):
        """Count the finished page and reset per-page state."""
        self.count += 1
        if not self.count % 1000:
            logging.info(' ### Processed %d pages', self.count)
        # One-element tuple: ("text") without the comma is just the string.
        self.delattr(("text",))
        self._skip = False

    def process_redirect(self, _):
        """Flag the current page as a redirect so it is skipped."""
        self._skip = True

    def process_title(self, elem):
        """Reset per-page attributes and derive the namespace from the title.

        The text before the first ``:`` is used as namespace when it is one
        of ``self.namespaces``; otherwise the page is filed under "Normal".
        """
        self.delattr(("_counter", "_type", "_title", "_skip", "_date",
                      "text", "_username", "_ip"))
        if self._skip_revision:
            return
        self._skip = False
        print(elem.text)
        self._title = smart_str(elem.text)
        a_title = self._title.split(':')
        if len(a_title) > 1 and a_title[0] in self.namespaces:
            self._type = a_title[0]
        else:
            self._type = "Normal"

    def process_timestamp(self, elem):
        """Store the revision date, or flag the revision as out of range."""
        if self._skip_revision:
            return
        revision_time = mwlib.ts2dt(elem.text)
        if ((self.detailed_end and revision_time > self.detailed_end) or
                (self.detailed_start and
                 revision_time < self.detailed_start)):
            self._skip_revision = True
        else:
            self._date = revision_time

    def process_contributor(self, contributor):
        """Resolve the revision's sender; skip revisions without one."""
        if self._skip_revision:
            return

        if contributor is None:
            self._skip_revision = True
        self._sender = self._username or self._ip
        self.delattr(("_username", "_ip"))
        if not self._sender:
            self._skip_revision = True

    def process_revision(self, _):
        """Clear the skip flag and, unless skipped, per-revision attributes."""
        skip = self._skip_revision
        self._skip_revision = False
        if skip:
            return
        self.delattr(("_username", "_ip", "_date"))

    def process_username(self, elem):
        """Remember the contributor's user name."""
        if self._skip_revision:
            return
        self._username = elem.text

    def process_ip(self, elem):
        """Remember the contributor's IP address."""
        if self._skip_revision:
            return
        self._ip = elem.text
def main():
    """Run PyWC over one revision of every page listed in a CSV file.

    Command line: ``[options] dic input_file output_file``.

    * ``dic`` is handed to ``PyWC.set_dic``.
    * ``input_file`` is read as CSV; column 0 is used as the page title and
      columns 1-6 are copied verbatim into the output fields
      ``total_edits``, ``unique_editors``, ``traumatic``, ``non_traumatic``,
      ``natural`` and ``human``.
    * ``output_file`` receives one CSV row per page for which
      ``find_revision_id`` returned a revision id; pages for which it
      returned ``None`` are skipped.

    For each category the output holds ``perc(count, total) * 100`` under
    the category name and ``calc_arcsin`` of that score under
    ``<name>_arcsin``.

    Both files are opened in a ``with`` block so they are closed (and the
    output flushed) even if processing a row raises.
    """
    import optparse
    p = optparse.OptionParser(
        usage="usage: %prog [options] dic input_file output_file")
    p.add_option('-l', '--lang', action="store", dest="lang", default="en",
                 help="Wikipedia language")
    p.add_option('-n', '--edits', action="store", dest="edits", type=int,
                 default=500, help="Edit number to consider")
    opts, files = p.parse_args()
    if len(files) != 3:
        p.error("Wrong parameters")
    logging.basicConfig(stream=sys.stderr,
                        level=logging.DEBUG,
                        format='%(asctime)s %(levelname)s %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S')

    textcleaner = TextCleaner()
    pywc = PyWC()
    pywc.set_dic(files[0])

    categories = pywc.categories
    # Order the category names by numeric id when every id parses as an
    # int; otherwise fall back to the plain ordering of the (id, name) pairs.
    try:
        ordered = sorted((int(cat_id), name)
                         for cat_id, name in categories.items())
    except ValueError:
        ordered = sorted(categories.items())
    cat_names = [str(name) for _, name in ordered]

    # category name -> category id (the key used in pywc._results)
    reverse_categories = {}
    for cat_id, name in categories.items():
        reverse_categories[name] = cat_id

    arcsin_fields = ["%s_arcsin" % name for name in cat_names]

    fields = ["title", "total_edits", "unique_editors", "traumatic",
              "non_traumatic", "natural", "human", "len", "len_cleaned"] + \
             cat_names + arcsin_fields + \
             ["qmarks", "unique", "dic", "sixltr", "total"]

    rev_index = opts.edits - 1

    with open(files[1], "r") as in_file, open(files[2], "w") as out_file:
        csv_reader = csv.reader(in_file)
        csv_writer = csv.DictWriter(out_file, fields)
        csv_writer.writeheader()

        for line in csv_reader:
            title = line[0]
            revision_id = find_revision_id(title, rev_index, opts.lang,
                                           startid=None)
            if revision_id is None:
                continue
            rev_text = get_revision(revision_id, opts.lang)

            cleaned_rev = textcleaner.clean_all(rev_text)
            cleaned_rev = textcleaner.clean_wiki_syntax(cleaned_rev, True)

            pywc.parse_col(cleaned_rev)

            result = {
                "title": title,
                "total_edits": line[1],
                "unique_editors": line[2],
                "traumatic": line[3],
                "non_traumatic": line[4],
                "natural": line[5],
                "human": line[6],
                "len": len(rev_text.split()),
                "len_cleaned": len(cleaned_rev.split()),
                "qmarks": pywc._qmarks,
                "unique": len(pywc._unique),
                "dic": pywc._dic,
                "sixltr": pywc._sixltr,
                "total": pywc._total,
            }

            for name, cat_id in reverse_categories.items():
                score = perc(pywc._results[cat_id], pywc._total) * 100
                result[name] = score  # percentage results
                result["%s_arcsin" % name] = calc_arcsin(score)  # arcsin

            csv_writer.writerow(result)
Beispiel #4
0
class PyWCProcessor(HistoryRevisionsPageProcessor):
    """Accumulate PyWC counters over page revisions, grouped by day.

    For every accepted revision the text is diffed against the previous
    revision, the diff is fed to ``PyWC.parse_col`` and the resulting
    counters are summed into ``self.data[namespace][YYYY/MM/DD]``.  When
    ``self.pywc.detailed`` is set, per-keyword totals are additionally
    collected for revisions whose namespace equals ``detailed_ns``.
    """
    pywc = None
    namespaces = None
    data = None
    dic = None
    # Optional datetime bounds; revisions outside them are skipped.
    detailed_start = None
    detailed_end = None
    # Namespace for which per-keyword details are collected.
    detailed_ns = None
    # revision related variables
    _username = None
    _ip = None
    _sender = None
    _skip_revision = False

    def __init__(self, **kwargs):
        """Create the processor; requires a ``dic`` keyword argument."""
        super(PyWCProcessor, self).__init__(**kwargs)
        self.dic = kwargs["dic"]
        self.pywc = PyWC(self.dic, self.output)
        self.pywc.tuning = True
        self.data = {}
        self.detailed_data = {}

    def save(self):
        """Analyse the current revision and fold its counters into the totals.

        Does nothing when the revision is flagged to be skipped.  A revision
        with 1000 words or more that is also more than twice as long as the
        previous one is logged as a revert and not analysed.  A diff with
        more than 1000 words, or with fewer than a tenth of its words
        unique, is logged as vandalism and not counted.
        """
        if self._skip_revision:
            return
        if self._text is None:  # difflib doesn't like NoneType
            self._text = ""
        if self.clean:
            self._text = self.textcleaner.clean_all(self._text)
        text_words = len(self.rwords.findall(self._text))
        prev_words = len(self.rwords.findall(self._prev_text))
        if text_words < 1000 or text_words <= 2 * prev_words:
            # presumably element 0 of the diff result is the inserted text
            # (its word count is called "inserted_words" below) -- verify
            # against _diff_text
            diff = _diff_text(self._prev_text,
                              self._text,
                              timeout=self.diff_timeout)[0]
            self.pywc.parse_col(diff)

            unique = len(self.pywc._unique)
            inserted_words = self.pywc._total
            if inserted_words > 1000 or unique < inserted_words / 10:
                logging.warning("Vandalism detected! (%s)", self._date)
                # still remember the text as the base for the next diff
                self._prev_text = self._text
                return

            if self._type not in self.data:
                self.data[self._type] = {}
            current = self.data[self._type]
            date_str = self._date.strftime("%Y/%m/%d")
            tmp = {"date": date_str,
                   "qmarks": self.pywc._qmarks,
                   "unique": unique,
                   "dic": self.pywc._dic,
                   "sixltr": self.pywc._sixltr,
                   "total": self.pywc._total}
            for x in self.pywc.categories:
                tmp[x] = self.pywc._results[x]

            if date_str not in current:
                current[date_str] = tmp
                current[date_str]["edits"] = 1
            else:
                # Sum every counter of this revision into the day's totals.
                for elem in tmp:
                    if elem != "date":
                        current[date_str][elem] += tmp[elem]
                current[date_str]["edits"] += 1

            if self.pywc.detailed and self._type == self.detailed_ns:
                if date_str not in self.detailed_data:
                    self.detailed_data[date_str] = defaultdict(dict)
                day_details = self.detailed_data[date_str]
                for keyword in self.pywc._detailed_data:
                    occ = self.pywc._detailed_data[keyword]
                    entry = day_details[keyword]
                    if not entry:
                        # first occurrence of this keyword on this day
                        entry["total"] = 0
                        entry["pages"] = Counter()
                        entry["users"] = Counter()
                    entry["total"] += occ
                    entry["pages"][self._title] += occ
                    entry["users"][self._sender] += occ
        else:
            logging.warning("Revert detected: skipping... (%s)", self._date)
        self._prev_text = self._text

    def flush(self):
        """Write the accumulated statistics.

        One row per (namespace, day) goes to ``self.pywc.csv_writer``.  For
        every day with detailed data a tab-separated file named
        ``<output name>_detailed_<YYYYMMDD>`` is written with, per keyword:
        total occurrences, the 20 pages with most occurrences, the number of
        pages, the 20 users with most occurrences and the number of users.
        """
        for ns in self.data:
            for date in sorted(self.data[ns]):
                tmp = {"ns": ns, "date": date}
                tmp.update(self.data[ns][date])
                self.pywc.csv_writer.writerow(tmp)
        for date in self.detailed_data:
            filename = "%s_detailed_%s" % (self.output.name,
                                           date.replace("/", ""))
            with open(filename, "w") as f:
                detailed_csv = csv.writer(f, delimiter="\t")
                for keyword in self.detailed_data[date]:
                    current = self.detailed_data[date][keyword]
                    # most_common() orders by descending count; the previous
                    # ascending sort + [:20] returned the *least* frequent.
                    top_pages = current["pages"].most_common(20)
                    top_users = current["users"].most_common(20)
                    tmp = [keyword, current["total"], top_pages,
                           len(current["pages"]), top_users,
                           len(current["users"])]
                    detailed_csv.writerow(tmp)

    def process_page(self, _):
        """Count the finished page and reset per-page state."""
        self.count += 1
        if not self.count % 1000:
            logging.info(' ### Processed %d pages', self.count)
        # One-element tuple: ("text") without the comma is just the string.
        self.delattr(("text",))
        self._skip = False

    def process_redirect(self, _):
        """Flag the current page as a redirect so it is skipped."""
        self._skip = True

    def process_title(self, elem):
        """Reset per-page attributes and derive the namespace from the title.

        The text before the first ``:`` is used as namespace when it is one
        of ``self.namespaces``; otherwise the page is filed under "Normal".
        """
        self.delattr(("_counter", "_type", "_title", "_skip", "_date",
                      "text", "_username", "_ip"))
        if self._skip_revision:
            return
        self._skip = False
        print(elem.text)
        self._title = smart_str(elem.text)
        a_title = self._title.split(':')
        if len(a_title) > 1 and a_title[0] in self.namespaces:
            self._type = a_title[0]
        else:
            self._type = "Normal"

    def process_timestamp(self, elem):
        """Store the revision date, or flag the revision as out of range."""
        if self._skip_revision:
            return
        revision_time = mwlib.ts2dt(elem.text)
        if ((self.detailed_end and revision_time > self.detailed_end) or
                (self.detailed_start and
                 revision_time < self.detailed_start)):
            self._skip_revision = True
        else:
            self._date = revision_time

    def process_contributor(self, contributor):
        """Resolve the revision's sender; skip revisions without one."""
        if self._skip_revision:
            return

        if contributor is None:
            self._skip_revision = True
        self._sender = self._username or self._ip
        self.delattr(("_username", "_ip"))
        if not self._sender:
            self._skip_revision = True

    def process_revision(self, _):
        """Clear the skip flag and, unless skipped, per-revision attributes."""
        skip = self._skip_revision
        self._skip_revision = False
        if skip:
            return
        self.delattr(("_username", "_ip", "_date"))

    def process_username(self, elem):
        """Remember the contributor's user name."""
        if self._skip_revision:
            return
        self._username = elem.text

    def process_ip(self, elem):
        """Remember the contributor's IP address."""
        if self._skip_revision:
            return
        self._ip = elem.text
class PyWCProcessor(HistoryRevisionsPageProcessor):
    """Accumulate PyWC counters over page revisions, grouped by day.

    For every revision the text is diffed against the previous revision,
    the diff is fed to ``PyWC.parse_col`` and the resulting counters are
    summed into ``self.data[namespace][YYYY/MM/DD]``.
    """
    pywc = None
    namespaces = None
    data = None
    dic = None

    def __init__(self, **kwargs):
        """Create the processor; requires a ``dic`` keyword argument."""
        super(PyWCProcessor, self).__init__(**kwargs)
        self.dic = kwargs["dic"]
        self.pywc = PyWC(self.dic, self.output)
        self.pywc.tuning = True
        self.data = {}

    def save(self):
        """Analyse the current revision and fold its counters into the totals.

        A revision with 1000 words or more that is also more than twice as
        long as the previous one is logged as a revert and not analysed.
        """
        if self._text is None:  # difflib doesn't like NoneType
            self._text = ""
        if self.clean:
            self._text = self.textcleaner.clean_all(self._text)
        text_words = len(self.rwords.findall(self._text))
        prev_words = len(self.rwords.findall(self._prev_text))
        if text_words < 1000 or text_words <= 2 * prev_words:
            # presumably element 0 of the diff result is the inserted text
            # -- verify against _diff_text
            diff = _diff_text(self._prev_text,
                              self._text,
                              timeout=self.diff_timeout)[0]
            self.pywc.parse_col(diff)
            # `in` replaces dict.has_key(), which no longer exists in
            # Python 3 and behaves identically on Python 2.
            if self._type not in self.data:
                self.data[self._type] = {}
            current = self.data[self._type]
            date = mwlib.ts2dt(self._date)
            date_str = date.strftime("%Y/%m/%d")
            tmp = {"date": date_str,
                   "qmarks": self.pywc._qmarks,
                   "unique": len(self.pywc._unique),
                   "dic": self.pywc._dic,
                   "sixltr": self.pywc._sixltr,
                   "total": self.pywc._total}
            for x in self.pywc.categories:
                tmp[x] = self.pywc._results[x]

            if date_str not in current:
                current[date_str] = tmp
                current[date_str]["edits"] = 1
            else:
                # Sum every counter of this revision into the day's totals.
                for elem in tmp:
                    if elem != "date":
                        current[date_str][elem] += tmp[elem]
                current[date_str]["edits"] += 1
        else:
            logging.warning("Revert detected: skipping... (%s)", self._date)
        self._prev_text = self._text

    def flush(self):
        """Write one row per (namespace, day) to ``self.pywc.csv_writer``."""
        for ns in self.data:
            for date in sorted(self.data[ns]):
                tmp = {"ns": ns, "date": date}
                tmp.update(self.data[ns][date])
                self.pywc.csv_writer.writerow(tmp)

    def process_page(self, _):
        """Count the finished page and reset per-page state."""
        self.count += 1
        if not self.count % 1000:
            logging.info(' ### Processed %d pages', self.count)
        # One-element tuple: ("text") without the comma is just the string.
        self.delattr(("text",))
        self._skip = False

    def process_redirect(self, _):
        """Flag the current page as a redirect so it is skipped."""
        self._skip = True

    def process_title(self, elem):
        """Reset per-page attributes and derive the namespace from the title.

        The text before the first ``:`` is used as namespace when it is one
        of ``self.namespaces``; otherwise the page is filed under "Normal".
        Note that ``elem.text`` is overwritten with its ``smart_str`` form.
        """
        self.delattr(("_counter", "_type", "_title", "_skip", "_date", "text"))
        self._skip = False
        print(elem.text)
        elem.text = smart_str(elem.text)
        a_title = elem.text.split(':')
        if len(a_title) > 1 and a_title[0] in self.namespaces:
            self._type = a_title[0]
        else:
            self._type = "Normal"