Example 1
def main():
    import optparse
    # csv, logging and sys, as well as the project helpers (TextCleaner,
    # PyWC, find_revision_id, get_revision, perc, calc_arcsin), are assumed
    # to be imported at module level; they are not shown in this excerpt.
    p = optparse.OptionParser(
        usage="usage: %prog [options] dic input_file output_file")
    p.add_option('-l',
                 '--lang',
                 action="store",
                 dest="lang",
                 default="en",
                 help="Wikipedia language")
    p.add_option('-n',
                 '--edits',
                 action="store",
                 dest="edits",
                 type=int,
                 default=500,
                 help="Edit number to consider")
    opts, files = p.parse_args()
    if len(files) != 3:
        p.error("Wrong parameters")
    logging.basicConfig(stream=sys.stderr,
                        level=logging.DEBUG,
                        format='%(asctime)s %(levelname)s %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S')
    csv_reader = csv.reader(open(files[1], "r"))

    textcleaner = TextCleaner()
    pywc = PyWC()
    pywc.set_dic(files[0])

    # Category ids are normally numeric: sort them numerically, falling
    # back to a plain lexicographic sort if any id is not an integer.
    try:
        cat_names = [
            str(x[1])
            for x in sorted([(int(a), b) for a, b in pywc.categories.items()])
        ]
    except ValueError:
        cat_names = [str(x[1]) for x in sorted(pywc.categories.items())]

    # Map each category name back to its id, used below to look up the
    # per-category counters in pywc._results.
    reverse_categories = {}
    for key, value in pywc.categories.iteritems():
        reverse_categories[value] = key

    arcsin_fields = ["%s_arcsin" % key for key in cat_names]

    fields = ["title", "total_edits", "unique_editors", "traumatic",
              "non_traumatic", "natural", "human", "len", "len_cleaned"] + \
             cat_names + arcsin_fields + \
             ["qmarks", "unique", "dic", "sixltr", "total"]

    csv_writer = csv.DictWriter(open(files[2], "w"), fields)

    csv_writer.writeheader()

    for line in csv_reader:
        title, rev = line[0], opts.edits - 1
        revision_id = find_revision_id(title, rev, opts.lang, startid=None)
        if revision_id is None:
            continue
        rev = get_revision(revision_id, opts.lang)

        cleaned_rev = textcleaner.clean_all(rev)
        cleaned_rev = textcleaner.clean_wiki_syntax(cleaned_rev, True)

        pywc.parse_col(cleaned_rev)

        result = {
            "title": title,
            "total_edits": line[1],
            "unique_editors": line[2],
            "traumatic": line[3],
            "non_traumatic": line[4],
            "natural": line[5],
            "human": line[6],
            "len": len(rev.split()),
            "len_cleaned": len(cleaned_rev.split()),
            "qmarks": pywc._qmarks,
            "unique": len(pywc._unique),
            "dic": pywc._dic,
            "sixltr": pywc._sixltr,
            "total": pywc._total,
        }

        for key, val in reverse_categories.iteritems():
            score = perc(pywc._results[val], pywc._total) * 100
            arcsin = calc_arcsin(score)
            result[key] = score  # percentage results
            result["%s_arcsin" % key] = arcsin  # arcsin results

        csv_writer.writerow(result)
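
The excerpt ends with main(); the full script presumably finishes with the usual entry-point guard. A minimal sketch, followed by a hypothetical invocation (script and file names are placeholders, not taken from the source):

if __name__ == "__main__":
    main()

# Hypothetical invocation: the dictionary file, input CSV and output CSV are
# the three positional arguments expected by the usage string above.
#   python pywc_revisions.py --lang en --edits 500 liwc.dic pages.csv scores.csv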
Example 2
class HistoryRevisionsPageProcessor(HistoryPageProcessor):
    """
    For every revision of the desired pages, compute the text added with
    respect to the previous revision and append it as a row of a
    tab-separated CSV file (timestamp, lang, title, type, text).
    """
    output = None
    queue = None
    _skip = None
    _prev_text = ""
    _text = None
    get_talks = True
    get_articles = True
    diff_timeout = 0.5
    clean = None
    textcleaner = None

    def __init__(self, **kwargs):
        super(HistoryRevisionsPageProcessor, self).__init__(**kwargs)
        self.textcleaner = TextCleaner(kwargs["userns"])
        self.queue = []
        f = open(self.output, 'w')
        self._keys = ["timestamp", "lang", "title", "type", "text"]
        self.csv_writer = csv.DictWriter(f, fieldnames=self._keys,
                                         delimiter='\t', quotechar='"',
                                         quoting=csv.QUOTE_ALL)

    def flush(self):
        """
        Flushes queue in the CSV output
        """
        pages = [{'title': page['title'],
                  'lang': self.lang,
                  'timestamp': page['timestamp'],
                  'text': page['text'],
                  'type': page['type']} for page in self.queue]
        self.csv_writer.writerows(pages)
        self.queue = []

    def save(self):
        """
        Saves data to the queue.
        The queue is stored using self.flush()
        """
        if self._text is None: # difflib doesn't like NoneType
            self._text = ""
        if self.clean:
            self._text = self.textcleaner.clean_all(self._text)
        page = {'title': smart_str(self._title),
                'lang': self.lang,
                'timestamp': self._date,
                'text': smart_str(_diff_text(self._prev_text,
                                             self._text,
                                             timeout=self.diff_timeout)[0]),
                'type': self._type}
        self.queue.append(page)
        self._prev_text = self._text

    def process_title(self, elem):
        self.delattr(("_counter", "_type", "_title", "_skip", "_date", "text"))
        self._skip = False
        a_title = elem.text.split(':')

        if len(a_title) == 1 and self.get_articles:
            self._type = 'normal'
            self._title = a_title[0]
        elif len(a_title) == 2 and a_title[0] == self.talkns and \
             self.get_talks:
            self._type = 'talk'
            self._title = a_title[1]
        else:
            self._skip = True

        if not self._skip:
            self._desired = self.is_desired(self._title)
            if not self._desired:
                self._skip = True
            else:
                logging.info('Start processing desired page %s (%s)',
                             self._title, self._type)

    def process_timestamp(self, elem):
        if self._skip:
            return
        self._date = elem.text

    def process_text(self, elem):
        if self._skip:
            return
        self._text = elem.text
        self.save()

    def process_page(self, _):
        self.count += 1
        if not self.count % 1000:
            logging.info(' ### Processed %d pages', self.count)
        self.delattr(("text"))
        if not self._skip:
            with Timr('Flushing %s' % self._title):
                self.flush()
        self._skip = False

    def process_redirect(self, _):
        # This class only considers pages that are in the desired file,
        # these pages must not be redirects
        self._skip = True
        raise ValueError("The page %s is a redirect. " % self._title + \
                         "Pages in the desired list must not be redirects.")
Example 3
class HistoryRevisionsPageProcessor(HistoryPageProcessor):
    """
    For every revision of the desired pages, append the text added with
    respect to the previous revision to a tab-separated CSV file, skipping
    revisions that look like reverts (at least 1000 words and more than
    double the length of the previous revision).
    """
    output = None
    queue = None
    _skip = None
    _prev_text = ""
    _text = None
    get_talks = True
    get_articles = True
    diff_timeout = 0.5
    clean = None
    textcleaner = None
    rwords = re.compile(r"[\w']+")

    def __init__(self, **kwargs):
        super(HistoryRevisionsPageProcessor, self).__init__(**kwargs)
        self.textcleaner = TextCleaner(kwargs["userns"])
        self.queue = []
        fields = ["timestamp", "lang", "title", "type", "text"]
        self.csv_writer = csv.DictWriter(self.output,
                                         fieldnames=fields,
                                         delimiter='\t',
                                         quotechar='"',
                                         quoting=csv.QUOTE_ALL)

    def flush(self):
        """
        Flushes queue in the CSV output
        """
        pages = [{
            'title': page['title'],
            'lang': self.lang,
            'timestamp': page['timestamp'],
            'text': page['text'],
            'type': page['type']
        } for page in self.queue]
        self.csv_writer.writerows(pages)
        self.queue = []

    def save(self):
        """
        Saves data to the queue.
        The queue is stored using self.flush()
        """
        if self._text is None:  # difflib doesn't like NoneType
            self._text = ""
        if self.clean:
            self._text = self.textcleaner.clean_all(self._text)

        # Heuristic revert detection: a revision that is at least 1000 words
        # long and more than doubles the previous one is treated as a
        # revert/mass restore and skipped; everything else is diffed.
        text_words = len(self.rwords.findall(self._text))
        prev_words = len(self.rwords.findall(self._prev_text))
        if text_words < 1000 or text_words <= 2 * prev_words:
            diff = _diff_text(self._prev_text,
                              self._text,
                              timeout=self.diff_timeout)[0]
            page = {
                'title': smart_str(self._title),
                'lang': self.lang,
                'timestamp': self._date,
                'text': smart_str(diff),
                'type': self._type
            }
            self.queue.append(page)
        else:
            logging.warn("Revert detected: skipping... (%s)", self._date)
        self._prev_text = self._text

    def process_title(self, elem):
        self.delattr(("_counter", "_type", "_title", "_skip", "_date", "text"))
        self._skip = False
        a_title = elem.text.split(':')

        if len(a_title) == 1 and self.get_articles:
            self._type = 'normal'
            self._title = a_title[0]
        elif len(a_title) == 2 and a_title[0] == self.talkns and \
             self.get_talks:
            self._type = 'talk'
            self._title = a_title[1]
        else:
            self._skip = True

        if not self._skip:
            self._desired = self.is_desired(self._title)
            if not self._desired:
                self._skip = True
            else:
                logging.info('Start processing desired page %s (%s)',
                             self._title, self._type)

    def process_timestamp(self, elem):
        if self._skip:
            return
        self._date = elem.text

    def process_text(self, elem):
        if self._skip:
            return
        self._text = elem.text
        self.save()

    def process_page(self, _):
        self.count += 1
        if not self.count % 1000:
            logging.info(' ### Processed %d pages', self.count)
        self.delattr(("text"))
        if not self._skip:
            with Timr('Flushing %s' % self._title):
                self.flush()
        self._skip = False

    def process_redirect(self, _):
        # This class only considers pages that are in the desired file,
        # these pages must not be redirects
        self._skip = True
        raise ValueError("The page %s is a redirect. " % self._title + \
                         "Pages in the desired list must not be redirects.")
Example 4
class HistoryRevisionsPageProcessor(HistoryPageProcessor):
    """
    Variant restricted to talk and user-talk pages: for every revision it
    records the contributor (username or IP) together with the text added
    with respect to the previous revision, in a tab-separated CSV file.
    """
    output = None
    queue = None
    _skip = None
    _prev_text = ""
    _text = None
    _username = None
    _ip = None
    _sender = None
    _skip_revision = False
    diff_timeout = 0.5
    clean = None
    textcleaner = None
    rwords = re.compile(r"[\w']+")

    def __init__(self, **kwargs):
        super(HistoryRevisionsPageProcessor, self).__init__(**kwargs)
        self.textcleaner = TextCleaner(kwargs["userns"])
        self.queue = []
        fields = ["timestamp", "lang", "title", "type", "user", "text"]
        self.csv_writer = csv.DictWriter(self.output, fieldnames=fields,
                                         delimiter='\t', quotechar='"',
                                         quoting=csv.QUOTE_ALL)

    def flush(self):
        """
        Flushes queue in the CSV output
        """
        pages = [{'title': page['title'],
                  'lang': self.lang,
                  'timestamp': page['timestamp'],
                  'text': page['text'],
                  'user': page['user'],
                  'type': page['type']} for page in self.queue]
        self.csv_writer.writerows(pages)
        self.queue = []

    def save(self):
        """
        Saves data to the queue.
        The queue is stored using self.flush()
        """
        if self._text is None: # difflib doesn't like NoneType
            self._text = ""
        if self.clean:
            self._text = self.textcleaner.clean_all(self._text)

        text_words = len(self.rwords.findall(self._text))
        prev_words = len(self.rwords.findall(self._prev_text))
        if text_words < 1000 or text_words <= 2 * prev_words:
            diff = _diff_text(self._prev_text,
                              self._text,
                              timeout=self.diff_timeout)[0]
            page = {'title': smart_str(self._title),
                    'lang': self.lang,
                    'timestamp': self._date,
                    'text': smart_str(diff),
                    'user': smart_str(self._sender),
                    'type': self._type}
            self.queue.append(page)
        else:
            logging.warn("Revert detected: skipping... (%s)", self._date)
        self._prev_text = self._text

    def process_username(self, elem):
        if self._skip_revision:
            return
        self._username = elem.text

    def process_ip(self, elem):
        if self._skip_revision:
            return
        self._ip = elem.text

    def process_contributor(self, contributor):
        if self._skip_revision:
            return
        if contributor is None:
            self._skip_revision = True
        self._sender = self._username or self._ip
        self.delattr(("_username", "_ip"))
        if not self._sender:
            # contributor deleted or hidden: count it and skip this revision
            self.counter_deleted += 1
            self._skip_revision = True

    def process_title(self, elem):
        self.delattr(("_counter", "_type", "_title", "_skip", "_date", "text",
                      "_username", "_ip", "_sender"))
        self._skip = False
        a_title = elem.text.split(':')

        if len(a_title) == 2 and a_title[0] == self.talkns:
            self._type = 'talk'
            self._title = a_title[1]
        elif len(a_title) == 2 and a_title[0] == self.usertalkns:
            self._type = 'user talk'
            self._title = a_title[1]
        elif len(a_title) == 1:
            self._type = 'normal'
            self._title = a_title[0]
        if not self._title:
            self._skip = True

        if not self._skip:
            if not ("talk" in self._type):
                self._skip = True
            else:
                logging.info('Start processing page %s (%s)',
                             self._title, self._type)

    def process_timestamp(self, elem):
        if self._skip:
            return
        self._date = elem.text

    def process_text(self, elem):
        if self._skip:
            return
        self._text = elem.text
        self.save()

    def process_page(self, _):
        self.count += 1
        if not self.count % 1000:
            logging.info(' ### Processed %d pages', self.count)
            if not self._skip:
                with Timr('Flushing %s' % self._title):
                    self.flush()
        self.delattr(("text"))
        self._skip = False

    def process_redirect(self, _):
        self._skip = True
Example 5
def main():
    import optparse
    p = optparse.OptionParser(
        usage="usage: %prog [options] dic input_file output_file")
    p.add_option('-l', '--lang', action="store", dest="lang", default="en",
                 help="Wikipedia language")
    p.add_option('-n', '--edits', action="store", dest="edits", type=int,
                 default=500, help="Edit number to consider")
    opts, files = p.parse_args()
    if len(files) != 3:
        p.error("Wrong parameters")
    logging.basicConfig(stream=sys.stderr,
                        level=logging.DEBUG,
                        format='%(asctime)s %(levelname)s %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S')
    csv_reader = csv.reader(open(files[1], "r"))

    textcleaner = TextCleaner()
    pywc = PyWC()
    pywc.set_dic(files[0])

    try:
        cat_names = [str(x[1]) for x in sorted([(int(a), b) for a, b in
                     pywc.categories.items()])]
    except ValueError:
        cat_names = [str(x[1]) for x in sorted(pywc.categories.items())]

    reverse_categories = {}
    for key, value in pywc.categories.iteritems():
        reverse_categories[value] = key

    arcsin_fields = ["%s_arcsin" % key for key in cat_names]

    fields = ["title", "total_edits", "unique_editors", "traumatic",
              "non_traumatic", "natural", "human", "len", "len_cleaned"] + \
             cat_names + arcsin_fields + \
             ["qmarks", "unique", "dic", "sixltr", "total"]

    csv_writer = csv.DictWriter(open(files[2], "w"), fields)

    csv_writer.writeheader()

    for line in csv_reader:
        title, rev = line[0], opts.edits - 1
        revision_id = find_revision_id(title, rev, opts.lang, startid=None)
        if revision_id is None:
            continue
        rev = get_revision(revision_id, opts.lang)

        cleaned_rev = textcleaner.clean_all(rev)
        cleaned_rev = textcleaner.clean_wiki_syntax(cleaned_rev, True)

        pywc.parse_col(cleaned_rev)

        result = {
            "title": title,
            "total_edits": line[1],
            "unique_editors": line[2],
            "traumatic": line[3],
            "non_traumatic": line[4],
            "natural": line[5],
            "human": line[6],
            "len": len(rev.split()),
            "len_cleaned": len(cleaned_rev.split()),
            "qmarks": pywc._qmarks,
            "unique": len(pywc._unique),
            "dic": pywc._dic,
            "sixltr": pywc._sixltr,
            "total": pywc._total,
        }

        for key, val in reverse_categories.iteritems():
            score = perc(pywc._results[val], pywc._total) * 100
            arcsin = calc_arcsin(score)
            result[key] = score  # percentage results
            result["%s_arcsin" % key] = arcsin  # arcsin results

        csv_writer.writerow(result)
Example 6
class HistoryRevisionsPageProcessor(HistoryPageProcessor):
    output = None
    queue = None
    _skip = None
    _prev_text = ""
    _text = None
    _username = None
    _ip = None
    _sender = None
    _skip_revision = False
    diff_timeout = 0.5
    clean = None
    textcleaner = None
    rwords = re.compile(r"[\w']+")

    def __init__(self, **kwargs):
        super(HistoryRevisionsPageProcessor, self).__init__(**kwargs)
        self.textcleaner = TextCleaner(kwargs["userns"])
        self.queue = []
        fields = ["timestamp", "lang", "title", "type", "user", "text"]
        self.csv_writer = csv.DictWriter(self.output,
                                         fieldnames=fields,
                                         delimiter='\t',
                                         quotechar='"',
                                         quoting=csv.QUOTE_ALL)

    def flush(self):
        """
        Flushes queue in the CSV output
        """
        pages = [{
            'title': page['title'],
            'lang': self.lang,
            'timestamp': page['timestamp'],
            'text': page['text'],
            'user': page['user'],
            'type': page['type']
        } for page in self.queue]
        self.csv_writer.writerows(pages)
        self.queue = []

    def save(self):
        """
        Saves data to the queue.
        The queue is stored using self.flush()
        """
        if self._text is None:  # difflib doesn't like NoneType
            self._text = ""
        if self.clean:
            self._text = self.textcleaner.clean_all(self._text)

        text_words = len(self.rwords.findall(self._text))
        prev_words = len(self.rwords.findall(self._prev_text))
        if text_words < 1000 or text_words <= 2 * prev_words:
            diff = _diff_text(self._prev_text,
                              self._text,
                              timeout=self.diff_timeout)[0]
            page = {
                'title': smart_str(self._title),
                'lang': self.lang,
                'timestamp': self._date,
                'text': smart_str(diff),
                'user': smart_str(self._sender),
                'type': self._type
            }
            self.queue.append(page)
        else:
            logging.warn("Revert detected: skipping... (%s)", self._date)
        self._prev_text = self._text

    def process_username(self, elem):
        if self._skip_revision:
            return
        self._username = elem.text

    def process_ip(self, elem):
        if self._skip_revision:
            return
        self._ip = elem.text

    def process_contributor(self, contributor):
        if self._skip_revision:
            return
        if contributor is None:
            self._skip_revision = True
        self._sender = self._username or self._ip
        self.delattr(("_username", "_ip"))
        if not self._sender:
            self.counter_deleted += 1
            self._skip_revision = True

    def process_title(self, elem):
        self.delattr(("_counter", "_type", "_title", "_skip", "_date", "text",
                      "_username", "_ip", "_sender"))
        self._skip = False
        a_title = elem.text.split(':')

        if len(a_title) == 2 and a_title[0] == self.talkns:
            self._type = 'talk'
            self._title = a_title[1]
        elif len(a_title) == 2 and a_title[0] == self.usertalkns:
            self._type = 'user talk'
            self._title = a_title[1]
        elif len(a_title) == 1:
            self._type = 'normal'
            self._title = a_title[0]
        if not self._title:
            self._skip = True

        if not self._skip:
            if not ("talk" in self._type):
                self._skip = True
            else:
                logging.info('Start processing page %s (%s)', self._title,
                             self._type)

    def process_timestamp(self, elem):
        if self._skip:
            return
        self._date = elem.text

    def process_text(self, elem):
        if self._skip:
            return
        self._text = elem.text
        self.save()

    def process_page(self, _):
        self.count += 1
        if not self.count % 1000:
            logging.info(' ### Processed %d pages', self.count)
            if not self._skip:
                with Timr('Flushing %s' % self._title):
                    self.flush()
        self.delattr(("text"))
        self._skip = False

    def process_redirect(self, _):
        self._skip = True