import csv
import logging
import re
import sys
import unittest
from collections import Counter

# TextCleaner and the other project-internal helpers used below
# (HistoryPageProcessor, _diff_text, smart_str, Timr, perc, calc_arcsin,
# find_revision_id, get_revision, get_revisions) are assumed to be importable
# from the surrounding project; the original snippets omit their imports.


class TestTextCleaner(unittest.TestCase):

    def setUp(self):
        self.textcleaner = TextCleaner()

    def test_clean(self):
        t = (";D :E born in the U.S.A.! Yeah. A. :-D",
             "I feel sick today :S",
             ":My favourite TV series: The Big Bang Theory",
             "F.B.K.")
        e = ("born in the! Yeah. ",
             "I feel sick today ",
             ":My favourite TV series: The Big Bang Theory",
             "")
        for i, s in enumerate(t):
            self.assertEqual(self.textcleaner.clean_text(s), e[i])

    def test_clean_HTML(self):
        t = ("<div><b>42</b> is the <a href='#'>answer</a></div>",
             "<span>Hello World</span>",
             "<!-- I mustn't read this --> Are comments being filtered?",
             "I don't &amp; like HTML entities &dioji; LO&ppp;L")
        e = ("42 is the answer",
             "Hello World",
             " Are comments being filtered?",
             "I don't  like HTML entities  LOL")
        for i, s in enumerate(t):
            self.assertEqual(self.textcleaner.clean_html_syntax(s), e[i])

    def test_clean_wiki(self):
        t = ("Less taxes for everyone! {{citation needed}}",
             "look here http://google.it/a/lol.html lol :D http://wiki.com",
             "drink a relaxing [Jack Daniel's]",
             "If you want some [Wikipedia:Help] look here",
             "| name =goofy, |city =New York",
             "[File:Case di Volano.jpg|thumb|250px|Volano vista da un dosso]",
             "vicino a [[Calliano (Trentino-Alto Adige)|Calliano]] c'e' un",
             "[[nap:test:Volano (TN)]]",
             "andare in S.Marco",
             "[[Pagina|link fatto male poiche' manca una parentesi quadra " \
             "e c'e' caratteri strani dentro? ;)]",
             "[http://www.nps.gov/ Oklahoma City National Memorial] National")

        e = ("Less taxes for everyone! ",
             "look here  lol :D ",
             "drink a relaxing Jack Daniel's",
             "If you want some Help look here",
             "",
             "Volano vista da un dosso",
             "vicino a Calliano c'e' un",
             "Volano (TN)",
             "andare in S.Marco",
             "link fatto male poiche' manca una parentesi quadra " \
             "e c'e' caratteri strani dentro? ;)",
             " Oklahoma City National Memorial National")

        for i, s in enumerate(t):
            self.assertEqual(self.textcleaner.clean_wiki_syntax(s), e[i])
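
# Minimal sketch (not in the original) for running the test case above in
# isolation; unittest discovers the methods by their "test_" prefix:
#
#   if __name__ == "__main__":
#       unittest.main()
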
def main():
    import optparse
    p = optparse.OptionParser(
        usage="usage: %prog [options] page_title output_file")
    p.add_option('-l', '--lang', action="store", dest="lang", default="en",
                 help="Wikipedia language")
    p.add_option('-c', '--clean', action="store_true", dest="clean",
                 help="Clean wiki syntax / HTML")
    opts, files = p.parse_args()
    if len(files) != 2:
        p.error("Wrong parameters")
    logging.basicConfig(stream=sys.stderr,
                        level=logging.DEBUG,
                        format='%(asctime)s %(levelname)s %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S')
    csv_writer = csv.writer(open(files[1], "w"),
                            delimiter="\t",
                            quotechar='"',
                            quoting=csv.QUOTE_ALL)
    textcleaner = None
    if opts.clean:
        textcleaner = TextCleaner()
    get_revisions(files[0], csv_writer, opts.lang, textcleaner)
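
# Hypothetical invocation (the script name is assumed): fetch the history of
# one page into a tab-separated file, cleaning wiki/HTML syntax on the way:
#
#   python get_revisions.py --lang it --clean "Volano (TN)" out.tsv
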
class PyWC:
    """
    PyWC is a Python class for word counting and text analysis.
    """
    # Global properties (of the whole source file)
    categories = None  # Dictionary's categories
    keywords = None  # Dictionary's keywords/regex
    delimiter = "\t"  # CSV delimiter
    quotechar = '"'  # CSV quotechar
    csv_out = sys.stdout  # CSV output
    queue = []  # Flushing queue
    max_char_limit = 100000  # Max chars per line
    ignorecols = []  # List of columns of the src file to ignore
    csv_writer = None  # Csv writer handler
    id_col = 0  # Number of the id column
    dic_regex = False  # Use dictionary made of regex
    flush_n = 100  # Number of pieces of text to store
    clean_wiki = None  # Clean wiki syntax
    clean_html = None  # Clean HTML
    percentage = False  # Output as percentage
    tuning = False  # Set tuning mode (no conditional dictionary but
    # a lot faster!)
    detailed = False  # detailed PYWC output per keyword

    rwords = re.compile(r"[\w']+")
    rqmarks = re.compile(r"\?")

    textcleaner = TextCleaner()

    cond_exp_regex = (re.compile(r"<([\w']+)>(\w+)(\/(\w+)?)?"),
                      re.compile(r"\(([\w\s]+)\)(\w+)(\/(\w+)?)?"))

    # Local properties (of every column of the source file)
    _id = None  # Line ID
    _results = None  # Dictionary where keys are cat ids and
    # values are counters
    _qmarks = None  # Number of question marks
    _unique = None  # Set of unique words, len() of the set is the number
    # of unique words
    _dic = None  # Number of words in dic
    _sixltr = None  # Number of words > 6 letters
    _total = None  # Number of total words per column
    _text = None  # Current text to analyze
    _next_word = None  # Next word that has to be analyzed
    _prev_cat = None  # Categories of the last word that has been analyzed
    # (useful for conditional exps)
    _counter = 0  # Generic counter of how many pieces of
    # text have been analyzed
    _keys = None
    _detailed_data = None  # data only for detailed output

    def __init__(self, **kwargs):
        self.__dict__ = kwargs

    def delattrs(self, attrs):
        """
        Frees memory deleting useless attributes of the object
        """
        for attr in attrs:
            try:
                delattr(self, attr)
            except AttributeError:
                pass

    def _gen_keyword(self, content):
        """
        Generator for self.keywords (dictionary made of regexps
        as keys and their categories as values)
        """
        for line in content[2].split("\n")[1:-1]:
            # Comments start with //
            if line and not line.startswith("//"):
                line = line.split("\t")
                # If not using a dictionary made of regexps,
                # fix the keyword for regexping:
                # "\b" is prepended to every keyword; if the keyword
                # doesn't end with "*", a trailing "\b" is appended too.
                # bad -> \bbad\b matches "bad" but not "badass"
                # bad* -> \bbad matches "bad" and "badass"
                if not self.dic_regex:
                    line[0] = "".join(["\\b", line[0]])
                    try:
                        if (line[0][-1] == "*"):
                            line[0] = line[0][:-1]
                        else:
                            line[0] = "".join([line[0], "\\b"])
                    except IndexError:
                        continue
                yield (re.compile(line[0], re.IGNORECASE), line[1:])
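
    # Illustrative sketch (not in the original): with dic_regex left False,
    # a dictionary line "bad\t31" becomes
    # (re.compile(r"\bbad\b", re.IGNORECASE), ["31"]) and matches only
    # "bad", while "bad*\t31" becomes r"\bbad" and matches both "bad" and
    # "badass".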

    def set_dic(self, dic):
        """
        Receives as input the dictionary filename.
        Reads the dictionary file and populates self.categories and
        self.keywords
        """
        with open(dic, 'r') as f:
            content = f.read()
        content = content.split("%")
        if len(content) != 3:
            raise ValueError("Invalid dic file")

        # Creates a dictionary where category ids are the keys
        # and category names are the values.
        # Splits content at first by new line, then by tab
        self.categories = dict((line.split("\t") \
                for line in content[1].split("\n")[1:-1] if line))

        # Creates a dictionary where the compiled regex is the key
        # and category ids are the values
        self.keywords = dict(x for x in self._gen_keyword(content))
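
    # Hedged example of the .dic layout set_dic() expects (inferred from the
    # "%" split above): a category block and a keyword block, each opened by
    # a line containing "%", with tab-separated fields:
    #
    #     %
    #     31  Posemo
    #     32  Negemo
    #     %
    #     happ*   31
    #     bad     32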

    def flush(self):
        """
        Writes everything which is in the queue in the csv output file
        """
        self.csv_writer.writerows(self.queue)
        self.queue = []

    def save(self):
        """
        Saves the current piece of text that has been analyzed to the queue
        """
        tmp = {
            "id": self._id,
            "qmarks": perc(self._qmarks, self._total, self.percentage),
            "unique": perc(len(self._unique), self._total, self.percentage),
            "dic": perc(self._dic, self._total, self.percentage),
            "sixltr": perc(self._sixltr, self._total, self.percentage),
            "total": self._total,
            "text": self._text
        }
        # Join of self.categories and self._results values
        for k, v in ((self.categories[x], \
                      perc(self._results[x], self._total, self.percentage)) \
                    for x in self.categories):
            tmp[k] = v
        self.queue.append(tmp)
        del tmp
        self._counter += 1
        if self._counter % self.flush_n == 0:
            logging.info("### Flushing: %d", self._counter)
            self.flush()

    def parse_word(self, word):
        """
        Parses a single word with the dictionary of regexps
        (self.keywords). For every regex that matches, it
        increments every category the word belongs to in self._results
        """
        if not self.tuning:
            cat = []
            for regex in self.keywords:
                if regex.search(word):
                    if self.detailed:
                        self._detailed_data[regex.pattern] += 1
                    for i in self.keywords[regex]:
                        res = self.cond_exp_regex[0].match(i)
                        if res:
                            if self._next_word == res.group(1):
                                cat.append(res.group(2))
                            elif res.group(4):
                                cat.append(res.group(4))
                            continue

                        res = self.cond_exp_regex[1].match(i)
                        if res:
                            if any(c in self._prev_cat
                                   for c in res.group(1).split(" ")):
                                cat.append(res.group(2))
                            elif res.group(4):
                                cat.append(res.group(4))
                            continue

                        # If dictionary contains trailing tabs,
                        # '' keys are saved. It skips them.
                        if i:
                            cat.append(i)

            for c in cat:
                try:
                    self._results[c] += 1
                except KeyError:
                    logging.warn("Invalid category id %s", c)
            if len(cat) > 0:  # Increment word in dictionary counter
                self._dic += 1
            self._prev_cat = cat

        if len(word) > 6:  # Increment word > 6 letters counter
            self._sixltr += 1
        self._total += 1
        self._unique.add(word)
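
    # Sketch of the conditional dictionary entries handled above (format
    # inferred from cond_exp_regex): a value like "<of>17/18" assigns
    # category 17 when the *next* word is "of" and 18 otherwise, while
    # "(31 32)40/41" assigns 40 when the *previous* word hit category 31
    # or 32, and 41 otherwise.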

    def parse_col(self, col):
        """
        Reads a single cell of the csv file. It splits it
        into words and gives them to self.parse_word
        """
        self.delattrs(
            ("_results", "_qmarks", "_unique", "_dic", "_sixltr", "_total",
             "_text", "_next_word", "_prev_cat", "_detailed_data"))
        self._text = col
        #logging.info("--------PRIMA-----------")
        #logging.info(self._text)
        #logging.info("-------------------")
        if self.clean_wiki or self.clean_html:
            self._text = self.textcleaner.clean_text(self._text)
        if self.clean_wiki:
            self._text = self.textcleaner.clean_wiki_syntax(self._text)
        if self.clean_html:
            self._text = self.textcleaner.clean_html_syntax(self._text)
        #logging.info("--------DOPO------------")
        #logging.info(self._text)
        #logging.info("-------------------")
        self._results = Counter()
        if self.detailed:
            self._detailed_data = Counter()
        self._qmarks = len(self.rqmarks.findall(self._text))
        self._unique = set()
        self._dic = 0
        self._sixltr = 0
        self._total = 0
        # create a list of words (_no_ numbers)
        words = [word for word in self.rwords.findall(self._text) \
                 if not word.isdigit()]

        for i, word in enumerate(words):
            try:
                self._next_word = words[i + 1]
            except IndexError:
                self._next_word = ""
            self.parse_word(word)

        if self.tuning:
            for regex in self.keywords:
                occ = len(regex.findall(self._text))
                if occ:
                    for cat in self.keywords[regex]:
                        if cat:
                            try:
                                self._results[cat] += occ
                            except KeyError:
                                logging.warn("Invalid category id %s", cat)
                    if self.detailed:
                        self._detailed_data[regex.pattern] += occ
                self._dic += occ

    def parse_line(self, line):
        """
        Reads a single line of the csv file.
        Sets self._id and gives the cells that are not in the ignore
        list to self.parse_col
        """
        self.delattrs(("_id"))
        self._id = line[self.id_col]
        for i, col in enumerate(line):
            if len(col) <= self.max_char_limit:
                if i != self.id_col and i not in self.ignorecols:
                    self.parse_col(col)
                    self.save()
            else:
                logging.warning(
                    " Line %d:%d skipped "
                    "because longer than %d chars", self._counter, i,
                    self.max_char_limit)

    def start(self, src):
        """
        It starts the file processing.
        To obtain sensible output, it is recommended to run self.set_dic()
        first.
        It writes the output csv header and reads every line, passing
        it to self.parse_line
        """

        # Creates a list of category names sorted by their ID.
        # Useful because Python dictionaries are not sorted objects!
        # Sorting like TAWC
        try:
            cat_names = [x[1] for x in sorted([(int(a), b) for a, b in \
                                               self.categories.items()])]
        except ValueError:
            cat_names = [x[1] for x in sorted(self.categories.items())]

        self._keys = ["id"] + cat_names + [
            "qmarks", "unique", "dic", "sixltr", "total", "text"
        ]
        self.csv_writer = csv.DictWriter(self.csv_out,
                                         delimiter=self.delimiter,
                                         fieldnames=self._keys,
                                         quotechar=self.quotechar)
        self.csv_writer.writeheader()
        csv_reader = csv.reader(src, delimiter=self.delimiter)
        for line in csv_reader:
            self.parse_line(line)
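
# Hedged usage sketch (file names hypothetical): drive PyWC end to end.
#
#   pywc = PyWC()
#   pywc.set_dic("liwc.dic")       # populates categories and keywords
#   with open("input.tsv") as src:
#       pywc.start(src)            # results are written to pywc.csv_out
#   pywc.flush()                   # drain whatever is left in the queue
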
def main():
    import optparse
    p = optparse.OptionParser(
        usage="usage: %prog [options] dic input_file output_file")
    p.add_option('-l',
                 '--lang',
                 action="store",
                 dest="lang",
                 default="en",
                 help="Wikipedia language")
    p.add_option('-n',
                 '--edits',
                 action="store",
                 dest="edits",
                 type=int,
                 default=500,
                 help="Edit number to consider")
    opts, files = p.parse_args()
    if len(files) != 3:
        p.error("Wrong parameters")
    logging.basicConfig(stream=sys.stderr,
                        level=logging.DEBUG,
                        format='%(asctime)s %(levelname)s %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S')
    csv_reader = csv.reader(open(files[1], "r"))

    textcleaner = TextCleaner()
    pywc = PyWC()
    pywc.set_dic(files[0])

    try:
        cat_names = [
            str(x[1])
            for x in sorted([(int(a), b) for a, b in pywc.categories.items()])
        ]
    except ValueError:
        cat_names = [str(x[1]) for x in sorted(pywc.categories.items())]

    reverse_categories = {}
    for key, value in pywc.categories.iteritems():
        reverse_categories[value] = key

    arcsin_fields = ["%s_arcsin" % key for key in cat_names]

    fields = ["title", "total_edits", "unique_editors", "traumatic",
              "non_traumatic", "natural", "human", "len", "len_cleaned"] + \
             cat_names + arcsin_fields + \
             ["qmarks", "unique", "dic", "sixltr", "total"]

    csv_writer = csv.DictWriter(open(files[2], "w"), fields)

    csv_writer.writeheader()

    for line in csv_reader:
        title, rev_num = line[0], opts.edits - 1  # 0-based edit index
        revision_id = find_revision_id(title, rev_num, opts.lang,
                                       startid=None)
        if revision_id is None:
            continue
        rev = get_revision(revision_id, opts.lang)

        cleaned_rev = textcleaner.clean_all(rev)
        cleaned_rev = textcleaner.clean_wiki_syntax(cleaned_rev, True)

        pywc.parse_col(cleaned_rev)

        result = {
            "title": title,
            "total_edits": line[1],
            "unique_editors": line[2],
            "traumatic": line[3],
            "non_traumatic": line[4],
            "natural": line[5],
            "human": line[6],
            "len": len(rev.split()),
            "len_cleaned": len(cleaned_rev.split()),
            "qmarks": pywc._qmarks,
            "unique": len(pywc._unique),
            "dic": pywc._dic,
            "sixltr": pywc._sixltr,
            "total": pywc._total,
        }

        for key, val in reverse_categories.iteritems():
            score = perc(pywc._results[val], pywc._total) * 100
            arcsin = calc_arcsin(score)
            result[key] = score  # percentage results
            result["%s_arcsin" % key] = arcsin  # arcsin results

        csv_writer.writerow(result)
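
# Hypothetical invocation (script and file names assumed): score the pages
# listed in pages.csv against a LIWC-style dictionary, reading each page as
# of its 500th edit:
#
#   python pywc_revisions.py --lang en --edits 500 liwc.dic pages.csv out.csv
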
class HistoryRevisionsPageProcessor(HistoryPageProcessor):
    output = None
    queue = None
    _skip = None
    _prev_text = ""
    _text = None
    get_talks = True
    get_articles = True
    diff_timeout = 0.5
    clean = None
    textcleaner = None

    def __init__(self, **kwargs):
        super(HistoryRevisionsPageProcessor, self).__init__(**kwargs)
        self.textcleaner = TextCleaner(kwargs["userns"])
        self.queue = []
        f = open(self.output, 'w')
        self._keys = ["timestamp", "lang", "title", "type", "text"]
        self.csv_writer = csv.DictWriter(f, fieldnames=self._keys,
                                         delimiter='\t', quotechar='"',
                                         quoting=csv.QUOTE_ALL)

    def flush(self):
        """
        Flushes queue in the CSV output
        """
        pages = [{'title': page['title'],
                  'lang': self.lang,
                  'timestamp': page['timestamp'],
                  'text': page['text'],
                  'type': page['type']} for page in self.queue]
        self.csv_writer.writerows(pages)
        self.queue = []

    def save(self):
        """
        Saves data to the queue.
        The queue is stored using self.flush()
        """
        if self._text is None:  # difflib doesn't like NoneType
            self._text = ""
        if self.clean:
            self._text = self.textcleaner.clean_all(self._text)
        page = {'title': smart_str(self._title),
                'lang': self.lang,
                'timestamp': self._date,
                'text': smart_str(_diff_text(self._prev_text,
                                             self._text,
                                             timeout=self.diff_timeout)[0]),
                'type': self._type}
        self.queue.append(page)
        self._prev_text = self._text

    def process_title(self, elem):
        self.delattr(("_counter", "_type", "_title", "_skip", "_date", "text"))
        self._skip = False
        a_title = elem.text.split(':')

        if len(a_title) == 1 and self.get_articles:
            self._type = 'normal'
            self._title = a_title[0]
        elif len(a_title) == 2 and a_title[0] == self.talkns and \
             self.get_talks:
            self._type = 'talk'
            self._title = a_title[1]
        else:
            self._skip = True

        if not self._skip:
            self._desired = self.is_desired(self._title)
            if not self._desired:
                self._skip = True
            else:
                logging.info('Start processing desired page %s (%s)',
                             self._title, self._type)

    def process_timestamp(self, elem):
        if self._skip:
            return
        self._date = elem.text

    def process_text(self, elem):
        if self._skip:
            return
        self._text = elem.text
        self.save()

    def process_page(self, _):
        self.count += 1
        if not self.count % 1000:
            logging.info(' ### Processed %d pages', self.count)
        self.delattr(("text"))
        if not self._skip:
            with Timr('Flushing %s' % self._title):
                self.flush()
        self._skip = False

    def process_redirect(self, _):
        # This class only considers pages that are in the desired file,
        # these pages must not be redirects
        self._skip = True
        raise ValueError("The page %s is a redirect. " % self._title + \
                         "Pages in the desired list must not be redirects.")
class HistoryRevisionsPageProcessor(HistoryPageProcessor):
    output = None
    queue = None
    _skip = None
    _prev_text = ""
    _text = None
    get_talks = True
    get_articles = True
    diff_timeout = 0.5
    clean = None
    textcleaner = None
    rwords = re.compile(r"[\w']+")

    def __init__(self, **kwargs):
        super(HistoryRevisionsPageProcessor, self).__init__(**kwargs)
        self.textcleaner = TextCleaner(kwargs["userns"])
        self.queue = []
        fields = ["timestamp", "lang", "title", "type", "text"]
        self.csv_writer = csv.DictWriter(self.output,
                                         fieldnames=fields,
                                         delimiter='\t',
                                         quotechar='"',
                                         quoting=csv.QUOTE_ALL)

    def flush(self):
        """
        Flushes queue in the CSV output
        """
        pages = [{
            'title': page['title'],
            'lang': self.lang,
            'timestamp': page['timestamp'],
            'text': page['text'],
            'type': page['type']
        } for page in self.queue]
        self.csv_writer.writerows(pages)
        self.queue = []

    def save(self):
        """
        Saves data to the queue.
        The queue is stored using self.flush()
        """
        if self._text is None:  # difflib doesn't like NoneType
            self._text = ""
        if self.clean:
            self._text = self.textcleaner.clean_all(self._text)

        text_words = len(self.rwords.findall(self._text))
        prev_words = len(self.rwords.findall(self._prev_text))
        if text_words < 1000 or text_words <= 2 * prev_words:
            diff = _diff_text(self._prev_text,
                              self._text,
                              timeout=self.diff_timeout)[0]
            page = {
                'title': smart_str(self._title),
                'lang': self.lang,
                'timestamp': self._date,
                'text': smart_str(diff),
                'type': self._type
            }
            self.queue.append(page)
        else:
            logging.warn("Revert detected: skipping... (%s)", self._date)
        self._prev_text = self._text
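
    # Illustrative numbers (not in the original): with a 400-word previous
    # revision, a new revision of up to 800 words (or any revision under
    # 1000 words) is diffed and queued; a jump from 400 to 30,000 words is
    # treated as a likely revert/paste and skipped by save() above.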

    def process_title(self, elem):
        self.delattr(("_counter", "_type", "_title", "_skip", "_date", "text"))
        self._skip = False
        a_title = elem.text.split(':')

        if len(a_title) == 1 and self.get_articles:
            self._type = 'normal'
            self._title = a_title[0]
        elif len(a_title) == 2 and a_title[0] == self.talkns and \
             self.get_talks:
            self._type = 'talk'
            self._title = a_title[1]
        else:
            self._skip = True

        if not self._skip:
            self._desired = self.is_desired(self._title)
            if not self._desired:
                self._skip = True
            else:
                logging.info('Start processing desired page %s (%s)',
                             self._title, self._type)

    def process_timestamp(self, elem):
        if self._skip:
            return
        self._date = elem.text

    def process_text(self, elem):
        if self._skip:
            return
        self._text = elem.text
        self.save()

    def process_page(self, _):
        self.count += 1
        if not self.count % 1000:
            logging.info(' ### Processed %d pages', self.count)
        self.delattr(("text"))
        if not self._skip:
            with Timr('Flushing %s' % self._title):
                self.flush()
        self._skip = False

    def process_redirect(self, _):
        # This class only considers pages that are in the desired file,
        # these pages must not be redirects
        self._skip = True
        raise ValueError("The page %s is a redirect. " % self._title + \
                         "Pages in the desired list must not be redirects.")

class HistoryRevisionsPageProcessor(HistoryPageProcessor):
    output = None
    queue = None
    _skip = None
    _prev_text = ""
    _text = None
    _username = None
    _ip = None
    _sender = None
    _skip_revision = False
    diff_timeout = 0.5
    clean = None
    textcleaner = None
    rwords = re.compile(r"[\w']+")

    def __init__(self, **kwargs):
        super(HistoryRevisionsPageProcessor, self).__init__(**kwargs)
        self.textcleaner = TextCleaner(kwargs["userns"])
        self.queue = []
        fields = ["timestamp", "lang", "title", "type", "user", "text"]
        self.csv_writer = csv.DictWriter(self.output, fieldnames=fields,
                                         delimiter='\t', quotechar='"',
                                         quoting=csv.QUOTE_ALL)

    def flush(self):
        """
        Flushes queue in the CSV output
        """
        pages = [{'title': page['title'],
                  'lang': self.lang,
                  'timestamp': page['timestamp'],
                  'text': page['text'],
                  'user': page['user'],
                  'type': page['type']} for page in self.queue]
        self.csv_writer.writerows(pages)
        self.queue = []

    def save(self):
        """
        Saves data to the queue.
        The queue is stored using self.flush()
        """
        if self._text is None:  # difflib doesn't like NoneType
            self._text = ""
        if self.clean:
            self._text = self.textcleaner.clean_all(self._text)

        text_words = len(self.rwords.findall(self._text))
        prev_words = len(self.rwords.findall(self._prev_text))
        if text_words < 1000 or text_words <= 2 * prev_words:
            diff = _diff_text(self._prev_text,
                              self._text,
                              timeout=self.diff_timeout)[0]
            page = {'title': smart_str(self._title),
                    'lang': self.lang,
                    'timestamp': self._date,
                    'text': smart_str(diff),
                    'user': smart_str(self._sender),
                    'type': self._type}
            self.queue.append(page)
        else:
            logging.warn("Revert detected: skipping... (%s)", self._date)
        self._prev_text = self._text

    def process_username(self, elem):
        if self._skip_revision:
            return
        self._username = elem.text

    def process_ip(self, elem):
        if self._skip_revision:
            return
        self._ip = elem.text

    def process_contributor(self, contributor):
        if self._skip_revision:
            return
        if contributor is None:
            self._skip_revision = True
        self._sender = self._username or self._ip
        self.delattr(("_username", "_ip"))
        if not self._sender:
            self.counter_deleted += 1
            self._skip_revision = True

    def process_title(self, elem):
        self.delattr(("_counter", "_type", "_title", "_skip", "_date", "text",
                      "_username", "_ip", "_sender"))
        self._skip = False
        a_title = elem.text.split(':')

        if len(a_title) == 2 and a_title[0] == self.talkns:
            self._type = 'talk'
            self._title = a_title[1]
        elif len(a_title) == 2 and a_title[0] == self.usertalkns:
            self._type = 'user talk'
            self._title = a_title[1]
        elif len(a_title) == 1:
            self._type = 'normal'
            self._title = a_title[0]
        if not self._title:
            self._skip = True

        if not self._skip:
            if not ("talk" in self._type):
                self._skip = True
            else:
                logging.info('Start processing page %s (%s)',
                             self._title, self._type)

    def process_timestamp(self, elem):
        if self._skip:
            return
        self._date = elem.text

    def process_text(self, elem):
        if self._skip:
            return
        self._text = elem.text
        self.save()

    def process_page(self, _):
        self.count += 1
        if not self.count % 1000:
            logging.info(' ### Processed %d pages', self.count)
            if not self._skip:
                with Timr('Flushing %s' % self._title):
                    self.flush()
        self.delattr(("text"))
        self._skip = False

    def process_redirect(self, _):
        self._skip = True