Example #1
def __init__(self, **kwargs):
    super(HistoryRevisionsPageProcessor, self).__init__(**kwargs)
    self.textcleaner = TextCleaner(kwargs["userns"])
    self.queue = []
    fields = ["timestamp", "lang", "title", "type", "text"]
    self.csv_writer = csv.DictWriter(self.output,
                                     fieldnames=fields,
                                     delimiter='\t',
                                     quotechar='"',
                                     quoting=csv.QUOTE_ALL)
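
A minimal sketch of how a row matching the fields configured above could be
written; the processor instance and every field value below are hypothetical:

row = {
    "timestamp": "2010-01-01T00:00:00Z",
    "lang": "en",
    "title": "Main Page",
    "type": "normal",
    "text": "cleaned revision text",
}
processor.csv_writer.writerow(row)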
Example #2
def main():
    import optparse
    p = optparse.OptionParser(
        usage="usage: %prog [options] page_title output_file")
    p.add_option('-l', '--lang', action="store", dest="lang", default="en",
                 help="Wikipedia language")
    p.add_option('-c', '--clean', action="store_true", dest="clean",
                 help="Clean wiki syntax / HTML")
    opts, files = p.parse_args()
    if len(files) != 2:
        p.error("Wrong parameters")
    logging.basicConfig(stream=sys.stderr,
                        level=logging.DEBUG,
                        format='%(asctime)s %(levelname)s %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S')
    csv_writer = csv.writer(open(files[1], "w"),
                            delimiter="\t",
                            quotechar='"',
                            quoting=csv.QUOTE_ALL)
    textcleaner = None
    if opts.clean:
        textcleaner = TextCleaner()
    get_revisions(files[0], csv_writer, opts.lang, textcleaner)
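
Assuming this script were saved as get_revisions.py (a hypothetical file
name), a typical invocation would be:

    python get_revisions.py --lang en --clean "Main Page" out.tsv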
Example #3
class PyWC:
    """
    PyWC is a Python class for word counting and text analysis.
    """
    # Global properties (of the whole source file)
    categories = None  # Dictionary's categories
    keywords = None  # Dictionary's keywords/regex
    delimiter = "\t"  # CSV delimiter
    quotechar = '"'  # CSV quotechar
    csv_out = sys.stdout  # CSV output
    queue = []  # Flushing queue
    max_char_limit = 100000  # Max chars per line
    ignorecols = []  # List of columns of the src file to ignore
    csv_writer = None  # Csv writer handler
    id_col = 0  # Number of the id column
    dic_regex = False  # Use dictionary made of regex
    flush_n = 100  # Number of pieces of text to store
    clean_wiki = None  # Clean wiki syntax
    clean_html = None  # Clean HTML
    percentage = False  # Output as percentage
    tuning = False  # Set tuning mode (no conditional dictionary but
    # a lot faster!)
    detailed = False  # Detailed PyWC output per keyword

    rwords = re.compile(r"[\w']+")
    rqmarks = re.compile(r"\?")

    textcleaner = TextCleaner()

    cond_exp_regex = (re.compile(r"<([\w']+)>(\w+)(\/(\w+)?)?"),
                      re.compile(r"\(([\w\s]+)\)(\w+)(\/(\w+)?)?"))

    # Local properties (of every column of the source file)
    _id = None  # Line ID
    _results = None  # Dictionary where keys are cat ids and
    # values are counters
    _qmarks = None  # Number of question marks
    _unique = None  # Set of unique words, len() of the set is the number
    # of unique words
    _dic = None  # Number of words in dic
    _sixltr = None  # Number of words > 6 letters
    _total = None  # Number of total words per column
    _text = None  # Current text to analyze
    _next_word = None  # Next word that has to be analyzed
    _prev_cat = None  # Categories of the last word that was analyzed
    # (useful for conditional exps)
    _counter = 0  # Generic counter of how many pieces of
    # text have been analyzed
    _keys = None
    _detailed_data = None  # data only for detailed output

    def __init__(self, **kwargs):
        self.__dict__ = kwargs

    def delattrs(self, attrs):
        """
        Frees memory by deleting unneeded attributes of the object
        """
        for attr in attrs:
            try:
                delattr(self, attr)
            except AttributeError:
                pass

    def _gen_keyword(self, content):
        """
        Generator for self.keywords (dictionary made of regexps
        as keys and their categories as values)
        """
        for line in content[2].split("\n")[1:-1]:
            # Comments start with //
            if line and not line.startswith("//"):
                line = line.split("\t")
                # If not using a dictionary made of regexps,
                # the keyword is fixed up for regexping:
                # "\b" is added at the beginning of every keyword;
                # if the keyword doesn't end with "*", a "\b" is appended.
                # bad -> \bbad\b matches "bad" but not "badass"
                # bad* -> \bbad matches "bad" and "badass"
                if not self.dic_regex:
                    line[0] = "".join(["\\b", line[0]])
                    try:
                        if line[0][-1] == "*":
                            line[0] = line[0][:-1]
                        else:
                            line[0] = "".join([line[0], "\\b"])
                    except IndexError:
                        continue
                yield (re.compile(line[0], re.IGNORECASE), line[1:])

    def set_dic(self, dic):
        """
        Receives as input the dictionary filename.
        Reads the dictionary file and populates self.categories and
        self.keywords
        """
        with open(dic, 'r') as f:
            content = f.read()
        content = content.split("%")
        if len(content) != 3:
            raise ValueError("Invalid dic file")

        # Creates a dictionary where category ids are the keys
        # and category names are the values.
        # Splits content at first by new line, then by tab
        self.categories = dict(line.split("\t")
                               for line in content[1].split("\n")[1:-1]
                               if line)

        # Creates a dictionary where the compiled regex is the key
        # and category ids are the values
        self.keywords = dict(self._gen_keyword(content))

    def flush(self):
        """
        Writes everything which is in the queue in the csv output file
        """
        self.csv_writer.writerows(self.queue)
        self.queue = []

    def save(self):
        """
        Saves the current piece of text that has been analyzed to the queue
        """
        tmp = {
            "id": self._id,
            "qmarks": perc(self._qmarks, self._total, self.percentage),
            "unique": perc(len(self._unique), self._total, self.percentage),
            "dic": perc(self._dic, self._total, self.percentage),
            "sixltr": perc(self._sixltr, self._total, self.percentage),
            "total": self._total,
            "text": self._text
        }
        # Join of self.categories and self._results values
        for k, v in ((self.categories[x], \
                      perc(self._results[x], self._total, self.percentage)) \
                    for x in self.categories):
            tmp[k] = v
        self.queue.append(tmp)
        del tmp
        self._counter += 1
        if self._counter % self.flush_n == 0:
            logging.info("### Flushing: %d", self._counter)
            self.flush()

    def parse_word(self, word):
        """
        Parses a single word with the dictionary of regexps
        (self.keywords). For every regex that matches, it
        increments every category it belongs to in self._results
        """
        if not self.tuning:
            cat = []
            for regex in self.keywords:
                if regex.search(word):
                    if self.detailed:
                        self._detailed_data[regex.pattern] += 1
                    for i in self.keywords[regex]:
                        res = self.cond_exp_regex[0].match(i)
                        if res:
                            if self._next_word == res.group(1):
                                cat.append(res.group(2))
                            elif res.group(4):
                                cat.append(res.group(4))
                            continue

                        res = self.cond_exp_regex[1].match(i)
                        if res:
                            if True in [c in self._prev_cat \
                                        for c in res.group(1).split(" ")]:
                                cat.append(res.group(2))
                            elif res.group(4):
                                cat.append(res.group(4))
                            continue

                        # If dictionary contains trailing tabs,
                        # '' keys are saved. It skips them.
                        if i:
                            cat.append(i)

            for c in cat:
                try:
                    self._results[c] += 1
                except KeyError:
                    logging.warn("Invalid category id %s", c)
            if len(cat) > 0:  # Increment word in dictionary counter
                self._dic += 1
            self._prev_cat = cat

        if len(word) > 6:  # Increment word > 6 letters counter
            self._sixltr += 1
        self._total += 1
        self._unique.add(word)

    def parse_col(self, col):
        """
        Reads a single cell of the csv file. It splits it
        into words and gives them to self.parse_word
        """
        self.delattrs(
            ("_results", "_qmarks", "_unique", "_dic", "_sixltr", "_total",
             "_text", "_prev_word", "_prev cat", "_detailed_data"))
        self._text = col
        #logging.info("--------PRIMA-----------")
        #logging.info(self._text)
        #logging.info("-------------------")
        if self.clean_wiki or self.clean_html:
            self._text = self.textcleaner.clean_text(self._text)
        if self.clean_wiki:
            self._text = self.textcleaner.clean_wiki_syntax(self._text)
        if self.clean_html:
            self._text = self.textcleaner.clean_html_syntax(self._text)
        #logging.info("--------DOPO------------")
        #logging.info(self._text)
        #logging.info("-------------------")
        self._results = Counter()
        if self.detailed:
            self._detailed_data = Counter()
        self._qmarks = len(self.rqmarks.findall(self._text))
        self._unique = set()
        self._dic = 0
        self._sixltr = 0
        self._total = 0
        # create a list of words (_no_ numbers)
        words = [word for word in self.rwords.findall(self._text) \
                 if not word.isdigit()]

        for i, word in enumerate(words):
            try:
                self._next_word = words[i + 1]
            except IndexError:
                self._next_word = ""
            self.parse_word(word)

        if self.tuning:
            for regex in self.keywords:
                occ = len(regex.findall(self._text))
                if occ:
                    for cat in self.keywords[regex]:
                        if cat:
                            try:
                                self._results[cat] += occ
                            except KeyError:
                                logging.warn("Invalid category id %s", cat)
                    if self.detailed:
                        self._detailed_data[regex.pattern] += occ
                self._dic += occ

    def parse_line(self, line):
        """
        Reads a single line of the csv file.
        Sets self._id and gives the cells that are not in the ignore
        list to self.parse_col
        """
        self.delattrs(("_id"))
        self._id = line[self.id_col]
        for i, col in enumerate(line):
            if len(col) <= self.max_char_limit:
                if i != self.id_col and i not in self.ignorecols:
                    self.parse_col(col)
                    self.save()
            else:
                logging.warning(
                    "Line %d:%d skipped "
                    "because longer than %d chars", self._counter, i,
                    self.max_char_limit)

    def start(self, src):
        """
        Starts the file processing.
        To obtain sensible output, it is recommended to run self.set_dic()
        first.
        It writes the output csv header and reads every line, passing
        it to self.parse_line
        """

        # Creates a list of category names sorted by their ID.
        # Useful because Python dictionaries are not sorted objects!
        # Sorting like TAWC
        try:
            cat_names = [x[1] for x in sorted([(int(a), b) for a, b in \
                                               self.categories.items()])]
        except ValueError:
            cat_names = [x[1] for x in sorted(self.categories.items())]

        self._keys = ["id"] + cat_names + [
            "qmarks", "unique", "dic", "sixltr", "total", "text"
        ]
        self.csv_writer = csv.DictWriter(self.csv_out,
                                         delimiter=self.delimiter,
                                         fieldnames=self._keys,
                                         quotechar=self.quotechar)
        self.csv_writer.writeheader()
        csv_reader = csv.reader(src, delimiter=self.delimiter)
        for line in csv_reader:
            self.parse_line(line)
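
A minimal usage sketch for the class above; words.dic and input.csv are
hypothetical file names, and output goes to csv_out (sys.stdout by default):

pywc = PyWC(clean_html=True, percentage=True)
pywc.set_dic("words.dic")       # hypothetical LIWC-style dictionary
with open("input.csv") as src:  # hypothetical tab-separated input
    pywc.start(src)
pywc.flush()                    # write any rows still in the queue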
Example #4
def main():
    import optparse
    p = optparse.OptionParser(
        usage="usage: %prog [options] dic input_file output_file")
    p.add_option('-l',
                 '--lang',
                 action="store",
                 dest="lang",
                 default="en",
                 help="Wikipedia language")
    p.add_option('-n',
                 '--edits',
                 action="store",
                 dest="edits",
                 type=int,
                 default=500,
                 help="Edit number to consider")
    opts, files = p.parse_args()
    if len(files) != 3:
        p.error("Wrong parameters")
    logging.basicConfig(stream=sys.stderr,
                        level=logging.DEBUG,
                        format='%(asctime)s %(levelname)s %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S')
    csv_reader = csv.reader(open(files[1], "r"))

    textcleaner = TextCleaner()
    pywc = PyWC()
    pywc.set_dic(files[0])

    try:
        cat_names = [
            str(x[1])
            for x in sorted([(int(a), b) for a, b in pywc.categories.items()])
        ]
    except ValueError:
        cat_names = [str(x[1]) for x in sorted(pywc.categories.items())]

    reverse_categories = {}
    for key, value in pywc.categories.items():
        reverse_categories[value] = key

    arcsin_fields = ["%s_arcsin" % key for key in cat_names]

    fields = ["title", "total_edits", "unique_editors", "traumatic",
              "non_traumatic", "natural", "human", "len", "len_cleaned"] + \
             cat_names + arcsin_fields + \
             ["qmarks", "unique", "dic", "sixltr", "total"]

    csv_writer = csv.DictWriter(open(files[2], "w"), fields)

    csv_writer.writeheader()

    for line in csv_reader:
        title, rev = line[0], opts.edits - 1
        revision_id = find_revision_id(title, rev, opts.lang, startid=None)
        if revision_id is None:
            continue
        rev = get_revision(revision_id, opts.lang)

        cleaned_rev = textcleaner.clean_all(rev)
        cleaned_rev = textcleaner.clean_wiki_syntax(cleaned_rev, True)

        pywc.parse_col(cleaned_rev)

        result = {
            "title": title,
            "total_edits": line[1],
            "unique_editors": line[2],
            "traumatic": line[3],
            "non_traumatic": line[4],
            "natural": line[5],
            "human": line[6],
            "len": len(rev.split()),
            "len_cleaned": len(cleaned_rev.split()),
            "qmarks": pywc._qmarks,
            "unique": len(pywc._unique),
            "dic": pywc._dic,
            "sixltr": pywc._sixltr,
            "total": pywc._total,
        }

    for key, val in reverse_categories.items():
            score = perc(pywc._results[val], pywc._total) * 100
            arcsin = calc_arcsin(score)
            result[key] = score  # percentage results
            result["%s_arcsin" % key] = arcsin  # arcsin results

        csv_writer.writerow(result)
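
Assuming this script were saved as pywc_revisions.py (a hypothetical file
name), a typical invocation would be:

    python pywc_revisions.py -l en -n 500 words.dic pages.csv results.csv

where words.dic, pages.csv and results.csv are likewise hypothetical.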
Example #5
def setUp(self):
    self.textcleaner = TextCleaner()
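
A hypothetical test method that could use this fixture; clean_wiki_syntax
appears in the examples above, but the input string and assertion here are
illustrative only:

def test_clean_wiki_syntax(self):
    # assumes clean_wiki_syntax strips wiki markup such as '''bold'''
    cleaned = self.textcleaner.clean_wiki_syntax("'''bold''' text")
    self.assertNotIn("'''", cleaned)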