def __init__(self, **kwargs):
    """Initialize the processor and the tab-separated revision writer.

    Expects kwargs to contain "userns" (user namespace for the text
    cleaner) plus whatever the parent processor consumes; self.output
    must be provided by the parent class.
    """
    super(HistoryRevisionsPageProcessor, self).__init__(**kwargs)
    self.textcleaner = TextCleaner(kwargs["userns"])
    self.queue = []
    self.csv_writer = csv.DictWriter(
        self.output,
        fieldnames=["timestamp", "lang", "title", "type", "text"],
        delimiter='\t',
        quotechar='"',
        quoting=csv.QUOTE_ALL)
def main():
    """Command-line entry point: dump a page's revision history to a TSV file.

    Usage: prog [options] page_title output_file
    """
    import optparse

    p = optparse.OptionParser(
        usage="usage: %prog [options] page_title output_file")
    p.add_option('-l', '--lang', action="store", dest="lang", default="en",
                 help="Wikipedia language")
    p.add_option('-c', '--clean', action="store_true", dest="clean",
                 help="Clean wiki syntax / HTML")
    opts, files = p.parse_args()
    if len(files) != 2:
        p.error("Wrong parameters")

    logging.basicConfig(stream=sys.stderr,
                        level=logging.DEBUG,
                        format='%(asctime)s %(levelname)s %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S')

    textcleaner = TextCleaner() if opts.clean else None

    # Context manager guarantees the output file is flushed and closed
    # even if get_revisions raises (the original leaked the handle).
    with open(files[1], "w") as out:
        csv_writer = csv.writer(out,
                                delimiter="\t",
                                quotechar='"',
                                quoting=csv.QUOTE_ALL)
        get_revisions(files[0], csv_writer, opts.lang, textcleaner)
class PyWC:
    """
    PyWC is a python class for word counting and text analysis.
    """
    # Global properties (of the whole source file)
    categories = None        # Dictionary's categories (id -> name)
    keywords = None          # Dictionary's keywords/regex
    delimiter = "\t"         # CSV delimiter
    quotechar = '"'          # CSV quotechar
    csv_out = sys.stdout     # CSV output
    queue = []               # Flushing queue (per-instance copy set in __init__)
    max_char_limit = 100000  # Max chars per line
    ignorecols = []          # Columns of the src file to ignore
                             # (per-instance copy set in __init__)
    csv_writer = None        # Csv writer handler
    id_col = 0               # Number of the id column
    dic_regex = False        # Use dictionary made of regex
    flush_n = 100            # Number of pieces of text to store
    clean_wiki = None        # Clean wiki syntax
    clean_html = None        # Clean HTML
    percentage = False       # Output as percentage
    tuning = False           # Set tuning mode (no conditional dictionary but
                             # a lot faster!)
    detailed = False         # detailed PYWC output per keyword
    rwords = re.compile(r"[\w']+")
    rqmarks = re.compile(r"\?")
    textcleaner = TextCleaner()
    cond_exp_regex = (re.compile(r"<([\w']+)>(\w+)(\/(\w+)?)?"),
                      re.compile(r"\(([\w\s]+)\)(\w+)(\/(\w+)?)?"))

    # Local properties (of every column of the source file)
    _id = None               # Line ID
    _results = None          # Dictionary where keys are cat ids and
                             # values are counters
    _qmarks = None           # Number of question marks
    _unique = None           # Set of unique words; len() of the set is the
                             # number of unique words
    _dic = None              # Number of words in dic
    _sixltr = None           # Number of words > 6 letters
    _total = None            # Number of total words per column
    _text = None             # Current text to analyze
    _next_word = None        # Next word that has to be analyzed
    _prev_cat = None         # Categories of the last word that has been
                             # analyzed (useful for conditional exps)
    _counter = 0             # Generic counter of how many pieces of
                             # text have been analyzed
    _keys = None
    _detailed_data = None    # data only for detailed output

    def __init__(self, **kwargs):
        """Set configuration attributes from keyword arguments."""
        self.__dict__ = kwargs
        # Give every instance its own queue/ignorecols: the class-level
        # lists would otherwise be shared (and mutated) across instances.
        self.queue = kwargs.get("queue", [])
        self.ignorecols = kwargs.get("ignorecols", [])

    def delattrs(self, attrs):
        """
        Frees memory by deleting the given attributes of the object,
        silently skipping the ones that are not set.
        """
        for attr in attrs:
            try:
                delattr(self, attr)
            except AttributeError:
                pass

    def _gen_keyword(self, content):
        """
        Generator for self.keywords (dictionary made of regexps as keys
        and their categories as values)
        """
        for line in content[2].split("\n")[1:-1]:
            # Comments start with //
            if line and not line.startswith("//"):
                line = line.split("\t")
                # If not using a dictionary made of regexps
                # it fixes the keyword for regexping:
                # "\b" is added at the beginning of every keyword.
                # If keyword doesn't end with "*", a "\b" is added
                # bad -> \bbad\b matches "bad" but not "badass"
                # bad* -> \bbad matches "bad" and "badass"
                if not self.dic_regex:
                    line[0] = "".join(["\\b", line[0]])
                    try:
                        if line[0][-1] == "*":
                            line[0] = line[0][:-1]
                        else:
                            line[0] = "".join([line[0], "\\b"])
                    except IndexError:
                        continue
                yield (re.compile(line[0], re.IGNORECASE), line[1:])

    def set_dic(self, dic):
        """
        Receives as input the dictionary filename. Reads the dictionary
        file and populates self.categories and self.keywords.

        Raises ValueError if the file is not a valid dic file
        (it must contain exactly two "%" section separators).
        """
        # Context manager closes the file (the original leaked it)
        with open(dic, 'r') as f:
            content = f.read()
        content = content.split("%")
        if len(content) != 3:
            raise ValueError("Invalid dic file")

        # Creates a dictionary where category ids are the keys
        # and category names are the values.
        # Splits content at first by new line, then by tab
        self.categories = dict(line.split("\t")
                               for line in content[1].split("\n")[1:-1]
                               if line)

        # Creates a dictionary where the compiled regex is the key
        # and category ids are the values
        self.keywords = dict(self._gen_keyword(content))

    def flush(self):
        """
        Writes everything which is in the queue to the csv output file
        """
        self.csv_writer.writerows(self.queue)
        self.queue = []

    def save(self):
        """
        Saves the counters of the current piece of text to the queue,
        flushing it every self.flush_n pieces
        """
        tmp = {
            "id": self._id,
            "qmarks": perc(self._qmarks, self._total, self.percentage),
            "unique": perc(len(self._unique), self._total, self.percentage),
            "dic": perc(self._dic, self._total, self.percentage),
            "sixltr": perc(self._sixltr, self._total, self.percentage),
            "total": self._total,
            "text": self._text
        }

        # Join of self.categories and self._results values
        for k, v in ((self.categories[x],
                      perc(self._results[x], self._total, self.percentage))
                     for x in self.categories):
            tmp[k] = v

        self.queue.append(tmp)
        del tmp

        self._counter += 1
        if self._counter % self.flush_n == 0:
            logging.info("### Flushing: %d", self._counter)
            self.flush()

    def parse_word(self, word):
        """
        Parses a single word with the dictionary of regexps
        (self.keywords). For every regex that matches, it increments
        every category they belong to in self._results
        """
        if not self.tuning:
            cat = []
            for regex in self.keywords:
                if regex.search(word):
                    if self.detailed:
                        self._detailed_data[regex.pattern] += 1
                    for i in self.keywords[regex]:
                        # Conditional expression: <next_word>cat/else_cat
                        res = self.cond_exp_regex[0].match(i)
                        if res:
                            if self._next_word == res.group(1):
                                cat.append(res.group(2))
                            elif res.group(4):
                                cat.append(res.group(4))
                            continue
                        # Conditional expression: (prev cats)cat/else_cat
                        res = self.cond_exp_regex[1].match(i)
                        if res:
                            if True in [c in self._prev_cat
                                        for c in res.group(1).split(" ")]:
                                cat.append(res.group(2))
                            elif res.group(4):
                                cat.append(res.group(4))
                            continue
                        # If dictionary contains trailing tabs,
                        # '' keys are saved. It skips them.
                        if i:
                            cat.append(i)

            for c in cat:
                try:
                    self._results[c] += 1
                except KeyError:
                    logging.warning("Invalid category id %s", c)

            if len(cat) > 0:
                # Increment word-in-dictionary counter
                self._dic += 1
            self._prev_cat = cat

        if len(word) > 6:
            # Increment word > 6 letters counter
            self._sixltr += 1
        self._total += 1
        self._unique.add(word)

    def parse_col(self, col):
        """
        Reads a single cell of the csv file. It splits it into words
        and gives them to self.parse_word
        """
        # Fixed typos of the original tuple ("_prev_word", "_prev cat"):
        # the per-column attributes are "_next_word" and "_prev_cat".
        self.delattrs(("_results", "_qmarks", "_unique", "_dic", "_sixltr",
                       "_total", "_text", "_next_word", "_prev_cat",
                       "_detailed_data"))

        self._text = col

        if self.clean_wiki or self.clean_html:
            self._text = self.textcleaner.clean_text(self._text)
        if self.clean_wiki:
            self._text = self.textcleaner.clean_wiki_syntax(self._text)
        if self.clean_html:
            self._text = self.textcleaner.clean_html_syntax(self._text)

        self._results = Counter()
        if self.detailed:
            self._detailed_data = Counter()
        self._qmarks = len(self.rqmarks.findall(self._text))
        self._unique = set()
        self._dic = 0
        self._sixltr = 0
        self._total = 0

        # create a list of words (_no_ numbers)
        words = [word for word in self.rwords.findall(self._text)
                 if not word.isdigit()]

        for i, word in enumerate(words):
            try:
                self._next_word = words[i + 1]
            except IndexError:
                self._next_word = ""
            self.parse_word(word)

        if self.tuning:
            # Tuning mode: one findall per regex over the whole text,
            # no conditional-expression handling (much faster).
            for regex in self.keywords:
                occ = len(regex.findall(self._text))
                if occ:
                    for cat in self.keywords[regex]:
                        if cat:
                            try:
                                self._results[cat] += occ
                            except KeyError:
                                logging.warning("Invalid category id %s", cat)
                    if self.detailed:
                        self._detailed_data[regex.pattern] += occ
                    self._dic += occ

    def parse_line(self, line):
        """
        Reads a single line of the csv file. Sets self._id and gives the
        cells that are not in the ignore list to self.parse_col
        """
        # Fixed: the original passed ("_id") — a plain string, not a
        # tuple — so delattrs iterated its characters and never
        # deleted the attribute.
        self.delattrs(("_id",))
        self._id = line[self.id_col]
        for i, col in enumerate(line):
            if len(col) <= self.max_char_limit:
                if i != self.id_col and not i in self.ignorecols:
                    self.parse_col(col)
                    self.save()
            else:
                logging.warning(" Line %d:%d skipped "
                                "because longer than %d chars",
                                self._counter, i, self.max_char_limit)

    def start(self, src):
        """
        It starts the file processing. To obtain a sensible output it is
        recommended to run self.set_dic() before. It writes the output
        csv header and reads every line, passing it to self.parse_line
        """
        # Creates a list of category names sorted by their ID.
        # Useful because Python dictionaries are not sorted objects!
        # Sorting like TAWC
        try:
            cat_names = [x[1] for x in
                         sorted([(int(a), b)
                                 for a, b in self.categories.items()])]
        except ValueError:
            # Non-numeric category ids: fall back to lexicographic sort
            cat_names = [x[1] for x in sorted(self.categories.items())]

        self._keys = ["id"] + cat_names + [
            "qmarks", "unique", "dic", "sixltr", "total", "text"
        ]

        self.csv_writer = csv.DictWriter(self.csv_out,
                                         delimiter=self.delimiter,
                                         fieldnames=self._keys,
                                         quotechar=self.quotechar)
        self.csv_writer.writeheader()

        csv_reader = csv.reader(src, delimiter=self.delimiter)
        for line in csv_reader:
            self.parse_line(line)
def main():
    """Command-line entry point: score fixed-length revisions with a PyWC dic.

    Usage: prog [options] dic input_file output_file

    For every page title in the input CSV it fetches the revision at
    the --edits offset, cleans it, runs PyWC over it and writes the
    per-category scores (plus arcsin-transformed scores) to the output
    CSV.
    """
    import optparse

    p = optparse.OptionParser(
        usage="usage: %prog [options] dic input_file output_file")
    p.add_option('-l', '--lang', action="store", dest="lang", default="en",
                 help="Wikipedia language")
    p.add_option('-n', '--edits', action="store", dest="edits", type=int,
                 default=500, help="Edit number to consider")
    opts, files = p.parse_args()
    if len(files) != 3:
        p.error("Wrong parameters")

    logging.basicConfig(stream=sys.stderr,
                        level=logging.DEBUG,
                        format='%(asctime)s %(levelname)s %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S')

    textcleaner = TextCleaner()
    pywc = PyWC()
    pywc.set_dic(files[0])

    # Category names sorted by numeric id when possible (TAWC ordering),
    # falling back to lexicographic sort for non-numeric ids.
    try:
        cat_names = [str(x[1]) for x in
                     sorted([(int(a), b)
                             for a, b in pywc.categories.items()])]
    except ValueError:
        cat_names = [str(x[1]) for x in sorted(pywc.categories.items())]

    # Category name -> category id, to look counters up by name.
    reverse_categories = {}
    for key, value in pywc.categories.iteritems():
        reverse_categories[value] = key

    arcsin_fields = ["%s_arcsin" % key for key in cat_names]
    fields = ["title", "total_edits", "unique_editors", "traumatic",
              "non_traumatic", "natural", "human", "len", "len_cleaned"] + \
        cat_names + arcsin_fields + \
        ["qmarks", "unique", "dic", "sixltr", "total"]

    # Context managers guarantee both handles are closed (the original
    # leaked the input and the output file).
    with open(files[1], "r") as src, open(files[2], "w") as dst:
        csv_reader = csv.reader(src)
        csv_writer = csv.DictWriter(dst, fields)
        csv_writer.writeheader()

        for line in csv_reader:
            title, rev = line[0], opts.edits - 1
            revision_id = find_revision_id(title, rev, opts.lang,
                                           startid=None)
            if revision_id is None:
                # Page has fewer revisions than requested: skip it
                continue
            rev = get_revision(revision_id, opts.lang)

            cleaned_rev = textcleaner.clean_all(rev)
            cleaned_rev = textcleaner.clean_wiki_syntax(cleaned_rev, True)
            pywc.parse_col(cleaned_rev)

            result = {
                "title": title,
                "total_edits": line[1],
                "unique_editors": line[2],
                "traumatic": line[3],
                "non_traumatic": line[4],
                "natural": line[5],
                "human": line[6],
                "len": len(rev.split()),
                "len_cleaned": len(cleaned_rev.split()),
                "qmarks": pywc._qmarks,
                "unique": len(pywc._unique),
                "dic": pywc._dic,
                "sixltr": pywc._sixltr,
                "total": pywc._total,
            }
            for key, val in reverse_categories.iteritems():
                score = perc(pywc._results[val], pywc._total) * 100
                arcsin = calc_arcsin(score)
                result[key] = score                  # percentage results
                result["%s_arcsin" % key] = arcsin   # arcsin results
            csv_writer.writerow(result)
def setUp(self):
    """Create a fresh TextCleaner before every test case."""
    self.textcleaner = TextCleaner()