class TestPyWC(unittest.TestCase):
    """Tests for ``PyWC`` driven by the fixture files under ``tests/pywc``."""

    def setUp(self):
        """Create a ``PyWC`` instance loaded with the simple test dictionary."""
        self.pywc = PyWC()
        self.pywc.set_dic("tests/pywc/simple-dic.dic")

    def test_read_dic(self):
        """The simple dictionary must expose 4 categories and 9 keywords."""
        self.assertEqual(len(self.pywc.categories), 4)
        self.assertEqual(len(self.pywc.keywords), 9)

    def test_output(self):
        """Processing the input fixture must reproduce the expected CSV."""
        with open("tests/pywc/pywc_expected.csv") as expected_file:
            expected = expected_file.read()

        self.pywc.csv_out = open("tests/pywc/pywc_result.csv", "w")
        try:
            with open("tests/pywc/pywc_input.csv") as src:
                self.pywc.start(src)
                self.pywc.flush()
        finally:
            # Close the output even when processing fails, so the handle is
            # not leaked and whatever was written reaches the disk.
            self.pywc.csv_out.close()

        with open("tests/pywc/pywc_result.csv") as result_file:
            result = result_file.read()

        self.assertEqual(expected, result)
def main():
    """Run PyWC over one revision of every page listed in a CSV file.

    Command line: ``prog [options] dic input_file output_file``

    * ``dic`` is handed to ``PyWC.set_dic``.
    * ``input_file`` is read as CSV. Column 0 is used as the page title and
      columns 1-6 are copied to the output as ``total_edits``,
      ``unique_editors``, ``traumatic``, ``non_traumatic``, ``natural`` and
      ``human``.
    * ``output_file`` receives a header plus one CSV row per processed page,
      holding the copied columns, word counts of the raw and cleaned text,
      one percentage column and one ``<name>_arcsin`` column per dictionary
      category, and the PyWC counters (``qmarks``, ``unique``, ``dic``,
      ``sixltr``, ``total``).

    Options:
        -l / --lang   Wikipedia language (default ``"en"``).
        -n / --edits  Edit number to consider (int, default 500).

    Blank input rows and rows for which ``find_revision_id`` returns
    ``None`` are skipped. Exits through ``OptionParser.error`` unless exactly
    three positional arguments are given.
    """
    # NOTE(review): `main` appears to be defined a second time, with the same
    # body, further down in this file; in Python the later definition is the
    # one bound to the name. Consider removing one of the copies.
    import optparse

    p = optparse.OptionParser(
        usage="usage: %prog [options] dic input_file output_file")
    p.add_option('-l', '--lang', action="store", dest="lang", default="en",
                 help="Wikipedia language")
    p.add_option('-n', '--edits', action="store", dest="edits", type=int,
                 default=500, help="Edit number to consider")
    opts, files = p.parse_args()
    if len(files) != 3:
        p.error("Wrong parameters")

    logging.basicConfig(stream=sys.stderr,
                        level=logging.DEBUG,
                        format='%(asctime)s %(levelname)s %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S')

    textcleaner = TextCleaner()
    pywc = PyWC()
    pywc.set_dic(files[0])

    # Order the category names by numeric key when every key parses as an
    # int, otherwise fall back to the plain ordering of the (key, name) pairs.
    try:
        ordered = sorted((int(a), b) for a, b in pywc.categories.items())
    except ValueError:
        ordered = sorted(pywc.categories.items())
    cat_names = [str(x[1]) for x in ordered]

    # Category name -> category key, used to look up per-category results.
    reverse_categories = {}
    for key, value in pywc.categories.items():
        reverse_categories[value] = key

    arcsin_fields = ["%s_arcsin" % key for key in cat_names]
    fields = ["title", "total_edits", "unique_editors", "traumatic",
              "non_traumatic", "natural", "human", "len", "len_cleaned"] + \
             cat_names + arcsin_fields + \
             ["qmarks", "unique", "dic", "sixltr", "total"]

    # Loop-invariant: the value passed to find_revision_id for every page.
    rev_index = opts.edits - 1

    # Context managers guarantee both files are closed (and the output is
    # flushed) even if processing a row raises.
    with open(files[1], "r") as src, open(files[2], "w") as dst:
        csv_reader = csv.reader(src)
        csv_writer = csv.DictWriter(dst, fields)
        csv_writer.writeheader()

        for line in csv_reader:
            if not line:
                # csv.reader yields [] for blank lines; nothing to process.
                continue

            title = line[0]
            revision_id = find_revision_id(title, rev_index, opts.lang,
                                           startid=None)
            if revision_id is None:
                continue

            rev = get_revision(revision_id, opts.lang)
            cleaned_rev = textcleaner.clean_all(rev)
            cleaned_rev = textcleaner.clean_wiki_syntax(cleaned_rev, True)
            pywc.parse_col(cleaned_rev)

            result = {
                "title": title,
                "total_edits": line[1],
                "unique_editors": line[2],
                "traumatic": line[3],
                "non_traumatic": line[4],
                "natural": line[5],
                "human": line[6],
                "len": len(rev.split()),
                "len_cleaned": len(cleaned_rev.split()),
                "qmarks": pywc._qmarks,
                "unique": len(pywc._unique),
                "dic": pywc._dic,
                "sixltr": pywc._sixltr,
                "total": pywc._total,
            }

            for key, val in reverse_categories.items():
                score = perc(pywc._results[val], pywc._total) * 100
                arcsin = calc_arcsin(score)
                result[key] = score  # percentage results
                result["%s_arcsin" % key] = arcsin  # arcsin results

            csv_writer.writerow(result)
def main():
    """Run PyWC over one revision of every page listed in a CSV file.

    Command line: ``prog [options] dic input_file output_file``

    * ``dic`` is handed to ``PyWC.set_dic``.
    * ``input_file`` is read as CSV. Column 0 is used as the page title and
      columns 1-6 are copied to the output as ``total_edits``,
      ``unique_editors``, ``traumatic``, ``non_traumatic``, ``natural`` and
      ``human``.
    * ``output_file`` receives a header plus one CSV row per processed page,
      holding the copied columns, word counts of the raw and cleaned text,
      one percentage column and one ``<name>_arcsin`` column per dictionary
      category, and the PyWC counters (``qmarks``, ``unique``, ``dic``,
      ``sixltr``, ``total``).

    Options:
        -l / --lang   Wikipedia language (default ``"en"``).
        -n / --edits  Edit number to consider (int, default 500).

    Blank input rows and rows for which ``find_revision_id`` returns
    ``None`` are skipped. Exits through ``OptionParser.error`` unless exactly
    three positional arguments are given.
    """
    # NOTE(review): `main` also appears to be defined earlier in this file
    # with the same body; this later definition is the one bound to the name.
    # Consider removing one of the copies.
    import optparse

    p = optparse.OptionParser(
        usage="usage: %prog [options] dic input_file output_file")
    p.add_option('-l', '--lang', action="store", dest="lang", default="en",
                 help="Wikipedia language")
    p.add_option('-n', '--edits', action="store", dest="edits", type=int,
                 default=500, help="Edit number to consider")
    opts, files = p.parse_args()
    if len(files) != 3:
        p.error("Wrong parameters")

    logging.basicConfig(stream=sys.stderr,
                        level=logging.DEBUG,
                        format='%(asctime)s %(levelname)s %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S')

    textcleaner = TextCleaner()
    pywc = PyWC()
    pywc.set_dic(files[0])

    # Order the category names by numeric key when every key parses as an
    # int, otherwise fall back to the plain ordering of the (key, name) pairs.
    try:
        ordered = sorted((int(a), b) for a, b in pywc.categories.items())
    except ValueError:
        ordered = sorted(pywc.categories.items())
    cat_names = [str(x[1]) for x in ordered]

    # Category name -> category key, used to look up per-category results.
    reverse_categories = {}
    for key, value in pywc.categories.items():
        reverse_categories[value] = key

    arcsin_fields = ["%s_arcsin" % key for key in cat_names]
    fields = ["title", "total_edits", "unique_editors", "traumatic",
              "non_traumatic", "natural", "human", "len", "len_cleaned"] + \
             cat_names + arcsin_fields + \
             ["qmarks", "unique", "dic", "sixltr", "total"]

    # Loop-invariant: the value passed to find_revision_id for every page.
    rev_index = opts.edits - 1

    # Context managers guarantee both files are closed (and the output is
    # flushed) even if processing a row raises.
    with open(files[1], "r") as src, open(files[2], "w") as dst:
        csv_reader = csv.reader(src)
        csv_writer = csv.DictWriter(dst, fields)
        csv_writer.writeheader()

        for line in csv_reader:
            if not line:
                # csv.reader yields [] for blank lines; nothing to process.
                continue

            title = line[0]
            revision_id = find_revision_id(title, rev_index, opts.lang,
                                           startid=None)
            if revision_id is None:
                continue

            rev = get_revision(revision_id, opts.lang)
            cleaned_rev = textcleaner.clean_all(rev)
            cleaned_rev = textcleaner.clean_wiki_syntax(cleaned_rev, True)
            pywc.parse_col(cleaned_rev)

            result = {
                "title": title,
                "total_edits": line[1],
                "unique_editors": line[2],
                "traumatic": line[3],
                "non_traumatic": line[4],
                "natural": line[5],
                "human": line[6],
                "len": len(rev.split()),
                "len_cleaned": len(cleaned_rev.split()),
                "qmarks": pywc._qmarks,
                "unique": len(pywc._unique),
                "dic": pywc._dic,
                "sixltr": pywc._sixltr,
                "total": pywc._total,
            }

            for key, val in reverse_categories.items():
                score = perc(pywc._results[val], pywc._total) * 100
                arcsin = calc_arcsin(score)
                result[key] = score  # percentage results
                result["%s_arcsin" % key] = arcsin  # arcsin results

            csv_writer.writerow(result)