Esempio n. 1
0
class TestTextCleaner(unittest.TestCase):
    """Exercise TextCleaner's plain-text, HTML and wiki-markup cleaners.

    Each test feeds a tuple of raw samples to one cleaning method and
    compares every result with the expected string at the same position.
    """

    def setUp(self):
        # A fresh cleaner for every test method.
        self.textcleaner = TextCleaner()

    def test_clean(self):
        """clean_text() must yield the expected string for each sample."""
        t = (";D :E born in the U.S.A.! Yeah. A. :-D",
             "I feel sick today :S",
             ":My favourite TV series: The Big Bang Theory",
             "F.B.K.")
        e = ("born in the! Yeah. ",
             "I feel sick today ",
             ":My favourite TV series: The Big Bang Theory",
             "")
        # Guard against zip() silently dropping unmatched samples.
        self.assertEqual(len(t), len(e))
        for s, expected in zip(t, e):
            self.assertEqual(self.textcleaner.clean_text(s), expected)

    def test_clean_HTML(self):
        """clean_html_syntax() must yield the expected string per sample."""
        t = ("<div><b>42</b> is the <a href='#'>answer</a></div>",
             "<span>Hello World</span>",
             "<!-- I mustn't read this --> Are comments being filtered?",
             "I don't &amp; like HTML entities &dioji; LO&ppp;L")
        e = ("42 is the answer",
             "Hello World",
             " Are comments being filtered?",
             "I don't  like HTML entities  LOL")
        self.assertEqual(len(t), len(e))
        for s, expected in zip(t, e):
            self.assertEqual(self.textcleaner.clean_html_syntax(s), expected)

    def test_clean_wiki(self):
        """clean_wiki_syntax() must yield the expected string per sample."""
        t = ("Less taxes for everyone! {{citation needed}}",
             "look here http://google.it/a/lol.html lol :D http://wiki.com",
             "drink a relaxing [Jack Daniel's]",
             "If you want some [Wikipedia:Help] look here",
             "| name =goofy, |city =New York",
             "[File:Case di Volano.jpg|thumb|250px|Volano vista da un dosso]",
             "vicino a [[Calliano (Trentino-Alto Adige)|Calliano]] c'e' un",
             "[[nap:test:Volano (TN)]]",
             "andare in S.Marco",
             # Deliberately malformed link: one closing bracket is missing.
             "[[Pagina|link fatto male poiche' manca una parentesi quadra "
             "e c'e' caratteri strani dentro? ;)]",
             "[http://www.nps.gov/ Oklahoma City National Memorial] National")

        e = ("Less taxes for everyone! ",
             "look here  lol :D ",
             "drink a relaxing Jack Daniel's",
             "If you want some Help look here",
             "",
             "Volano vista da un dosso",
             "vicino a Calliano c'e' un",
             "Volano (TN)",
             "andare in S.Marco",
             "link fatto male poiche' manca una parentesi quadra "
             "e c'e' caratteri strani dentro? ;)",
             " Oklahoma City National Memorial National")

        self.assertEqual(len(t), len(e))
        for s, expected in zip(t, e):
            self.assertEqual(self.textcleaner.clean_wiki_syntax(s), expected)
Esempio n. 2
0
class TestTextCleaner(unittest.TestCase):
    """Unit tests for the three TextCleaner cleaning methods.

    Every test pairs a tuple of raw inputs with a tuple of expected outputs
    and checks them position by position.
    """

    def setUp(self):
        # Each test method gets its own TextCleaner instance.
        self.textcleaner = TextCleaner()

    def test_clean(self):
        """Check clean_text() against the expected output of each sample."""
        t = (";D :E born in the U.S.A.! Yeah. A. :-D",
             "I feel sick today :S",
             ":My favourite TV series: The Big Bang Theory",
             "F.B.K.")
        e = ("born in the! Yeah. ",
             "I feel sick today ",
             ":My favourite TV series: The Big Bang Theory",
             "")
        # zip() stops at the shorter tuple, so verify nothing is left out.
        self.assertEqual(len(t), len(e))
        for s, expected in zip(t, e):
            self.assertEqual(self.textcleaner.clean_text(s), expected)

    def test_clean_HTML(self):
        """Check clean_html_syntax() against each expected output."""
        t = ("<div><b>42</b> is the <a href='#'>answer</a></div>",
             "<span>Hello World</span>",
             "<!-- I mustn't read this --> Are comments being filtered?",
             "I don't &amp; like HTML entities &dioji; LO&ppp;L")
        e = ("42 is the answer",
             "Hello World",
             " Are comments being filtered?",
             "I don't  like HTML entities  LOL")
        self.assertEqual(len(t), len(e))
        for s, expected in zip(t, e):
            self.assertEqual(self.textcleaner.clean_html_syntax(s), expected)

    def test_clean_wiki(self):
        """Check clean_wiki_syntax() against each expected output."""
        t = ("Less taxes for everyone! {{citation needed}}",
             "look here http://google.it/a/lol.html lol :D http://wiki.com",
             "drink a relaxing [Jack Daniel's]",
             "If you want some [Wikipedia:Help] look here",
             "| name =goofy, |city =New York",
             "[File:Case di Volano.jpg|thumb|250px|Volano vista da un dosso]",
             "vicino a [[Calliano (Trentino-Alto Adige)|Calliano]] c'e' un",
             "[[nap:test:Volano (TN)]]",
             "andare in S.Marco",
             # Intentionally broken link: a closing square bracket is missing.
             "[[Pagina|link fatto male poiche' manca una parentesi quadra "
             "e c'e' caratteri strani dentro? ;)]",
             "[http://www.nps.gov/ Oklahoma City National Memorial] National")

        e = ("Less taxes for everyone! ",
             "look here  lol :D ",
             "drink a relaxing Jack Daniel's",
             "If you want some Help look here",
             "",
             "Volano vista da un dosso",
             "vicino a Calliano c'e' un",
             "Volano (TN)",
             "andare in S.Marco",
             "link fatto male poiche' manca una parentesi quadra "
             "e c'e' caratteri strani dentro? ;)",
             " Oklahoma City National Memorial National")

        self.assertEqual(len(t), len(e))
        for s, expected in zip(t, e):
            self.assertEqual(self.textcleaner.clean_wiki_syntax(s), expected)
Esempio n. 3
0
def main():
    """Compute word-category statistics for listed pages and write a CSV.

    Command line: ``[options] dic input_file output_file``

    * ``dic`` is handed to ``PyWC.set_dic``.
    * ``input_file`` is a CSV file. In every row, column 0 is used as the
      page title and columns 1-6 are copied unchanged to the output columns
      ``total_edits``, ``unique_editors``, ``traumatic``, ``non_traumatic``,
      ``natural`` and ``human``.
    * ``output_file`` receives a header line plus one row for every input
      row whose revision id could be found.

    Options:
        -l, --lang   Wikipedia language (default ``"en"``).
        -n, --edits  Edit number to consider (default ``500``).

    For each category in ``pywc.categories`` the output holds a percentage
    column and a matching ``<category>_arcsin`` column.
    """
    import optparse
    p = optparse.OptionParser(
        usage="usage: %prog [options] dic input_file output_file")
    p.add_option('-l',
                 '--lang',
                 action="store",
                 dest="lang",
                 default="en",
                 help="Wikipedia language")
    p.add_option('-n',
                 '--edits',
                 action="store",
                 dest="edits",
                 type=int,
                 default=500,
                 help="Edit number to consider")
    opts, files = p.parse_args()
    if len(files) != 3:
        p.error("Wrong parameters")
    logging.basicConfig(stream=sys.stderr,
                        level=logging.DEBUG,
                        format='%(asctime)s %(levelname)s %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S')

    textcleaner = TextCleaner()
    pywc = PyWC()
    pywc.set_dic(files[0])

    # Order the category names by numeric category id when every id parses
    # as an integer; otherwise fall back to plain sorting of the raw ids.
    try:
        cat_names = [
            str(x[1])
            for x in sorted([(int(a), b) for a, b in pywc.categories.items()])
        ]
    except ValueError:
        cat_names = [str(x[1]) for x in sorted(pywc.categories.items())]

    # Map category name -> category id (the inverse of pywc.categories).
    # items() behaves the same on Python 2 and 3, unlike iteritems().
    reverse_categories = {}
    for key, value in pywc.categories.items():
        reverse_categories[value] = key

    arcsin_fields = ["%s_arcsin" % key for key in cat_names]

    fields = ["title", "total_edits", "unique_editors", "traumatic",
              "non_traumatic", "natural", "human", "len", "len_cleaned"] + \
             cat_names + arcsin_fields + \
             ["qmarks", "unique", "dic", "sixltr", "total"]

    # Context managers guarantee both files are closed (and the output is
    # flushed) even if processing a page raises.
    with open(files[1], "r") as in_file, open(files[2], "w") as out_file:
        csv_reader = csv.reader(in_file)
        csv_writer = csv.DictWriter(out_file, fields)
        csv_writer.writeheader()

        for line in csv_reader:
            if not line:
                # csv.reader yields [] for blank lines; nothing to process.
                continue
            title = line[0]
            # NOTE(review): presumably a zero-based revision index -- confirm
            # against find_revision_id.
            rev_index = opts.edits - 1
            revision_id = find_revision_id(title, rev_index, opts.lang,
                                           startid=None)
            if revision_id is None:
                continue
            rev_text = get_revision(revision_id, opts.lang)

            cleaned_rev = textcleaner.clean_all(rev_text)
            cleaned_rev = textcleaner.clean_wiki_syntax(cleaned_rev, True)

            pywc.parse_col(cleaned_rev)

            result = {
                "title": title,
                "total_edits": line[1],
                "unique_editors": line[2],
                "traumatic": line[3],
                "non_traumatic": line[4],
                "natural": line[5],
                "human": line[6],
                "len": len(rev_text.split()),
                "len_cleaned": len(cleaned_rev.split()),
                "qmarks": pywc._qmarks,
                "unique": len(pywc._unique),
                "dic": pywc._dic,
                "sixltr": pywc._sixltr,
                "total": pywc._total,
            }

            for key, val in reverse_categories.items():
                score = perc(pywc._results[val], pywc._total) * 100
                arcsin = calc_arcsin(score)
                result[key] = score  # percentage results
                result["%s_arcsin" % key] = arcsin  # arcsin results

            csv_writer.writerow(result)
Esempio n. 4
0
def main():
    """Run the word-count analysis over the pages listed in a CSV file.

    Usage: ``[options] dic input_file output_file``

    ``dic`` is loaded through ``PyWC.set_dic``. Each row of ``input_file``
    supplies a page title (column 0) and six values (columns 1-6) that are
    copied to the output as ``total_edits``, ``unique_editors``,
    ``traumatic``, ``non_traumatic``, ``natural`` and ``human``. A header
    and one row per page with a resolvable revision id are written to
    ``output_file``; rows whose revision id is ``None`` are skipped.

    Options:
        -l, --lang   Wikipedia language (default ``"en"``).
        -n, --edits  Edit number to consider (default ``500``).
    """
    import optparse
    p = optparse.OptionParser(
        usage="usage: %prog [options] dic input_file output_file")
    p.add_option('-l', '--lang', action="store", dest="lang", default="en",
                 help="Wikipedia language")
    p.add_option('-n', '--edits', action="store", dest="edits", type=int,
                 default=500, help="Edit number to consider")
    opts, files = p.parse_args()
    if len(files) != 3:
        p.error("Wrong parameters")
    logging.basicConfig(stream=sys.stderr,
                        level=logging.DEBUG,
                        format='%(asctime)s %(levelname)s %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S')

    textcleaner = TextCleaner()
    pywc = PyWC()
    pywc.set_dic(files[0])

    # Prefer ordering by integer category id; if any id is not an integer,
    # sort the (id, name) pairs as they are.
    try:
        cat_names = [str(x[1]) for x in sorted([(int(a), b) for a, b in
                     pywc.categories.items()])]
    except ValueError:
        cat_names = [str(x[1]) for x in sorted(pywc.categories.items())]

    # Inverse mapping: category name -> category id. items() is used
    # because iteritems() does not exist on Python 3.
    reverse_categories = {}
    for key, value in pywc.categories.items():
        reverse_categories[value] = key

    arcsin_fields = ["%s_arcsin" % key for key in cat_names]

    fields = ["title", "total_edits", "unique_editors", "traumatic",
              "non_traumatic", "natural", "human", "len", "len_cleaned"] + \
             cat_names + arcsin_fields + \
             ["qmarks", "unique", "dic", "sixltr", "total"]

    # Open both files in a with-block so they are always closed and the
    # written rows are flushed, even when a page fails midway.
    with open(files[1], "r") as source, open(files[2], "w") as sink:
        csv_reader = csv.reader(source)
        csv_writer = csv.DictWriter(sink, fields)
        csv_writer.writeheader()

        for line in csv_reader:
            if not line:
                # Blank lines come back as empty lists; skip them instead
                # of failing on line[0].
                continue
            title = line[0]
            # NOTE(review): looks like a zero-based index of the edit to
            # fetch -- verify against find_revision_id.
            wanted_edit = opts.edits - 1
            revision_id = find_revision_id(title, wanted_edit, opts.lang,
                                           startid=None)
            if revision_id is None:
                continue
            rev_text = get_revision(revision_id, opts.lang)

            cleaned_rev = textcleaner.clean_all(rev_text)
            cleaned_rev = textcleaner.clean_wiki_syntax(cleaned_rev, True)

            pywc.parse_col(cleaned_rev)

            result = {
                "title": title,
                "total_edits": line[1],
                "unique_editors": line[2],
                "traumatic": line[3],
                "non_traumatic": line[4],
                "natural": line[5],
                "human": line[6],
                "len": len(rev_text.split()),
                "len_cleaned": len(cleaned_rev.split()),
                "qmarks": pywc._qmarks,
                "unique": len(pywc._unique),
                "dic": pywc._dic,
                "sixltr": pywc._sixltr,
                "total": pywc._total,
            }

            for key, val in reverse_categories.items():
                score = perc(pywc._results[val], pywc._total) * 100
                arcsin = calc_arcsin(score)
                result[key] = score  # percentage results
                result["%s_arcsin" % key] = arcsin  # arcsin results

            csv_writer.writerow(result)