Beispiel #1
0
    def __init__(self, string, result=None):
        """Tokenize a raw line and gather statistics on its original characters.

        Args:
            string (str): Raw text of the line.
            result: Optional value stored as ``int(result)`` in ``self.result``;
                when ``None``, ``self.result`` stays ``None``.
        """
        # Each token is a 3-item list: [original text, clean_head_tail(text), None].
        # The third slot is only initialised here; its later use is not visible
        # from this method.
        self.tokens = [[tkn, clean_head_tail(tkn), None]
                       for tkn in tokenize(string)]

        # String containing the position of each token (e.g. "%0 %1%2 ... %n").
        # Built with a left-to-right scan over the *untouched* input so that a
        # token made of digits or "%" can never match inside a placeholder that
        # was inserted for an earlier token (the previous chained
        # ``str.replace`` turned "foo bar 1" into "%0 %%2 1").
        # Assumes tokenize() yields tokens in order of appearance - TODO confirm.
        pieces = []
        cursor = 0
        for index, token in enumerate(self.tokens):
            start = string.find(token[0], cursor)
            if start == -1:
                # Token text not found verbatim: leave the text as it is, like
                # a ``str.replace`` that matches nothing.
                continue
            pieces.append(string[cursor:start])
            pieces.append("%" + str(index))
            cursor = start + len(token[0])
        pieces.append(string[cursor:])
        self.pos_string = "".join(pieces)

        self.result = None if result is None else int(result)

        # Grade 0 when no token has any cleaned content left, 3 otherwise.
        has_clean_content = any(
            len(t[1]) > 0 for t in self.tokens if t[1] is not None)
        self.grade = 3 if has_clean_content else 0

        self.stats = {
            "orig": Statistics(["lw_char", "up_char", "nb_char", "sp_char"]),
            "clean": None  # No clean statistics are computed in this constructor
        }

        # Collapse every character class onto a single marker character, then
        # count the markers. Spaces are left alone and therefore not counted.
        tmp_line = re.sub(r'[a-z]', 'a',
                          self.get_orig_line())  # Lower chars replacement
        tmp_line = re.sub(r'[A-Z]', 'A', tmp_line)  # Upper chars replacement
        tmp_line = re.sub(r'[0-9]', '0', tmp_line)  # Numbers replacement
        tmp_line = re.sub(r'[^a-zA-Z0-9 ]', '#',
                          tmp_line)  # Special chars replacement
        line_stats = Counter(tmp_line)

        for stat_name, marker in (("lw_char", "a"), ("up_char", "A"),
                                  ("nb_char", "0"), ("sp_char", "#")):
            self.stats["orig"].set_stat(stat_name, line_stats[marker])
Beispiel #2
0
    def get_clean_stats(self):
        """Return the statistics of the clean line, computing them on first call.

        The result is stored in ``self.stats["clean"]`` and returned as-is on
        later calls.

        Returns:
            Statistics: Statistics of the clean line
        """
        already_computed = self.stats["clean"]
        if already_computed is not None:
            return already_computed

        # Register the (still empty) container before reading the clean line.
        self.stats["clean"] = Statistics(
            ["lw_char", "up_char", "nb_char", "sp_char"])

        # Map each character class onto one marker character; spaces are kept
        # untouched and are therefore not counted.
        masked = self.get_clean_line()
        for pattern, marker in (
                (r'[a-z]', 'a'),           # Lower chars
                (r'[A-Z]', 'A'),           # Upper chars
                (r'[0-9]', '0'),           # Numbers
                (r'[^a-zA-Z0-9 ]', '#'),   # Special chars
        ):
            masked = re.sub(pattern, marker, masked)
        marker_counts = Counter(masked)

        clean_stats = self.stats["clean"]
        for stat_name, marker in (("lw_char", "a"), ("up_char", "A"),
                                  ("nb_char", "0"), ("sp_char", "#")):
            clean_stats.set_stat(stat_name, marker_counts[marker])

        return clean_stats
Beispiel #3
0
    def __init__(self, fname):
        """Create an empty text holder for a file, with all statistics at zero.

        Args:
            fname: File name, stored unchanged in ``self.filename``.
        """
        stat_names = (
            "line_nb",
            "line_avg_length",
            "line_total_length",
            "word_avg_length",
            "word_total_length",
            "word_avg_nb",
            "word_total_nb",
        )

        self.filename = fname
        self.text = []  # Starts empty; nothing is read from the file here
        self.contains_training_data = False

        # Declare every statistic, then zero them in declaration order.
        self.stats = Statistics(list(stat_names))
        for stat_name in stat_names:
            self.stats.set_stat(stat_name, 0)