def __init__(self, string, result=None):
    """Build a line from its raw text and compute its original statistics.

    Args:
        string: Raw text of the line; it is split into tokens with
            ``tokenize``.
        result: Optional value stored in ``self.result`` after conversion
            with ``int``. ``self.result`` stays ``None`` when omitted.
    """
    # Each token is a mutable triple:
    # [original token, token after clean_head_tail, None].
    self.tokens = [[tkn, clean_head_tail(tkn), None]
                   for tkn in tokenize(string)]

    # String containing the position of each token (e.g. "%0 %1%2 ... %n").
    # Tokens are located left to right in the untouched input, so a token
    # can never match inside a placeholder written for an earlier token
    # (with chained str.replace, "x 0" became "%%1 0" instead of "%0 %1").
    # NOTE(review): assumes tokenize yields tokens in order of appearance
    # -- confirm against its implementation.
    pieces = []
    cursor = 0
    for index, token in enumerate(self.tokens):
        start = string.find(token[0], cursor)
        if start == -1:
            # Token text not present in the remaining input: nothing to
            # substitute, leave the text as it is.
            continue
        pieces.append(string[cursor:start])
        pieces.append("%" + str(index))
        cursor = start + len(token[0])
    pieces.append(string[cursor:])
    self.pos_string = "".join(pieces)

    self.result = None
    if result is not None:
        self.result = int(result)

    # Grade is 0 when no token has any character left after cleaning,
    # 3 otherwise.
    has_clean_text = any(t[1] is not None and len(t[1]) > 0
                         for t in self.tokens)
    self.grade = 3 if has_clean_text else 0

    # "clean" statistics are left unset here (None).
    self.stats = {
        "orig": Statistics(["lw_char", "up_char", "nb_char", "sp_char"]),
        "clean": None
    }

    # Collapse every character class to one representative symbol so a
    # single Counter pass yields the per-class totals. Spaces are left
    # as they are and therefore counted in none of the four classes.
    tmp_line = re.sub(r'[a-z]', 'a', self.get_orig_line())  # Lower chars
    tmp_line = re.sub(r'[A-Z]', 'A', tmp_line)  # Upper chars
    tmp_line = re.sub(r'[0-9]', '0', tmp_line)  # Numbers
    tmp_line = re.sub(r'[^a-zA-Z0-9 ]', '#', tmp_line)  # Special chars
    line_stats = Counter(tmp_line)

    self.stats["orig"].set_stat("lw_char", line_stats["a"])
    self.stats["orig"].set_stat("up_char", line_stats["A"])
    self.stats["orig"].set_stat("nb_char", line_stats["0"])
    self.stats["orig"].set_stat("sp_char", line_stats["#"])
def get_clean_stats(self):
    """Get clean stats of the line

    The statistics are computed on the first call and stored in
    ``self.stats["clean"]``; later calls return the stored object.

    Returns:
        Statistics: Statistics of the clean line
    """
    cached = self.stats["clean"]
    if cached is not None:
        # Already computed by a previous call.
        return cached

    clean_stats = Statistics(["lw_char", "up_char", "nb_char", "sp_char"])
    self.stats["clean"] = clean_stats

    # Map each character class onto a single symbol, in this order:
    # lower case, upper case, digits, then everything that is neither
    # alphanumeric nor a space.
    substitutions = (
        (r'[a-z]', 'a'),
        (r'[A-Z]', 'A'),
        (r'[0-9]', '0'),
        (r'[^a-zA-Z0-9 ]', '#'),
    )
    masked_line = self.get_clean_line()
    for pattern, symbol in substitutions:
        masked_line = re.sub(pattern, symbol, masked_line)

    symbol_counts = Counter(masked_line)

    # One stat per class symbol.
    for stat_name, symbol in (("lw_char", "a"), ("up_char", "A"),
                              ("nb_char", "0"), ("sp_char", "#")):
        clean_stats.set_stat(stat_name, symbol_counts[symbol])

    return self.stats["clean"]
def __init__(self, fname):
    """Set up an empty text for the given file name.

    Args:
        fname: File name, stored as-is in ``self.filename``.
    """
    # Names of the tracked statistics, in the order they are registered.
    stat_names = (
        "line_nb",
        "line_avg_length",
        "line_total_length",
        "word_avg_length",
        "word_total_length",
        "word_avg_nb",
        "word_total_nb",
    )

    self.filename = fname
    self.text = []
    self.contains_training_data = False

    self.stats = Statistics(list(stat_names))
    # Every statistic starts at zero.
    for stat_name in stat_names:
        self.stats.set_stat(stat_name, 0)