def get_clean_stats(self):
    """Return the character statistics of the clean line

    The statistics are computed on the first call and stored in
    ``self.stats["clean"]``; later calls return the stored object.

    Returns:
        Statistics: Statistics of the clean line
    """
    if self.stats["clean"] is not None:
        # Already computed by a previous call
        return self.stats["clean"]

    clean_stats = Statistics(["lw_char", "up_char", "nb_char", "sp_char"])
    self.stats["clean"] = clean_stats

    # Collapse every character class onto a single marker character so that
    # one Counter pass gives the size of each class. Order matters: the
    # special-chars pattern must run last, once letters and digits have
    # already been turned into 'a', 'A' and '0'.
    substitutions = (
        (r'[a-z]', 'a'),  # Lower chars replacement
        (r'[A-Z]', 'A'),  # Upper chars replacement
        (r'[0-9]', '0'),  # Numbers replacement
        (r'[^a-zA-Z0-9 ]', '#'),  # Special chars replacement
    )
    masked = self.get_clean_line()
    for pattern, marker in substitutions:
        masked = re.sub(pattern, marker, masked)

    marker_count = Counter(masked)
    for stat_name, marker in (("lw_char", "a"), ("up_char", "A"),
                              ("nb_char", "0"), ("sp_char", "#")):
        clean_stats.set_stat(stat_name, marker_count[marker])

    return self.stats["clean"]
def __init__(self, string, result=None):
    """Build a line from its raw text and compute its original statistics

    Args:
        string (str): Raw text of the line.
        result (optional): Expected rating of the line. Converted with
            ``int()`` when given, otherwise ``self.result`` stays ``None``.
    """
    # Every token is a 3-slot record: [original token, token cleaned by
    # clean_head_tail, third slot initialised to None].
    self.tokens = [[tkn, clean_head_tail(tkn), None] for tkn in tokenize(string)]

    # String containing the position of each token (e.g. "%0 %1%2 ... %n").
    # It is built by scanning the ORIGINAL string from left to right rather
    # than by repeatedly calling replace() on the partly rewritten string:
    # with replace(), a later token such as "0" or "%" could match inside a
    # placeholder inserted earlier ("a 1 0" ended up as "%%2 %1 0").
    # NOTE(review): assumes tokenize() yields tokens in their order of
    # appearance in `string` - confirm against tokenize().
    chunks = []
    cursor = 0
    for index, token in enumerate(self.tokens):
        start = string.find(token[0], cursor)
        if start < 0:
            # Token text not found after the previous token: no placeholder
            # is emitted for it, the surrounding text is kept untouched.
            continue
        chunks.append(string[cursor:start])
        chunks.append("%" + str(index))
        cursor = start + len(token[0])
    chunks.append(string[cursor:])
    self.pos_string = "".join(chunks)

    self.result = None if result is None else int(result)

    # Grade 0 when cleaning left no character in any token, 3 otherwise.
    cleaned_length = sum(len(t[1]) for t in self.tokens if t[1] is not None)
    self.grade = 0 if cleaned_length == 0 else 3

    self.stats = {
        "orig": Statistics(["lw_char", "up_char", "nb_char", "sp_char"]),
        "clean": None  # Filled later, not computed in this constructor
    }

    # Collapse every character class onto one marker character, then count
    # the markers. The special-chars pattern must run last.
    tmp_line = self.get_orig_line()
    for pattern, marker in ((r'[a-z]', 'a'),            # Lower chars replacement
                            (r'[A-Z]', 'A'),            # Upper chars replacement
                            (r'[0-9]', '0'),            # Numbers replacement
                            (r'[^a-zA-Z0-9 ]', '#')):   # Special chars replacement
        tmp_line = re.sub(pattern, marker, tmp_line)

    line_stats = Counter(tmp_line)
    for stat_name, marker in (("lw_char", "a"), ("up_char", "A"),
                              ("nb_char", "0"), ("sp_char", "#")):
        self.stats["orig"].set_stat(stat_name, line_stats[marker])
def __init__(self, fname):
    """Create an empty text bound to a file path

    Nothing is read here; every statistic is created and set to 0.

    Args:
        fname (str): Path to the file.
    """
    self.filename = fname
    self.text = []
    self.contains_training_data = False

    # Declared once, used both to create the Statistics object and to
    # zero every entry (in the same order).
    stat_names = (
        "line_nb",
        "line_avg_length",
        "line_total_length",
        "word_avg_length",
        "word_total_length",
        "word_avg_nb",
        "word_total_nb",
    )
    self.stats = Statistics(list(stat_names))
    for stat_name in stat_names:
        self.stats.set_stat(stat_name, 0)
def __init__(self, fname):
    """Initialise an empty text for the given file

    The file is not opened here. All statistics start at 0.

    Args:
        fname (str): Path to the file.
    """
    self.filename = fname
    self.text = []
    self.contains_training_data = False

    # Line-related counters first, then word-related ones; the combined
    # order is the order in which the entries are declared and reset.
    line_stats = ["line_nb", "line_avg_length", "line_total_length"]
    word_stats = ["word_avg_length", "word_total_length", "word_avg_nb", "word_total_nb"]
    all_stats = line_stats + word_stats

    self.stats = Statistics(list(all_stats))
    for stat_name in all_stats:
        self.stats.set_stat(stat_name, 0)
class Text(object):
    """Stores the text from a filename given in parameters

    Args:
        fname (str): Path to the file.

    Attributes:
        filename (:func:`str`): Name of the file.
        text (:func:`list`): List of paragraphs. Every paragraph is a list of :class:`.Line`.
        contains_training_data (:func:`bool`): Set to True by :meth:`read_csv`,
            to False by :meth:`read_txt`.
        stats (:class:`.Statistics`): Statistics object.
    """

    # Names of every statistic tracked for a text, all initialised to 0.
    _STAT_NAMES = (
        "line_nb",
        "line_avg_length",
        "line_total_length",
        "word_avg_length",
        "word_total_length",
        "word_avg_nb",
        "word_total_nb",
    )

    def __init__(self, fname):
        self.filename = fname
        self.text = []
        self.contains_training_data = False

        self.stats = Statistics(list(self._STAT_NAMES))
        for stat_name in self._STAT_NAMES:
            self.stats.set_stat(stat_name, 0)

    @staticmethod
    def _ratio(numerator, denominator):
        """Return ``numerator / denominator`` as a float, or 0 if the denominator is 0

        The explicit float conversion keeps the result a true division even
        where ``/`` between two integers would truncate.
        """
        if denominator == 0:
            return 0
        return float(numerator) / denominator

    def _increase_stat(self, name, amount):
        """Add ``amount`` to the statistic called ``name``"""
        self.stats.set_stat(name, self.stats.get_stat(name) + amount)

    def _flush_paragraph(self, paragraph):
        """Store ``paragraph`` in the text if it is not empty

        Returns:
            list: The list in which the next paragraph must be collected
        """
        if paragraph:
            self.text.append(paragraph)
            return []
        return paragraph

    def _register_line(self, paragraph, line_object):
        """Append a line to the current paragraph and update the running totals"""
        paragraph.append(line_object)

        self._increase_stat("line_nb", 1)
        self._increase_stat("line_total_length", len(line_object))
        self._increase_stat("word_total_nb", len(line_object.tokens))
        # Each token is a record whose first slot holds the token text, so
        # the word length is len(tkn[0]); len(tkn) would only give the
        # (constant) number of slots of the record.
        # NOTE(review): relies on Line.tokens being [text, clean text, ...]
        # records - confirm against the Line class.
        self._increase_stat("word_total_length",
                            sum(len(tkn[0]) for tkn in line_object.tokens))

    def _compute_averages(self):
        """Derive the average statistics from the totals (0 when nothing was read)"""
        line_nb = self.stats.get_stat("line_nb")
        word_nb = self.stats.get_stat("word_total_nb")

        self.stats.set_stat(
            "line_avg_length",
            self._ratio(self.stats.get_stat("line_total_length"), line_nb))
        self.stats.set_stat(
            "word_avg_length",
            self._ratio(self.stats.get_stat("word_total_length"), word_nb))
        self.stats.set_stat("word_avg_nb", self._ratio(word_nb, line_nb))

    def _collect_lines(self, is_selected, render):
        """Gather the rendering of every selected line, paragraph by paragraph

        An empty string is appended after each paragraph as a separator,
        unless the list is still empty or already ends with an empty string.

        Returns:
            list: List of rendered lines and paragraph separators
        """
        lines = []
        for paragraph in self.text:
            lines.extend(render(line) for line in paragraph if is_selected(line))
            if lines and lines[-1] != "":
                lines.append("")
        return lines

    def read_csv(self):
        """Read a CSV file and fill this text object with its content

        Rows must have exactly two columns: the text of the line and its
        rating. Any other row, or a row whose text is blank, ends the current
        paragraph. The object is updated in place; nothing is returned.
        """
        self.contains_training_data = True

        with open(self.filename, "r") as f:
            paragraph = []
            for row in csv.reader(f):
                if len(row) != 2:
                    paragraph = self._flush_paragraph(paragraph)
                    continue

                cell = row[0]
                if isinstance(cell, bytes):
                    # Byte strings are decoded; text strings are used as is
                    cell = cell.decode("utf-8")
                line = cell.strip(" \t\r\n")

                if len(line) == 0:
                    paragraph = self._flush_paragraph(paragraph)
                    continue

                self._register_line(paragraph, Line(line, row[1]))

        self._flush_paragraph(paragraph)
        self._compute_averages()

        logging.debug(self.filename + " read")

    def read_txt(self):
        """Read a text file and fill this text object with its content

        The file is decoded as UTF-8. A blank line ends the current
        paragraph. The object is updated in place; nothing is returned.
        """
        self.contains_training_data = False

        with codecs.open(self.filename, "rb", encoding="utf-8") as f:
            paragraph = []
            for line in f:
                line = line.strip(" \t\r\n")

                if len(line) == 0:
                    paragraph = self._flush_paragraph(paragraph)
                    continue

                self._register_line(paragraph, Line(line))

        self._flush_paragraph(paragraph)
        self._compute_averages()

        logging.debug(self.filename + " read")

    def get_clean_lines(self):
        """Returns clean lines (grade 5) from the text object

        Returns:
            list: List of clean lines
        """
        return self._collect_lines(lambda line: line.grade == 5,
                                   lambda line: line.get_clean_line())

    def get_garbage_lines(self):
        """Returns garbage lines (grade 0) from the text object

        Returns:
            list: List of garbage lines
        """
        return self._collect_lines(lambda line: line.grade == 0,
                                   lambda line: line.get_orig_line())

    def get_unclassified_lines(self):
        """Returns unclassified lines from the text object

        Returns:
            list: List of unclassified lines
        """
        # Grade is not a multiple of 5 (neither 0 nor 5)
        return self._collect_lines(lambda line: line.grade % 5 != 0,
                                   lambda line: line.get_orig_line())

    def retrieve_text_score(self):
        """Returns some stats and score regarding classification

        Lines whose grade is neither 0 nor 5 are counted as unclassified;
        graded lines whose result is None or negative are counted as unrated.
        Neither group takes part in the score.

        Returns:
            dict: Dictionary containing the results, with keys ``"class"``
            (line counts), ``"score"`` (precision, recall, f1) and ``"raw"``
            (FP/TP/FN/TN counts)
        """
        # True positive is a garbage string detected as such
        score_stats = {"FP": 0, "TP": 0, "FN": 0, "TN": 0}
        class_stats = {"classified": 0, "unclassified": 0, "unrated": 0}

        for paragraph in self.text:
            for line in paragraph:
                if line.grade not in (0, 5):
                    class_stats["unclassified"] += 1
                    continue

                if line.result is None or line.result < 0:
                    class_stats["unrated"] += 1
                    continue

                class_stats["classified"] += 1

                # A result of 1 means the line is clean, anything else garbage
                if line.grade == 0:  # Line detected as garbage
                    outcome = "FP" if line.result == 1 else "TP"
                else:  # Line detected as clean
                    outcome = "TN" if line.result == 1 else "FN"
                score_stats[outcome] += 1

        # Each score falls back to 0 when its denominator is 0
        precision = self._ratio(score_stats["TP"],
                                score_stats["TP"] + score_stats["FP"])
        recall = self._ratio(score_stats["TP"],
                             score_stats["TP"] + score_stats["FN"])
        f1 = self._ratio(2 * precision * recall, precision + recall)

        return {
            "class": class_stats,
            "score": {
                "precision": precision,
                "recall": recall,
                "f1": f1
            },
            "raw": score_stats
        }
class Text(object):
    """Stores the text from a filename given in parameters

    Args:
        fname (str): Path to the file.

    Attributes:
        filename (:func:`str`): Name of the file.
        text (:func:`list`): List of paragraphs. Every paragraph is a list of :class:`.Line`.
        contains_training_data (:func:`bool`): Set to True by :meth:`read_csv`,
            to False by :meth:`read_txt`.
        stats (:class:`.Statistics`): Statistics object.
    """

    # Names of every statistic tracked for a text, all initialised to 0.
    _STAT_NAMES = (
        "line_nb",
        "line_avg_length",
        "line_total_length",
        "word_avg_length",
        "word_total_length",
        "word_avg_nb",
        "word_total_nb",
    )

    def __init__(self, fname):
        self.filename = fname
        self.text = []
        self.contains_training_data = False

        self.stats = Statistics(list(self._STAT_NAMES))
        for stat_name in self._STAT_NAMES:
            self.stats.set_stat(stat_name, 0)

    @staticmethod
    def _ratio(numerator, denominator):
        """Return ``numerator / denominator`` as a float, or 0 if the denominator is 0

        The explicit float conversion keeps the result a true division even
        where ``/`` between two integers would truncate.
        """
        if denominator == 0:
            return 0
        return float(numerator) / denominator

    def _increase_stat(self, name, amount):
        """Add ``amount`` to the statistic called ``name``"""
        self.stats.set_stat(name, self.stats.get_stat(name) + amount)

    def _flush_paragraph(self, paragraph):
        """Store ``paragraph`` in the text if it is not empty

        Returns:
            list: The list in which the next paragraph must be collected
        """
        if paragraph:
            self.text.append(paragraph)
            return []
        return paragraph

    def _register_line(self, paragraph, line_object):
        """Append a line to the current paragraph and update the running totals"""
        paragraph.append(line_object)

        self._increase_stat("line_nb", 1)
        self._increase_stat("line_total_length", len(line_object))
        self._increase_stat("word_total_nb", len(line_object.tokens))
        # Each token is a record whose first slot holds the token text, so
        # the word length is len(tkn[0]); len(tkn) would only give the
        # (constant) number of slots of the record.
        # NOTE(review): relies on Line.tokens being [text, clean text, ...]
        # records - confirm against the Line class.
        self._increase_stat("word_total_length",
                            sum(len(tkn[0]) for tkn in line_object.tokens))

    def _compute_averages(self):
        """Derive the average statistics from the totals (0 when nothing was read)"""
        line_nb = self.stats.get_stat("line_nb")
        word_nb = self.stats.get_stat("word_total_nb")

        self.stats.set_stat(
            "line_avg_length",
            self._ratio(self.stats.get_stat("line_total_length"), line_nb))
        self.stats.set_stat(
            "word_avg_length",
            self._ratio(self.stats.get_stat("word_total_length"), word_nb))
        self.stats.set_stat("word_avg_nb", self._ratio(word_nb, line_nb))

    def _collect_lines(self, is_selected, render):
        """Gather the rendering of every selected line, paragraph by paragraph

        An empty string is appended after each paragraph as a separator,
        unless the list is still empty or already ends with an empty string.

        Returns:
            list: List of rendered lines and paragraph separators
        """
        lines = []
        for paragraph in self.text:
            lines.extend(render(line) for line in paragraph if is_selected(line))
            if lines and lines[-1] != "":
                lines.append("")
        return lines

    def read_csv(self):
        """Read a CSV file and fill this text object with its content

        Rows must have exactly two columns: the text of the line and its
        rating. Any other row, or a row whose text is blank, ends the current
        paragraph. The object is updated in place; nothing is returned.
        """
        self.contains_training_data = True

        with open(self.filename, "r") as f:
            paragraph = []
            for row in csv.reader(f):
                if len(row) != 2:
                    paragraph = self._flush_paragraph(paragraph)
                    continue

                cell = row[0]
                if isinstance(cell, bytes):
                    # Byte strings are decoded; text strings are used as is
                    cell = cell.decode("utf-8")
                line = cell.strip(" \t\r\n")

                if len(line) == 0:
                    paragraph = self._flush_paragraph(paragraph)
                    continue

                self._register_line(paragraph, Line(line, row[1]))

        self._flush_paragraph(paragraph)
        self._compute_averages()

        logging.debug(self.filename+" read")

    def read_txt(self):
        """Read a text file and fill this text object with its content

        The file is decoded as UTF-8. A blank line ends the current
        paragraph. The object is updated in place; nothing is returned.
        """
        self.contains_training_data = False

        with codecs.open(self.filename, "rb", encoding="utf-8") as f:
            paragraph = []
            for line in f:
                line = line.strip(" \t\r\n")

                if len(line) == 0:
                    paragraph = self._flush_paragraph(paragraph)
                    continue

                self._register_line(paragraph, Line(line))

        self._flush_paragraph(paragraph)
        self._compute_averages()

        logging.debug(self.filename+" read")

    def get_clean_lines(self):
        """Returns clean lines (grade 5) from the text object

        Returns:
            list: List of clean lines
        """
        return self._collect_lines(lambda line: line.grade == 5,
                                   lambda line: line.get_clean_line())

    def get_garbage_lines(self):
        """Returns garbage lines (grade 0) from the text object

        Returns:
            list: List of garbage lines
        """
        return self._collect_lines(lambda line: line.grade == 0,
                                   lambda line: line.get_orig_line())

    def get_unclassified_lines(self):
        """Returns unclassified lines from the text object

        Returns:
            list: List of unclassified lines
        """
        # Grade is not a multiple of 5 (neither 0 nor 5)
        return self._collect_lines(lambda line: line.grade % 5 != 0,
                                   lambda line: line.get_orig_line())

    def retrieve_text_score(self):
        """Returns some stats and score regarding classification

        Lines whose grade is neither 0 nor 5 are counted as unclassified;
        graded lines whose result is None or negative are counted as unrated.
        Neither group takes part in the score.

        Returns:
            dict: Dictionary containing the results, with keys ``"class"``
            (line counts), ``"score"`` (precision, recall, f1) and ``"raw"``
            (FP/TP/FN/TN counts)
        """
        # True positive is a garbage string detected as such
        score_stats = {"FP": 0, "TP": 0, "FN": 0, "TN": 0}
        class_stats = {"classified": 0, "unclassified": 0, "unrated": 0}

        for paragraph in self.text:
            for line in paragraph:
                if line.grade not in (0, 5):
                    class_stats["unclassified"] += 1
                    continue

                if line.result is None or line.result < 0:
                    class_stats["unrated"] += 1
                    continue

                class_stats["classified"] += 1

                # A result of 1 means the line is clean, anything else garbage
                if line.grade == 0:  # Line detected as garbage
                    outcome = "FP" if line.result == 1 else "TP"
                else:  # Line detected as clean
                    outcome = "TN" if line.result == 1 else "FN"
                score_stats[outcome] += 1

        # Each score falls back to 0 when its denominator is 0
        precision = self._ratio(score_stats["TP"],
                                score_stats["TP"] + score_stats["FP"])
        recall = self._ratio(score_stats["TP"],
                             score_stats["TP"] + score_stats["FN"])
        f1 = self._ratio(2 * precision * recall, precision + recall)

        return {
            "class": class_stats,
            "score": {
                "precision": precision,
                "recall": recall,
                "f1": f1
            },
            "raw": score_stats
        }