def load_input(input_file):
    """Read a tab-separated file and join its first column into one string.

    The file content is obtained from ``read``, stripped of surrounding
    whitespace and split on newlines. If the first line starts with ``#`` it
    is dropped. The first tab-separated field of every remaining line is
    then joined with single spaces.

    :param input_file: path handed straight to ``read``
    :return: the first-column fields joined by ``" "``; an empty string when
        the file is empty or holds only whitespace
    """
    lines = read(input_file).strip().split("\n")
    # startswith() instead of lines[0][0]: an empty file yields [""], and
    # indexing the first character of "" would raise IndexError.
    if lines[0].startswith("#"):
        lines = lines[1:]
    return u" ".join(line.split("\t")[0] for line in lines)
def raw_to_corpus():
    """Build the corpus from ``raw/acts.json`` located next to this module.

    The JSON file is parsed, every post goes through ``transform_post``,
    posts for which ``filter_post`` is falsy are dropped, each remaining
    post is converted with ``get_row``, and the resulting rows are passed to
    ``convert_to_corpus``.
    """
    source_path = join(dirname(__file__), "raw", "acts.json")
    raw_posts = json.loads(read(source_path))
    # Every stage is fully materialised before the next one begins, so the
    # helpers run stage by stage rather than interleaved per post.
    transformed = list(map(transform_post, raw_posts))
    accepted = [post for post in transformed if filter_post(post)]
    corpus_rows = list(map(get_row, accepted))
    convert_to_corpus(corpus_rows)
import json
from os.path import join
from pprint import pprint

from sklearn.metrics import confusion_matrix
from underthesea.util.file_io import read


def convert_cm_to_log(cm, labels, line=5):
    """Render a confusion matrix as a list of aligned text lines.

    :param cm: matrix object exposing ``tolist()`` (e.g. the array returned
        by ``confusion_matrix``); row ``i`` is labelled with ``labels[i]``
    :param labels: labels used for the header line and the row captions
    :param line: width, in characters, of every column
    :return: list of strings: a header line followed by one line per matrix
        row, each starting with the left-aligned row label
    """
    rendered = []
    for index, row in enumerate(cm.tolist()):
        # Build a real list of cells: concatenating a list with map() only
        # works on Python 2, where map() returns a list.
        cells = ["%-*s" % (line, labels[index])]
        cells.extend("%*d" % (line, count) for count in row)
        rendered.append(" ".join(cells))
    # The header is shifted by the caption column plus its separator space.
    title = " " * (line + 1) + " ".join(
        "%*s" % (line, label) for label in labels)
    # Return the formatted lines, not the raw integer matrix.
    rendered.insert(0, title)
    return rendered


# Alternative run:
# results = json.loads(read(join("logs", "20171006_153955", "result.json")))
results = json.loads(read(join("logs", "20171006_161437", "result.json")))
# NOTE(review): prints a literal 0 before the matrix; looks like a leftover
# debug marker -- confirm before removing.
print(0)
actual = results["actual"]
expected = results["expected"]
labels = list(set(expected).union(set(actual)))
# labels is passed by keyword: recent scikit-learn releases reject it as a
# positional argument.
cm = confusion_matrix(expected, actual, labels=labels)
cm = convert_cm_to_log(cm, labels)
pprint(cm, indent=2)
def load_output(filename):
    """Parse a tab-separated file into a list of tuples, one per line.

    The content returned by ``read`` is stripped of surrounding whitespace
    and split on newlines; every line is split on tabs and turned into a
    tuple of its fields.

    :param filename: path handed straight to ``read``
    :return: list of tuples of strings
    """
    stripped = read(filename).strip()
    return [tuple(record.split("\t")) for record in stripped.split("\n")]
def words(self):
    """Return the word list, loading it from ``self.data_file`` on demand.

    When ``self.words_data`` is already truthy it is returned as is.
    Otherwise the file is read, stripped, split on newlines, stored in
    ``self.words_data`` and returned.
    """
    if self.words_data:
        return self.words_data
    raw_text = read(self.data_file).strip()
    self.words_data = raw_text.split("\n")
    return self.words_data
def load_output(input_file):
    """Return the file's text without an optional leading ``#`` line.

    The content returned by ``read`` is stripped of surrounding whitespace
    and split on newlines. If the first line starts with ``#`` it is
    dropped; the remaining lines are joined back with newlines.

    :param input_file: path handed straight to ``read``
    :return: the remaining text; an empty string when the file is empty or
        holds only whitespace
    """
    lines = read(input_file).strip().split("\n")
    # startswith() instead of lines[0][0]: an empty file yields [""], and
    # indexing the first character of "" would raise IndexError.
    if lines[0].startswith("#"):
        lines = lines[1:]
    return "\n".join(lines)
def load_input(input_file):
    """Return the first line of the file.

    :param input_file: path handed straight to ``read``
    :return: everything before the first newline of the content (the whole
        content when it contains no newline)
    """
    first_line, _, _ = read(input_file).partition("\n")
    return first_line
def load_output(output_file):
    """Return the file's lines as a list of strings.

    The content returned by ``read`` is stripped of surrounding whitespace
    before being split on newlines.

    :param output_file: path handed straight to ``read``
    :return: list of lines
    """
    stripped = read(output_file).strip()
    output_lines = stripped.split("\n")
    return output_lines