def data_summary():
    """Print and persist token-length statistics over all dataset splits.

    Loads the train/dev/test splits via ``DataProcessor``, computes the
    whitespace-token count of every example, writes a human-readable
    summary to ``output/summary.txt`` and the raw per-example length
    array to ``output/length.npy``.
    """
    data_dir = os.path.join("data/tsv/cpu")
    data_processor = DataProcessor(data_dir)
    # Bug fix: the original crashed with FileNotFoundError when the
    # "output" directory did not already exist.
    os.makedirs("output", exist_ok=True)
    with open("output/summary.txt", "w") as fw:
        train_examples = data_processor.get_examples("train")
        dev_examples = data_processor.get_examples("dev")
        test_examples = data_processor.get_examples("test")
        texts = (train_examples["text"] + dev_examples["text"]
                 + test_examples["text"])
        # Token count per example; plain whitespace split — presumably this
        # matches the tokenization used elsewhere in the project (verify).
        length = [len(l.split()) for l in texts]
        max_len = np.max(length)
        min_len = np.min(length)
        median_len = np.median(length)
        num_words = sum(length)
        num_train = len(train_examples["text"])
        num_dev = len(dev_examples["text"])
        num_test = len(test_examples["text"])
        num_total = num_train + num_dev + num_test
        # Format string kept byte-identical to preserve the on-disk report.
        output = "total: %s\ntrain set: %s\ndev set:%s\ntest set:%s\n" \
                 "number of tokens:%s\nmax len:%s\nmin len:%s\nmedian len:%s\n" % (
                     num_total, num_train, num_dev, num_test, num_words,
                     max_len, min_len, median_len)
        print(output)
        fw.write(output)
        length = np.array(length)
        np.save("output/length.npy", length)
def __init__(self, output_dir, data_dir, item):
    """Load saved predictions and gold labels for one dataset *item*.

    Parameters
    ----------
    output_dir : str
        Root directory holding per-item outputs (``predictions.npy``).
    data_dir : str
        Root directory holding per-item data readable by ``DataProcessor``.
    item : str
        Sub-directory name selecting the dataset to evaluate.

    Missing or unreadable files are tolerated: the corresponding
    arrays fall back to empty so callers can detect "nothing to score".
    """
    data_dir = os.path.join(data_dir, item)
    output_dir = os.path.join(output_dir, item)
    try:
        predictions = np.load(os.path.join(output_dir, "predictions.npy"))
        y_pred = np.array(predictions).astype(np.float32)
    # Bug fix: the original bare "except:" also swallowed
    # KeyboardInterrupt/SystemExit; narrow to Exception.
    except Exception:
        y_pred = []
    try:
        data_processor = DataProcessor(data_dir)
        test_examples = data_processor.get_examples("test")
        y_true = np.array(test_examples["label"]).astype(np.int64)
    except Exception:
        y_true = []
        test_examples = {"text": [], "label": []}
    self.y_true = y_true
    self.y_preds = y_pred
    # NOTE(review): "test_eamples" is a typo, kept so existing callers
    # keep working; the correctly spelled alias is added for new code.
    self.test_eamples = test_examples
    self.test_examples = test_examples
    self.num_classes = len(set(y_true))