def test(loader, models, criterion, class_mask, logger, epoch):
    """Evaluate an ensemble of models on ``loader`` with majority voting.

    For every batch, each model is put in eval mode and run on the inputs
    (converted to half precision and moved to CUDA). The loss of every model
    is accumulated, and two arg-max predictions are recorded per model: one
    over the softmax probabilities and one over those probabilities
    multiplied by ``class_mask[labels]``. The per-model predictions are
    stacked along dimension 0, reduced with ``majority_vote`` and compared
    against the labels.

    Args:
        loader: Iterable yielding ``(inputs, labels)`` batches.
        models: Iterable of models; each must provide ``eval()`` and be
            callable on a batch of inputs.
        criterion: Loss function called as ``criterion(outputs, labels)``.
        class_mask: Object indexed with ``labels``; the result is multiplied
            with the softmax output (presumably a per-class mask tensor on
            the same device -- confirm with the caller).
        logger: Object whose ``info`` method receives the summary line.
        epoch: Value shown in the summary line.

    Returns:
        Tuple ``(accuracy.avg, task_accuracy.avg)`` read from the two
        accuracy meters.
    """
    loss_meter = AverageMeter()
    time_meter = AverageMeter()
    acc_meter = AverageMeter()
    task_acc_meter = AverageMeter()

    with torch.no_grad():
        start = time.time()
        for inputs, labels in loader:
            inputs = inputs.half().cuda(non_blocking=True)
            labels = labels.cuda(non_blocking=True)

            for idx, model in enumerate(models):
                model.eval()
                outputs = model(inputs)
                batch_loss = criterion(outputs, labels)
                loss_meter.update(batch_loss.data, inputs.size(0))

                # One row of predictions per model, unmasked and masked.
                prob = torch.nn.functional.softmax(outputs, dim=1)
                pred = torch.argmax(prob, dim=1).unsqueeze(0)
                masked_prob = class_mask[labels] * prob
                mask_pred = torch.argmax(masked_prob, dim=1).unsqueeze(0)

                if idx:
                    preds = torch.cat((preds, pred), dim=0)
                    mask_preds = torch.cat((mask_preds, mask_pred), dim=0)
                else:
                    preds, mask_preds = pred, mask_pred

            # Combine the per-model rows into a single prediction per sample.
            voted = majority_vote(preds)
            voted_masked = majority_vote(mask_preds)

            hits = torch.eq(voted.cuda(), labels.cuda()) * 1.0
            masked_hits = torch.eq(voted_masked.cuda(), labels.cuda()) * 1.0
            acc_meter.update(hits.mean(), labels.size(0))
            task_acc_meter.update(masked_hits.mean(), labels.size(0))

            time_meter.update(time.time() - start)
            start = time.time()

    summary = '==> Test: [{0}]\tTime:{batch_time.sum:.4f}\tLoss:{losses.avg:.4f}\tAcc:{acc.avg:.4f}\tTask Acc:{task_acc.avg:.4f}\t'
    logger.info(
        summary.format(epoch,
                       batch_time=time_meter,
                       losses=loss_meter,
                       acc=acc_meter,
                       task_acc=task_acc_meter))
    return acc_meter.avg, task_acc_meter.avg
def __init__(self, table, header):
    """Build a tree node, and recursively its subtree, from ``table``.

    The last element of every row is treated as the class value. The node
    becomes a leaf when all rows share one class, or when ``max_gain``
    returns -1 (in which case the leaf class is ``u.majority_vote(table)``).
    Otherwise the node splits on the index returned by ``max_gain`` and one
    child ``TreeNode`` is created per distinct value in that column.

    Args:
        table: Sequence of rows; each row is indexable and ends with its
            class value.
        header: Passed through to ``max_gain`` and to child nodes.
    """
    self.table = table
    self.header = header
    self.branches = {}
    self.node_type = None
    self.split_index = None
    self.leaf_class = None

    distinct_classes = u.unique([row[-1] for row in table])

    # Pure node: every row carries the same class.
    if len(distinct_classes) == 1:
        self.node_type = LEAF
        self.leaf_class = distinct_classes[0]
        return

    self.split_index = max_gain(table, header)

    # No split index available: fall back to the majority class.
    if self.split_index == -1:
        self.node_type = LEAF
        self.leaf_class = u.majority_vote(table)
        return

    split_vals = u.unique(table, col=self.split_index)
    self.node_type = SPLIT
    for value in split_vals:
        subset = [row for row in table if row[self.split_index] == value]
        self.branches[value] = TreeNode(subset, header)
def evaluate_tpr_indices(models, corpus, topk):
    """Evaluate document embeddings with TPR@``topk``.

    For each document in ``corpus`` a vector is inferred, its ``topk`` most
    similar documents are looked up, and the label is predicted by a majority
    vote over the neighbours' labels. Per-label and overall hit rates are
    logged; nothing is returned.

    A label is the part of a tag that precedes the first tab character: the
    true label comes from ``doc.tags[0]``, neighbour labels from the tags
    returned by ``most_similar``.

    :params:
        models (dict or iterable) : either a dict ``{name: model}`` or a
                          plain iterable of models. Each model must provide
                          ``infer_vector`` and ``docvecs.most_similar``
                          (presumably gensim ``Doc2Vec`` models).
        corpus (list) : documents exposing at least the ``words`` and
                          ``tags`` attributes
        topk (int) : how many nearest documents take part in the vote
    """
    logger.info("Starting evaluation process - TPR@{}\n".format(topk))

    # Accept both the documented dict form and a bare iterable of models.
    # Iterating a dict directly would yield its keys (the names), not models.
    if isinstance(models, dict):
        named_models = list(models.items())
    else:
        named_models = [(str(model), model) for model in models]

    tl_list = [doc.tags[0].split('\t')[0] for doc in corpus]
    # Sorted so that the label -> index mapping is the same on every run.
    unique_names = sorted(set(tl_list))
    label2index = {name: i for i, name in enumerate(unique_names, start=1)}
    index2label = {index: name for name, index in label2index.items()}
    true_labels = np.asarray([label2index[name] for name in tl_list])
    labels_count = Counter(true_labels)

    for name, model in named_models:
        logger.info("\nEvaluating `{}`".format(name))
        logger.info("Inferring vectors for {} documents".format(len(corpus)))

        inferred_vectors = [model.infer_vector(doc.words) for doc in corpus]
        most_similars = [
            model.docvecs.most_similar([inf_vec], topn=topk)
            for inf_vec in inferred_vectors
        ]
        all_votes = [[tag.split('\t')[0] for tag, _score in neighbours]
                     for neighbours in most_similars]
        # The first element of majority_vote's result is used as the label.
        predicted = np.asarray(
            [label2index[majority_vote(votes)[0]] for votes in all_votes])

        scores = {}
        for label, count in labels_count.items():
            # Hits for this label: predicted it and it was the true label.
            hits = np.sum((predicted == label) & (true_labels == label))
            scores[index2label[label]] = hits / count

        logger.info("TPR@{} : {}".format(topk, scores))
        logger.info("Average TPR@{} : {}".format(
            topk, np.sum(predicted == true_labels) / len(true_labels)))
def classify(self, instance):
    '''
    Classify ``instance`` with every tree in ``self.forest_list`` and return
    the result of ``utils.majority_vote`` over those predictions.

    Each prediction is wrapped in its own one-element list before being
    handed to ``utils.majority_vote``.
    '''
    ballots = [[entry["tree"].classify(instance)]
               for entry in self.forest_list]
    return utils.majority_vote(ballots)
def classify(self, instance):
    '''
    Return the class this (sub)tree assigns to ``instance``.

    A leaf answers with its stored ``leaf_class``. A split node looks up the
    instance's value at ``split_index`` among its branches and delegates to
    the matching child; when no branch exists for that value it falls back to
    ``utils.majority_vote`` over this node's own table.
    '''
    if self.node_type == LEAF:
        return self.leaf_class

    child = self.branches.get(instance[self.split_index])
    if child is None:
        # Attribute value not seen when this node was built.
        return utils.majority_vote(self.table)
    return child.classify(instance)
def __init__(self, table, header, first=True, full_table=None):
    """Build a tree node, and recursively its subtree, from ``table``.

    The last element of every row is treated as the class value. The node
    becomes a leaf when all rows share one class, or when ``max_gain``
    returns -1 (leaf class is then ``utils.majority_vote(table)``).
    Otherwise it splits on the index returned by ``max_gain`` and creates
    one child ``TreeNode`` per distinct value of that column.

    Args:
        table: Sequence of rows; each row is indexable and ends with its
            class value.
        header: Column header; all entries but the last are treated as
            attributes. Passed through to ``max_gain`` and to child nodes.
        first: True for the root call. Children are created with
            ``first=False``.
        full_table: Per-attribute lists of unique values. Rebuilt from
            ``table`` when ``first`` is true and forwarded unchanged to the
            children; this constructor does not otherwise read it.
    """
    self.branches = {}
    self.table = table
    self.node_type = None
    self.split_index = None
    self.leaf_class = None
    self.header = header

    # Root call: collect the distinct values of every attribute column.
    if first:
        full_table = [
            utils.unique([row[att_index] for row in table])
            for att_index in range(len(header[:-1]))
        ]

    class_values = utils.unique([row[-1] for row in table])

    # Only one class present: this node is a leaf.
    if len(class_values) == 1:
        # NOTE(review): result is unused; call kept in case it has effects.
        utils.unique_table(self.table)
        self.node_type = LEAF
        self.leaf_class = class_values[0]
        return

    self.split_index = max_gain(table, header)

    # -1 from max_gain means there is no split: make a majority-class leaf.
    if self.split_index == -1:
        self.node_type = LEAF
        self.leaf_class = utils.majority_vote(table)
        # NOTE(review): result is unused; call kept in case it has effects.
        utils.unique_table(self.table)
        return

    # Split on the chosen column and recurse once per distinct value.
    split_vals = utils.unique(table, col=self.split_index)
    self.node_type = SPLIT
    for value in split_vals:
        subset = [row for row in table if row[self.split_index] == value]
        self.branches[value] = TreeNode(
            subset, header, first=False, full_table=full_table)
def random_forest(N, M, F, table, attr_indexes, attr_domains, class_index,
                  strat_index):
    """Train ``N`` trees, keep the ``M`` best and report test accuracy.

    ``table`` is shuffled in place and split with
    ``test_remainder_stratified``. For each of the ``N`` trees a bootstrap
    sample is drawn from the remainder; the remainder rows that are not in
    the sample form that tree's validation set. Trees are ranked by their
    validation accuracy and the ``M`` highest-ranked ones vote (via
    ``utils.majority_vote``) on every test instance. Accuracy is printed per
    group returned by ``utils.groupBy(test, 1)`` and overall.

    Args:
        N: Number of trees to train.
        M: Number of best trees used for voting. When ``M`` exceeds the
            number of trained trees, all trained trees are used.
        F: Forwarded to ``utils.rand_attributes`` (presumably the size of
            each random attribute subset -- confirm against ``utils``).
        table: List of rows. NOTE: shuffled in place.
        attr_indexes: Candidate attribute indexes to sample from.
        attr_domains: Forwarded to ``train_test_tree``.
        class_index: Index of the class value within a row.
        strat_index: Forwarded to ``test_remainder_stratified``.

    Returns:
        List of ``[group_key, accuracy, correct, instances]`` entries, sorted
        by ``group_key``.
    """
    random.shuffle(table)
    test, remainder = test_remainder_stratified(table, strat_index)

    # Draw every attribute subset and bootstrap sample before any training,
    # as a separate pass, so the order of the random draws stays unchanged.
    attr_subsets = []
    boot_samples = []
    for _ in range(N):
        attr_subsets.append(utils.rand_attributes(attr_indexes, F))
        boot = utils.bootstrap(remainder)
        # Validation set: remainder rows the bootstrap did not pick.
        valid = [item for item in remainder if item not in boot]
        boot_samples.append((boot, valid))

    # Train one tree per sample and score it on its own validation set.
    trees = []
    for (boot, valid), attrs in zip(boot_samples, attr_subsets):
        pred, tree = train_test_tree(boot, valid, attrs, attr_domains,
                                     class_index)
        correct = sum(1 for j, row in enumerate(valid)
                      if row[class_index] == pred[j])
        trees.append([tree, utils.div(correct, len(valid))])

    trees.sort(key=lambda scored: scored[1])
    # Clamp the start index: a negative start would count from the end and
    # silently keep fewer trees than are available when M > len(trees).
    mtrees = trees[max(len(trees) - M, 0):]

    # Predict and determine accuracy per group.
    print(" grouping test set")
    minutes, groups = utils.groupBy(test, 1)
    print(" running classifier")
    accuracies = []
    overall_correct = 0
    total_instance = len(test)
    for idx, minute in enumerate(minutes):
        group = groups[idx]
        correct = 0
        for item in group:
            votes = [classify_tdidt(scored[0], item) for scored in mtrees]
            if item[class_index] == utils.majority_vote(votes):
                correct += 1
        overall_correct += correct
        accuracies.append([minute, correct / len(group), correct, len(group)])

    print("Sorting accuracies")
    accuracies.sort(key=lambda entry: entry[0])
    for minute, accuracy, correct, instances in accuracies:
        print('Minute: ', minute)
        print(' Accuracy: ', accuracy)
        print(' Correct: ', correct)
        print(' Instances: ', instances)
        print()

    # Guard against an empty test split instead of raising at the very end.
    overall_accuracy = (overall_correct / total_instance
                        if total_instance else 0.0)
    print("Overll Accurracy: ", overall_accuracy)
    print("Instances: ", total_instance)
    print("Correct: ", overall_correct)
    return accuracies