Example no. 1
import time

import torch


def test(loader, models, criterion, class_mask, logger, epoch):
    # AverageMeter and majority_vote are project helpers not shown here
    losses, batch_time, accuracy, task_accuracy = (
        AverageMeter(), AverageMeter(), AverageMeter(), AverageMeter())

    with torch.no_grad():
        start = time.time()
        for inputs, labels in loader:
            # Move the batch to the GPU; inputs are cast to half precision
            inputs, labels = inputs.half().cuda(
                non_blocking=True), labels.cuda(non_blocking=True)

            for i, model in enumerate(models):
                model.eval()
                outputs = model(inputs)

                loss = criterion(outputs, labels)
                losses.update(loss.item(), inputs.size(0))

                # Measure accuracy and task accuracy

                prob = torch.nn.functional.softmax(outputs, dim=1)
                pred = torch.argmax(prob, dim=1).unsqueeze(0)
                mask_pred = torch.argmax(class_mask[labels] * prob,
                                         dim=1).unsqueeze(0)

                if i == 0:
                    preds = pred
                    mask_preds = mask_pred
                else:
                    preds = torch.cat((preds, pred), dim=0)
                    mask_preds = torch.cat((mask_preds, mask_pred), dim=0)

            major_pred = majority_vote(preds)
            major_mask_pred = majority_vote(mask_preds)

            accu = torch.eq(major_pred.cuda(), labels).float().mean()
            task_accu = torch.eq(major_mask_pred.cuda(), labels).float().mean()

            accuracy.update(accu, labels.size(0))
            task_accuracy.update(task_accu, labels.size(0))
            batch_time.update(time.time() - start)
            start = time.time()

    logger.info(
        '==> Test: [{0}]\tTime:{batch_time.sum:.4f}\tLoss:{losses.avg:.4f}\tAcc:{acc.avg:.4f}\tTask Acc:{task_acc.avg:.4f}\t'
        .format(epoch,
                batch_time=batch_time,
                losses=losses,
                acc=accuracy,
                task_acc=task_accuracy))
    return accuracy.avg, task_accuracy.avg
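The `majority_vote` helper above is not part of the snippet. A minimal sketch consistent with how it is called (a (num_models, batch_size) tensor of class predictions in, one prediction per sample out) could be:

import torch


def majority_vote(preds):
    # preds: (num_models, batch_size) tensor of class indices.
    # torch.mode picks the most frequent value along dim 0;
    # ties resolve to the smallest class index.
    return torch.mode(preds, dim=0).values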
Example no. 2
    def __init__(self, table, header):
        self.branches = {}
        self.table = table
        self.node_type = None
        self.split_index = None
        self.leaf_class = None
        self.header = header

        classes = [x[-1] for x in table]
        c_types = u.unique(classes)

        # if only one class remains, create a leaf node
        if len(c_types) == 1:
            self.node_type = LEAF
            self.leaf_class = c_types[0]

        else:
            # split on the attribute with the greatest information gain
            self.split_index = max_gain(table, header)
            if self.split_index != -1:
                split_vals = u.unique(table, col=self.split_index)
                self.node_type = SPLIT
                branch_tabs = [[y for y in table if y[self.split_index] == x]
                               for x in split_vals]
                for i, bran in enumerate(branch_tabs):
                    self.branches[split_vals[i]] = TreeNode(bran, header)
            else:
                self.node_type = LEAF
                self.leaf_class = u.majority_vote(table)
Example no. 3
import logging
from collections import Counter

import numpy as np

logger = logging.getLogger(__name__)  # module-level logger used below


def evaluate_tpr_indices(models, corpus, topk):
    """
  Evaluate document embeddings with TPR. For each document in `test_set` infer vector and find `k` nearest document (cosine similarity).
  Take majority vote on the inferred labels.
  
  :params:
    
    models (dict) : dict (name,model) of gensim.Doc2Vec models to be tested
    test_set (list) : list of docuemnts in namedtuple format. Thet MUST have at least `words` and `tags` attributes
    topk (int) : how many documents to predict
  
  """

    logger.info("Starting evaluation process - TPR@{}\n".format(topk))

    tl_list = [doc.tags[0].split('\t')[0] for doc in corpus]

    unique_names = set(tl_list)

    label2index = {name: i for i, name in enumerate(unique_names, start=1)}

    index2label = {v: k for k, v in label2index.items()}

    true_labels = np.asarray([label2index[l] for l in tl_list])

    labels_count = Counter(true_labels)

    for model in models:

        logger.info("\nEvaluating `{}`".format(str(model)))

        logger.info("Inferring vectors for {} documents".format(len(corpus)))

        inferred_vectors = [model.infer_vector(doc.words) for doc in corpus]

        most_similars = [
            model.docvecs.most_similar([inf_vec], topn=topk)
            for inf_vec in inferred_vectors
        ]

        all_votes = [[news.split('\t')[0] for (news, score) in ms]
                     for ms in most_similars]

        # majority_vote is assumed to return a (label, count) pair
        predicted = np.asarray(
            [label2index[majority_vote(votes)[0]] for votes in all_votes])

        scores = {}

        for label, count in labels_count.items():

            # per-class hit rate: predictions that equal `label` and are correct
            by_class_pred = np.where(predicted != label, 0, label)

            hits = np.sum(by_class_pred == true_labels)

            scores[index2label[label]] = hits / count

        logger.info("TPR@10 : {}".format(scores))

        logger.info("Average TPR@10 : {}".format(
            np.sum(predicted == true_labels) / len(true_labels)))
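A hypothetical call site for the function above, using the pre-4.0 gensim API that `model.docvecs.most_similar` implies, with tags encoded as "label\tdoc_id" the way `doc.tags[0].split('\t')[0]` expects:

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# toy corpus; real tags would be "label\tdoc_id" strings
corpus = [
    TaggedDocument(words=["stocks", "fell", "sharply"], tags=["economy\t0"]),
    TaggedDocument(words=["team", "wins", "the", "final"], tags=["sport\t1"]),
]

model = Doc2Vec(vector_size=50, min_count=1, epochs=20)
model.build_vocab(corpus)
model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)

evaluate_tpr_indices([model], corpus, topk=10)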
Example no. 4
    def classify(self, instance):
        '''
        Classifies an instance with each tree, then uses simple majority
        voting to determine the class to return.
        '''
        predictions = []
        for tree in self.forest_list:
            # one-element rows, matching the table format majority_vote expects
            predictions.append([tree["tree"].classify(instance)])

        return utils.majority_vote(predictions)
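utils.majority_vote is not shown in these snippets. Given that it receives one-element rows here and full table rows in the next example, a plausible sketch votes on the last column of each row:

from collections import Counter


def majority_vote(table):
    # a guess at the utils helper: tally the last column of every row and
    # return the most common value (ties break by first appearance)
    counts = Counter(row[-1] for row in table)
    return counts.most_common(1)[0][0]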
Example no. 5
    def classify(self, instance):
        '''
        Given an instance, return the leaf class that the tree
        classifies it as.
        '''
        if self.node_type == LEAF:
            return self.leaf_class
        else:
            new_att = instance[self.split_index]
            if new_att in self.branches:
                return self.branches[new_att].classify(instance)
            else:
                # unseen attribute value: fall back to the majority class
                return utils.majority_vote(self.table)
Example no. 6
    def __init__(self, table, header, first=True, full_table=None):
        self.branches = {}
        self.table = table
        self.node_type = None
        self.split_index = None
        self.leaf_class = None
        self.header = header

        # on the first call, record every unique value of each attribute
        if first:
            full_table = []
            for att_index, _ in enumerate(header[:-1]):
                full_table.append(
                    utils.unique([row[att_index] for row in table]))

        # get list of all class values
        classes = [x[-1] for x in table]
        c_types = utils.unique(classes)

        # if only one class, add leaf node
        if len(c_types) == 1:
            self.node_type = LEAF
            self.leaf_class = c_types[0]

        # otherwise, use entropy to determine attribute index to split
        else:
            self.split_index = max_gain(table, header)
            # max_gain returns -1 if there is only one attr value in the current table

            if self.split_index != -1:
                # split on index with greatest information gain, then iterate over each attr value
                split_vals = utils.unique(table, col=self.split_index)
                self.node_type = SPLIT
                branch_tabs = [[y for y in table if y[self.split_index] == x]
                               for x in split_vals]
                for i, branch in enumerate(branch_tabs):
                    self.branches[split_vals[i]] = TreeNode(
                        branch, header, first=False, full_table=full_table)
            else:
                # if only one attribute value left, create leaf node
                self.node_type = LEAF
                self.leaf_class = utils.majority_vote(table)
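Examples 2 and 6 both rely on `max_gain`, which is not shown. A minimal information-gain sketch matching its contract (column index of the best split, or -1 when nothing splits the table) might be:

import math
from collections import Counter


def entropy(rows):
    # Shannon entropy of the class (last) column
    counts = Counter(row[-1] for row in rows)
    total = len(rows)
    return -sum((c / total) * math.log2(c / total) for c in counts.values())


def max_gain(table, header):
    # return the attribute column with the largest information gain,
    # or -1 if no attribute splits the table into more than one group
    base = entropy(table)
    best_col, best_gain = -1, 0.0
    for col in range(len(header) - 1):  # last column is the class
        values = set(row[col] for row in table)
        if len(values) <= 1:
            continue
        remainder = 0.0
        for v in values:
            subset = [row for row in table if row[col] == v]
            remainder += (len(subset) / len(table)) * entropy(subset)
        gain = base - remainder
        if gain > best_gain:
            best_col, best_gain = col, gain
    return best_col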
Example no. 7
import random


def random_forest(N, M, F, table, attr_indexes, attr_domains, class_index,
                  strat_index):
    random.shuffle(table)
    test, remainder = test_remainder_stratified(table, strat_index)
    boot_samples = []
    attr_subsets = []
    trees = []
    # set up bootstrap samples; rows left out of a bootstrap form its
    # validation set
    for _ in range(N):
        attr_subsets.append(utils.rand_attributes(attr_indexes, F))
        boot = utils.bootstrap(remainder)
        valid = []
        # build the validation set from rows left out of the bootstrap
        for item in remainder:
            if item not in boot:
                valid.append(item)
        boot_samples.append([boot, valid])

    # build trees and score each on its validation set
    for i in range(N):
        #returns predictions, tree
        pred, tree = train_test_tree(boot_samples[i][0], boot_samples[i][1],
                                     attr_subsets[i], attr_domains,
                                     class_index)
        correct = 0
        for j in range(len(boot_samples[i][1])):
            if boot_samples[i][1][j][class_index] == pred[j]:
                correct += 1
        trees.append([tree, utils.div(correct, len(boot_samples[i][1]))])

    # keep the M trees with the highest validation accuracy
    trees.sort(key=lambda x: x[1])
    mtrees = trees[len(trees) - M:]

    #predict and determine accuracy
    print("     grouping test set")
    minutes, groups = utils.groupBy(test, 1)
    print("     running classifier")
    accuracies = []
    overall_correct = 0
    total_instance = len(test)

    for count in range(len(minutes)):
        correct = 0
        for item in groups[count]:
            votes = []
            for tree in mtrees:
                votes.append(classify_tdidt(tree[0], item))
            vote = utils.majority_vote(votes)
            if item[class_index] == vote:
                correct += 1
                overall_correct += 1
        accuracies.append([
            minutes[count], correct / len(groups[count]), correct,
            len(groups[count])
        ])

    print("Sorting accuracies")
    accuracies.sort(key=lambda x: x[0])
    for item in accuracies:
        print('Minute: ', item[0])
        print('     Accuracy: ', item[1])
        print('     Correct: ', item[2])
        print('     Instances: ', item[3])
        print()
    print("Overll Accurracy: ", overall_correct / total_instance)
    print("Instances: ", total_instance)
    print("Correct: ", overall_correct)

    return accuracies
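`utils.bootstrap` and `utils.rand_attributes` are assumed helpers; standard implementations consistent with how they are used above would be:

import random


def bootstrap(table):
    # sample len(table) rows with replacement; on average ~63% of distinct
    # rows are drawn, the rest become the validation ("out of bag") set
    return [random.choice(table) for _ in range(len(table))]


def rand_attributes(attr_indexes, F):
    # pick F attribute indexes at random, without replacement
    return random.sample(attr_indexes, F)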