    def find_items_in_clusters(self, hr_tree):
        splits = dict()

        for solutionCluster in range(len(hr_tree)):
            pre_order_visit = hr_tree[solutionCluster].pre_order()

            # walk every node below this cluster in pre-order; whenever a
            # node is a single object (a leaf), add its user to the output,
            # so splits maps each cluster id to the users it contains
            splits[solutionCluster] = []
            for c in pre_order_visit:
                if hr_tree[c].is_leaf():
                    splits[solutionCluster].append(self.user_id_to_user[c])

            # collect the items each of this cluster's users has seen
            if len(splits[solutionCluster]) > 1:
                for user in splits[solutionCluster]:
                    self.items_cluster.setdefault(
                        solutionCluster, set()).update(
                            self.train_set['items_seen_by_user'][user])

        for cluster in self.items_cluster:
            for item in self.items_cluster[cluster]:
                self.cluster_item_interval.setdefault(cluster,
                                                      {}).update({item: []})
                for user in splits[cluster]:
                    if self.train_set['feedback'][user].get(item, []):
                        self.cluster_item_interval[cluster][item].append(
                            self.train_set['feedback'][user][item])

                self.cluster_item_interval[cluster][item] = mean_confidence_interval(
                    self.cluster_item_interval[cluster][item], confidence=.95)

    def recommendation_step(self):
        for user in self.test_set['users']:
            user_id = self.user_to_user_id[user]
            bu, hu = mean_confidence_interval(
                list(self.train_set['feedback'][user].values()),
                confidence=.95)

            for item in self.test_set['items_seen_by_user'][user]:
                cluster = self.father_of[user_id]
                '''
                mi^k -> mean of the item's ratings within subset k
                mu   -> mean of user u's ratings
                * use h, the gap between the mean and the interval bound
                  (only for climbing the tree?)

                rui = (wi * mi^k + wu * mu) / (wi + wu)
                '''
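                # Illustrative numbers only: with a cluster item mean
                # bi = 4.2 and a user mean bu = 3.8, equal weights wi = wu
                # give rui = (4.2 + 3.8) / 2 = 4.0, i.e. the
                # .5 * bu + .5 * bi blend applied below.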

                bi = 0
                last_h = float('inf')

                while cluster is not None:
                    # clusters holding a single user never enter the
                    # interval map, so guard both lookups
                    if self.cluster_item_interval.get(cluster, {}).get(item) is None:
                        cluster = self.father_of[cluster]
                    else:
                        new_h = self.cluster_item_interval[cluster][item][1]

                        # keep the estimate from the tightest (smallest h)
                        # interval seen while climbing toward the root
                        if np.isnan(new_h) or new_h == 0:
                            bi = self.cluster_item_interval[cluster][item][0]
                        elif new_h < last_h:
                            last_h = new_h
                            bi = self.cluster_item_interval[cluster][item][0]
                        cluster = self.father_of[cluster]

                if bi == 0:
                    rui = bu
                else:
                    rui = .5 * bu + .5 * bi

                self.predictions.append((user, item, rui))

        self.predictions = sorted(self.predictions, key=lambda x: x[1])

        if self.output_file is not None:
            WriteFile(self.output_file, data=self.predictions,
                      sep=self.sep).write()

    def initialize(self):
        # map users and items
        for i, item in enumerate(self.items):
            self.item_to_item_id.update({item: i})
            self.item_id_to_item.update({i: item})

        # map users and compute each user's rating confidence interval
        for u, user in enumerate(self.users):
            self.user_to_user_id.update({user: u})
            self.user_id_to_user.update({u: user})
            self.user_confidence[user] = mean_confidence_interval(
                list(self.train_set['feedback'][user].values()))
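
# Every snippet here relies on a mean_confidence_interval helper whose
# definition is not shown. A minimal sketch, assuming it returns the sample
# mean and the half-width h of a t-based interval (the pairs unpacked as
# (bu, hu) and indexed with [0]/[1] above):
import numpy as np
import scipy.stats


def mean_confidence_interval(data, confidence=0.95):
    """Return (mean, h) so that the interval is mean +/- h."""
    a = np.asarray(data, dtype=float)
    n = len(a)
    mean = np.mean(a)
    if n < 2:
        # the half-width is undefined for a single sample
        return mean, float('nan')
    h = scipy.stats.sem(a) * scipy.stats.t.ppf((1 + confidence) / 2.0, n - 1)
    return mean, h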
import sys


def print_means_and_cis(categories, widest_key):
    """Prints the mean and confidence interval for each category."""
    for key, values in categories.items():
        pad_width = widest_key - len(key)
        padding = " " * pad_width
        mean, interval = None, None
        if len(values) > 1:
            mean, interval = mean_confidence_interval(values)
            print("%s: %s%.2f +-%.2f" % (key, padding, mean, interval))
        else:
            mean = arith_mean(values)
            print("%s: %s%.2f" % (key, padding, mean))
            sys.stderr.write("Warning: too few samples to calculate confidence"
                             " interval for \"%s\"\n" % key)
def average_calculator(model, k, kwargs, gen_data=True):
    """Train and evaluate the chosen model type ("lstm" or "cnn") k times and
    report averaged precision/recall/F1 plus a confidence interval over the
    per-run F1 scores."""
    if gen_data:
        generate_data(kwargs)
    p_1 = 0.0
    r_1 = 0.0
    f_1 = 0.0
    f_scores = []
    train_data = ATEDataProcessor(kwargs["train_file"], **kwargs)
    test_data = ATEDataProcessor(kwargs["test_file"],
                                 pos_id=get_count(
                                     train_data.annotated_sentences),
                                 **kwargs)
    for i in range(k):
        print("Run number: {}".format(i))
        test_set = test_data.annotated_sentences
        train_set, dev_set = split(train_data.annotated_sentences,
                                   test_size=kwargs["test_size"])
        train = DataIterator(train_set, **kwargs)
        dev = DataIterator(dev_set, **kwargs)
        test = DataIterator(test_set, **kwargs)
        if model == "lstm":
            model = LSTMNetwork(**kwargs)
        elif model == "cnn":
            model = CNNNetwork(max_sentence_length=train_data.max_sentence_len,
                               **kwargs)
        model.build()
        model.train(train, dev)
        model.restore_session(model.model_directory)
        results = model.evaluate(test)
        f_scores.append(results["f_1"])
        p_1 += float(results["p_1"])
        r_1 += float(results["r_1"])
        f_1 += float(results["f_1"])
        model.close_session()
    print("p_1: {}\nr_1: {}\nf_1: {}".format(p_1 / k, r_1 / k, f_1 / k))
    print(mean_confidence_interval(f_scores))
    return {"precision": p_1 / k, "recall": r_1 / k, "fscore": f_1 / k}