Esempio n. 1
0
 def __init__(self, entity_pairs, eps=0.5, min_samples=5, metric='euclidean', algorithm='auto'):
     """Compute relation vectors for ``entity_pairs`` and cluster them with DBSCAN.

     Args:
         entity_pairs: stored on ``self.entity_pairs`` before
             ``get_relation_vecs()`` is called. That method is not visible in
             this excerpt; presumably it reads this attribute -- TODO confirm.
         eps, min_samples, metric, algorithm: passed unchanged to the
             ``DBSCAN`` constructor.
     """
     self.vm = VectorModel()
     self.cluster = DBSCAN(eps=eps, min_samples=min_samples, metric=metric, algorithm=algorithm)
     self.entity_pairs = entity_pairs
     self.X, self.relation_index = self.get_relation_vecs()
     # Call form: prints the same text under the Python 2 print statement and
     # is also valid syntax on Python 3.
     print(self.X.shape)
     self.cluster.fit(self.X)
     self.labels = self.cluster.labels_
Esempio n. 2
0
class RelationCluster():
    
    '''
    def __init__(self, entity_set_list, eps=0.7, min_samples=3, metric='euclidean'):
        self.vm = VectorModel()
        self.cluster = DBSCAN(eps=eps, min_samples=min_samples, metric=metric)
        self.entity_set_list = entity_set_list
        self.X, self.relation_index = self.get_relation_vecs()
        self.cluster.fit(self.X)
        self.labels = self.cluster.labels_
    '''

    def __init__(self, entity_pairs, eps=0.5, min_samples=5, metric='euclidean', algorithm='auto'):
        """Compute relation vectors for ``entity_pairs`` and cluster them with DBSCAN.

        Args:
            entity_pairs: iterable of pairs. Each pair becomes one row of
                ``self.X`` and is also used as a key of ``self.relation_index``
                (see ``get_relation_vecs``).
            eps, min_samples, metric, algorithm: passed unchanged to the
                ``DBSCAN`` constructor.
        """
        self.vm = VectorModel()
        self.cluster = DBSCAN(eps=eps, min_samples=min_samples, metric=metric, algorithm=algorithm)
        # get_relation_vecs() reads self.entity_pairs and (through
        # get_relation_vec) self.vm, so both must be assigned before it runs.
        self.entity_pairs = entity_pairs
        self.X, self.relation_index = self.get_relation_vecs()
        # Call form: prints the same text under the Python 2 print statement
        # and is also valid syntax on Python 3.
        print(self.X.shape)
        self.cluster.fit(self.X)
        self.labels = self.cluster.labels_

    def get_relation_vec(self, pair):
        # Gets the difference between the word vector for w1 and the word vector for w2
        w1, w2 = pair
        return self.vm.vector(w1) - self.vm.vector(w2)
    '''
    def get_relation_vecs(self):
        # Gets the relation vectors between every pair in the set
        relation_index = {}
        relation_list = []
        index = 0
        for entity_set in self.entity_set_list:
            for entity1 in entity_set:
                for entity2 in entity_set:
                    if entity1 == entity2:
                        continue
                    if (entity1, entity2) in relation_index:
                        # the oredered pair has already been seen
                        continue
                    relation_index[(entity1, entity2)] = index
                    relation_index[index] = (entity1, entity2)
                    relation_list.append(self.get_relation_vec(entity1, entity2))
                    index += 1
        X = np.array(relation_list)
        return X, relation_index
    '''

    def get_relation_vecs(self):
        # Gets the relation vectors between every pair in the set
        relation_index = {}
        relation_list = []
        index = 0
        for pair in self.entity_pairs:
            relation_index[pair] = index
            relation_index[index] = pair
            relation_list.append(self.get_relation_vec(pair))
            index += 1
        X = np.array(relation_list)
        return X, relation_index

    def get_num_clusters(self):
        # Returns the number of clusters found
        # The set of relation vectors that do not belong in any cluster 
        # does not count as a cluster
        return len(set(self.labels)) - (1 if -1 in self.labels else 0)
    
    def get_unique_labels(self):
        # Returns the set of unique labels, or the index of each cluster
        return set(self.labels)

    def get_label(self, pair):
        # Returns the label of the cluster that a pair's relation vector belongs in
        return self.labels[self.relation_index[pair]]

    def is_core(self, pair):
        # Returns whether a pair's relation vector is a core component
        return self.relation_index[pair] in self.cluster.core_sample_indices_

    def is_clustered(self, pair):
        # Returns whether a pair's relation vector is part of a cluster
        return self.labels[self.relation_index[pair]] != -1

    def get_cluster_sizes(self):
        # Returns a dictionary mapping cluster labels to the cluster sizes
        counts = {}
        label_list = list(self.labels)
        for label in self.get_unique_labels():
            counts[label] = label_list.count(label)
        return counts

    def get_vector_clusters(self):
        # Returns a tuple of two dictionaries
        # The first dictionary maps labels to lists of relation vectors
        # The second dictionary maps labels to the indices of the relation vectors that are
        # in the cluster of that label
        clusters = collections.defaultdict(list)
        index_clusters = collections.defaultdict(list)
        core_indices = self.cluster.core_sample_indices_
        for i in xrange(len(self.labels)):
            if self.labels[i] == -1:
                continue
            clusters[self.labels[i]].append(self.X[i])
            index_clusters[self.labels[i]].append(i)
        '''
        for label in self.get_unique_labels():
            for index in core_indices:
                if self.labels[index] == label:
                    index_clusters[label].append(index)
        '''
        print len([i for i in self.labels if i != -1])
        print index_clusters
        return clusters, index_clusters

    def get_vector_cluster_means(self):
        clusters, index_clusters = self.get_vector_clusters()
        means = {}
        for label in self.get_unique_labels():
            cluster = np.array(clusters[label])
            mu = cluster.mean(axis=0)
            means[label] = mu
        return means

    def get_vector_cluster_variances(self):
        # finds the variance from the mean 
        pass

    def print_clusters(self):
        if len(self.cluster.core_sample_indices_) == 0:
            print "NO CLUSTERS :("
            return
        clusters, index_clusters = self.get_vector_clusters()
        cluster_sizes = self.get_cluster_sizes()
        #cluster_vector_means = self.get_vector_cluster_means()
        for label in self.get_unique_labels():
            if label == -1:
                continue
            print "Cluster: %d" % label
            print "cluster size: %d" % cluster_sizes[label]
            print "indices in cluster: ", index_clusters[label]
            #mean = cluster_vector_means[label]
            for index in index_clusters[label]:
                print '\t' + str(self.relation_index[index])
                '''