Example #1
import numpy as np

def get_km(xs, n):
    # Fit k-medoids, keep the medoid indices as the training set, and hold
    # out up to 100 random non-medoid points for testing.
    km = KMedoids(n_clusters=n, max_iter=1000, tol=1e-5)
    km.fit(xs)
    kmidx = list(km.medoids)
    testidx = [i for i in range(len(xs)) if i not in kmidx]
    np.random.shuffle(testidx)
    return kmidx, testidx[:100]
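
A hypothetical call site for the helper above (the array and the cluster count
are made-up values; KMedoids is whatever implementation the snippet imports):

xs = np.random.rand(500, 8)             # 500 synthetic points, 8 features
train_idx, test_idx = get_km(xs, n=20)  # 20 medoid indices, up to 100 test indices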
Example #2
 def __init__(self, corpus, document_titles, num_clusters=None,
              num_features=None):
     """
     Computes the interpreter matrix by calculating the TF-IDF value of each
     token in each concept (doc) in corpus.
     
     document_titles give the names of each concept (doc) in corpus.
     num_features gives the number of features of corpus
     
     If num_clusters == None all documents are used as concepts.
     """
     
     if not num_clusters:
         self.num_clusters = len(document_titles)
     else:
         self.num_clusters = num_clusters
     
     if num_features is None:
         logger.info("scanning corpus to determine the number of features")
         num_features = 1 + utils.get_max_id(corpus)
         
     self.num_features = num_features
     
     # reduce the column count by k-medoid clustering, using the medoid of each cluster
     # TODO: skip clustering when num_clusters == None
     clusterer = KMedoids(corpus=corpus,
                          num_features=self.num_features,
                          num_clusters=self.num_clusters,
                          max_iterations=10)
     clusters = clusterer.cluster()
     
     # set the corpus to the medoids;
     # the corpus is our interpreter matrix (dense, not sparse);
     # each column is a document and is treated as a concept
     self.corpus = clusterer.get_medoids().T
     
     
     # reduce the document titles to those of the cluster medoids
     self.document_titles = DocumentTitles()
     for cluster_id in clusters:
         self.document_titles.append(document_titles[cluster_id] or "no title")
         
     # log each cluster with its members
     for cluster_id, members in clusters.items():
         cluster_title = document_titles[cluster_id]
         member_titles = ", ".join(document_titles[member_id]
                                   for member_id in members)
         logger.debug("%s: %s", cluster_title, member_titles)
Example #3
    def fit(self, X, y, **kwargs):
        if self.verbose:
            print("fitting", self)
        clss = sorted(set(y))
        meds = []
        for k in clss:
            # select self.mk medoids per class, seeded with k-means++
            idxs_c = np.where(y == k)[0]
            kmpp_idxs = kmeanspp(X[idxs_c], self.mk, seed=self.seed)
            kmed = KMedoids(self.mk, init=kmpp_idxs)
            kmed.fit(X[idxs_c], dist=False)
            meds.append(idxs_c[kmed.medoids].tolist())

        self.idxs = np.concatenate(meds, axis=0)
        self.vecs = X[self.idxs]
        cc = Counter(y[self.idxs])
        assert set(cc.values()) == {self.mk}  # exactly self.mk medoids per class
        if self.verbose:
            print("fitted KMedoids seed={},".format(self.seed), cc)
        self.clf.fit(self.vecs, y[self.idxs])
        return self
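
The fit above relies on a kmeanspp helper that returns k seed indices into its
input. A minimal sketch of such D^2-weighted seeding, assuming that contract
(this is not the original helper):

import numpy as np

def kmeanspp(X, k, seed=None):
    # k-means++ style seeding: the first index is uniform at random; the rest
    # are drawn with probability proportional to the squared distance to the
    # nearest already-chosen point.
    rng = np.random.default_rng(seed)
    idxs = [int(rng.integers(len(X)))]
    for _ in range(k - 1):
        d2 = ((X[:, None, :] - X[idxs]) ** 2).sum(-1).min(axis=1)
        idxs.append(int(rng.choice(len(X), p=d2 / d2.sum())))
    return np.array(idxs)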
Example #4
    def test_write_on_file(self):
        K = 10

        # Output population
        pp = get_population()
        fout = open('pp.csv', 'w')
        fout.write('x,y,label\n')
        for x, y in pp:
            fout.write('%f,%f,k\n' % (x, y))
        fout.close()

        # Output MDS samples
        lin = Linear(self.dist_func, K=K)
        lin.update(pp)
        sampled = lin.get_sampled()
        fout = open('mcs.csv', 'w')
        fout.write('x,y,label\n')
        for x, y in sampled:
            fout.write('%f,%f,b\n' % (x, y))
        fout.close()

        # Output random samples
        random.seed(1)
        sampled = random.sample(pp, K)
        fout = open('random.csv', 'w')
        fout.write('x,y,label\n')
        for x, y in sampled:
            fout.write('%f,%f,g\n' % (x, y))
        fout.close()

        # Output k-medoids samples
        kmedoids = KMedoids(self.dist_func, K)
        sampled = kmedoids.sample(pp)
        fout = open('kmedoids.csv', 'w')
        fout.write('x,y,label\n')
        for x, y in sampled:
            fout.write('%f,%f,r\n' % (x, y))
        fout.close()
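
One possible way to inspect the four CSVs written by this test (assuming pandas
and matplotlib are available; the label column already holds single-letter
matplotlib colour codes):

import pandas as pd
import matplotlib.pyplot as plt

for name in ('pp', 'mcs', 'random', 'kmedoids'):
    df = pd.read_csv(name + '.csv')
    plt.scatter(df['x'], df['y'], c=df['label'], label=name, s=10)
plt.legend()
plt.show()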
Example #5
    def run_kmedoids(self, params):
        """
        Performs clustering using the k-medoids algorithm.

        :type params: dict
        :param params: dictionary with keys 'k', 't_max', 'init' and 'criterion'
        """
        self.n_clusters_ = params['k']
        kmedoids = KMedoids(n_clusters=params['k'],
                            max_iter=params['t_max'],
                            init=params['init'],
                            criterion=params['criterion']).fit(self.dist_mat)
        self.centers_l = kmedoids.cluster_centers_
        self.labels_l = kmedoids.labels_
        self.form_kmedoids_results()
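
A hypothetical invocation, assuming the object already holds a precomputed
distance matrix in self.dist_mat (the parameter values are placeholders, not
defaults from the source):

params = {'k': 8,             # number of clusters
          't_max': 300,       # max k-medoids iterations
          'init': 'random',   # placeholder initialisation scheme
          'criterion': None}  # placeholder convergence criterion
obj.run_kmedoids(params)      # fills obj.centers_l and obj.labels_l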
Example #6
        }
    else:
        raise Exception(f"Not recognized dataset: {args.dataset}")

    if args.dis == "euclidean":
        make_pretrainer = lambda: KMeans(n_clusters=n_clusters)
        dis = DMAE.Dissimilarities.euclidean
        dis_loss = DMAE.Losses.euclidean_loss
        init_dmae = lambda pretrainer: {
            "centers": DMAE.Initializers.InitKMeans(pretrainer),
            "mixers": tf.keras.initializers.Constant(1.0)
        }
        cov = False

    elif args.dis == "cosine":
        make_pretrainer = lambda: KMedoids(n_clusters=n_clusters,
                                           metric="cosine")
        dis = DMAE.Dissimilarities.cosine
        dis_loss = DMAE.Losses.cosine_loss
        init_dmae = lambda pretrainer: {
            "centers": DMAE.Initializers.InitKMeans(pretrainer),
            "mixers": tf.keras.initializers.Constant(1.0)
        }
        cov = False

    elif args.dis == "manhattan":
        make_pretrainer = lambda: KMedoids(n_clusters=n_clusters,
                                           metric="manhattan")
        dis = DMAE.Dissimilarities.manhattan
        dis_loss = DMAE.Losses.manhattan_loss
        init_dmae = lambda pretrainer: {
            "centers": DMAE.Initializers.InitKMeans(pretrainer),
Example #7
def experiments(PORCENTAJE_VECINOS, ALGORITHM, MODELO, normalizar=None):
    vecinos = algorithms[ALGORITHM]

    algoritmos = "coseno"
    if PORCENTAJE_VECINOS in ["boost", "maxsim", "dist"]:
        algoritmos = ALGORITHM + "-" + PORCENTAJE_VECINOS
    elif PORCENTAJE_VECINOS != 0:
        algoritmos = "%s-%.1f" % (ALGORITHM, PORCENTAJE_VECINOS)

    titulo = MODELO + "-" + algoritmos
    if normalizar is not None:
        titulo += "-" + normalizar

    fname = sys.argv[2] + "/" + titulo + ".out"

    if os.path.isfile(fname):
        return

    print(titulo)
    print("-" * 20)

    if PORCENTAJE_VECINOS == 0:
        X = coseno
        if MODELO == "dbscan":
            # Only works for cosine!
            X = 1 - X
    else:
        neighbour_file_name = sys.argv[2] + "/" + ALGORITHM + ".npy"
        if os.path.isfile(neighbour_file_name):
            NEIGHBOURS = np.load(neighbour_file_name)
        else:
            print("Calculando vecinos")
            NEIGHBOURS = np.zeros((len(service_number), len(service_number)))
            for i in range(0, len(service_number)):
                for j in range(i, len(service_number)):
                    NEIGHBOURS[i][j] = vecinos(followers, users, i, j)
                    if i != j:
                        NEIGHBOURS[j][i] = NEIGHBOURS[i][j]
            np.save(neighbour_file_name, NEIGHBOURS)

        if normalizar is not None:
            print("Normalizando Vecinos")
            if normalizar == 'minmax':
                NEIGHBOURS = preprocessing.minmax_scale(NEIGHBOURS)
            elif normalizar == 'scale':
                NEIGHBOURS = preprocessing.scale(NEIGHBOURS)
            elif normalizar == 'robust':
                NEIGHBOURS = preprocessing.robust_scale(NEIGHBOURS)
            elif normalizar == 'softmax':
                NEIGHBOURS = np.exp(NEIGHBOURS) / np.sum(np.exp(NEIGHBOURS), axis=1, keepdims=True)
            elif normalizar == 'matrixminmax':
                NEIGHBOURS = (NEIGHBOURS - np.min(NEIGHBOURS)) / (np.max(NEIGHBOURS) - np.min(NEIGHBOURS))
            elif normalizar == 'matrixmax':
                NEIGHBOURS = NEIGHBOURS / np.max(NEIGHBOURS)
        if MODELO == "dbscan":  # Si es distancia
            if normalizar is not None:
                NEIGHBOURS = 1 - NEIGHBOURS
            else:
                NEIGHBOURS = - NEIGHBOURS
            X = (1 - PORCENTAJE_VECINOS) * (1 - coseno) + PORCENTAJE_VECINOS * NEIGHBOURS
        else:  # if it is an affinity
            if PORCENTAJE_VECINOS == "boost":
                X = np.multiply(coseno, NEIGHBOURS)
            elif PORCENTAJE_VECINOS == "maxsim":
                X = np.maximum(coseno, NEIGHBOURS)
            elif PORCENTAJE_VECINOS == "dist":
                NEIGHBOURS_SORTED = np.argsort(np.argsort(NEIGHBOURS))
                COSINE_SORTED = np.argsort(np.argsort(coseno))
                POS_BOOST = np.log(1 / (1 + np.abs(NEIGHBOURS_SORTED - COSINE_SORTED)))
                X = POS_BOOST
            else:
                X = (1 - PORCENTAJE_VECINOS) * coseno + PORCENTAJE_VECINOS * NEIGHBOURS

    print("Generando Modelo")

    if MODELO == 'kmedoids':
        model = KMedoids(n_clusters=1500).fit(X)
    elif MODELO == 'kmedoids470':
        model = KMedoids(n_clusters=470).fit(X)
    elif MODELO == 'ap':
        model = AffinityPropagation(affinity='precomputed').fit(X)
    elif MODELO == 'dbscan':
        model = DBSCAN(metric='precomputed').fit(X)

    labels = model.labels_

    clusters = defaultdict(list)
    for index, classif in enumerate(labels):
        clusters[classif].append(index)

    n_clusters_ = len(clusters)

    info = ""
    info += 'Clusters: %d\n' % n_clusters_
    # info += 'Cohesiveness: %0.3f\n' % cohesiveness(X, labels)
    info += 'Entropy: %0.3f\n' % entropy(labels_true, labels)
    info += "Homogeneity: %0.3f\n" % metrics.homogeneity_score(labels_true, labels)
    info += "Completeness: %0.3f\n" % metrics.completeness_score(labels_true, labels)
    info += "V-measure: %0.3f\n" % metrics.v_measure_score(labels_true, labels)
    info += 'Purity: %0.3f\n' % purity(labels_true, labels)
    info += "F-Measure: %0.3f\n" % fmeasure(labels_true, labels)
    info += "Adjusted Rand Index: %0.3f\n" % metrics.adjusted_rand_score(labels_true, labels)
    info += "Adjusted Mutual Information: %0.3f\n" % metrics.adjusted_mutual_info_score(labels_true, labels)

    clustersize = Counter(labels)

    salida = open(fname, 'w', encoding='UTF-8')

    print(info)

    salida.write(titulo + "\n")
    for cluster, services in clusters.items():
        countcat = Counter([labels_true[svc] for svc in services])
        max_key, num = countcat.most_common(1)[0]
        salida.write("%i (%s - %i/%i): %s \n" % (
            cluster, max_key, num, clustersize[cluster], ",".join([service_list[svc] for svc in services])))
    salida.write("-" * 20 + "\n")
    salida.write(info)
    salida.close()