Example #1
def spectral_clustering(A, nb_clusters, laplacian_normalization=None, algo=None):
    """
    Compute the clusters assignement from spectral clustering algorithm
    steps :
    * Compute laplacian
    * Compute k smaller eigenvalues and associated eigenvectors
    * Train a kmean on this vectors
    * Apply this kmean to the Laplacian
    """
    if algo not in ['sph', None]:
        raise ValueError('Algorithm {} unknown'.format(algo))

    L = get_laplacian(A, laplacian_normalization)
    L = scipy.sparse.csr_matrix(L, dtype=np.float64)
    v, w = eigsh(L, nb_clusters, which='SM')

    if algo is None:
        km = KMeans(n_clusters=nb_clusters)
        km.fit(np.transpose(w))
        clusters = km.predict(L)

    elif algo == 'sph':
        clusterer = KMeansClusterer(nb_clusters, distance=nltk.cluster.util.cosine_distance, repeats=25)
        clusterer.cluster(np.transpose(w), True)  # train the centroids; assignments are recomputed below
        vectors = [np.transpose(L[i, :].toarray()[0]) for i in range(0, L.shape[1])]
        clusters = [clusterer.classify(vector) for vector in vectors]
    return clusters
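The helper get_laplacian is not shown in this snippet. As a minimal sketch of the eigendecomposition step, assuming the unnormalized Laplacian L = D - A when laplacian_normalization is None:

import numpy as np
import scipy.sparse
from scipy.sparse.linalg import eigsh

# Toy adjacency matrix with two obvious communities: {0, 1, 2} and {3, 4, 5}.
A = np.array([[0, 1, 1, 0, 0, 0],
              [1, 0, 1, 0, 0, 0],
              [1, 1, 0, 1, 0, 0],
              [0, 0, 1, 0, 1, 1],
              [0, 0, 0, 1, 0, 1],
              [0, 0, 0, 1, 1, 0]], dtype=np.float64)
D = np.diag(A.sum(axis=1))            # degree matrix
L = scipy.sparse.csr_matrix(D - A)    # unnormalized Laplacian (an assumption)
vals, vecs = eigsh(L, 2, which='SM')  # the 2 smallest eigenpairs, as above
print(vals)                           # smallest eigenvalue is ~0 for a connected graph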
Example #2
def nltk_clustering(n, filename):
    global vectors
    global names
    global repeats
    # Clustering
    print("Begin clustering, n = {:d}...".format(n))

    clusterer = KMeansClusterer(n, cosine_distance, repeats=repeats)
    clustered = clusterer.cluster(vectors, assign_clusters=True, trace=False)
    clustered = np.array(clustered)

    # Sort the cluster ids and reorder names so members of a cluster are adjacent.
    index = sorted(clustered)
    names = list(names[clustered.argsort()])

    # write result to file
    print("Saving result to file...")
    output = filename[:-4] + "_" + str(n) + "_clustered.txt"
    with open(output, "w") as f:
        current_idx = None
        for itr, idx in zip(names, index):
            if current_idx != idx:
                current_idx = idx
                f.write("\nCluster {:d} (description: )\n".format(current_idx))
            f.write(itr + "\n")
    print("Clustered result saved in {0}".format(output))
Example #3
def clusterer_nltk_kmeans(X, n_clusters):
    # "_args": [{"type": "numpy.ndarray","dtype": "float32"} ],
    #   "_return": [{ "type": "numpy.ndarray","dtype": "int32"}

    # in this case we want to try different numbers of clusters, so it is a parameter
    import nltk
    import numpy as np
    from nltk.cluster.kmeans import KMeansClusterer
    print('clusterer_nltk_kmeans')

    clusterAlgLabelAssignmentsNK = None
    cmtVectors = X
    if isinstance(cmtVectors, np.ndarray) and len(cmtVectors) > 0:
        dt = cmtVectors.dtype
        if dt.type is np.float32 or dt.type is np.float64:
            clusterAlgNK = KMeansClusterer(
                n_clusters,  # the function's parameter
                distance=nltk.cluster.util.cosine_distance,
                repeats=25,
                avoid_empty_clusters=True)
            clusterAlgLabelAssignmentsNK = clusterAlgNK.cluster(
                cmtVectors, assign_clusters=True)

    XY = (X, clusterAlgLabelAssignmentsNK)
    return XY
Example #4
def nltk_manhattan_kmeans(encoded_img):
    from scipy.spatial.distance import cityblock
    from nltk.cluster.kmeans import KMeansClusterer

    kclusterer = KMeansClusterer(2, distance=cityblock, repeats=10)
    assigned_clusters = kclusterer.cluster(encoded_img, assign_clusters=True)

    print_labels(assigned_clusters)
Example #5
def nltk_euclidean_kmeans(encoded_img):
    from nltk.cluster.util import euclidean_distance
    from nltk.cluster.kmeans import KMeansClusterer

    kclusterer = KMeansClusterer(2, distance=euclidean_distance, repeats=10)
    assigned_clusters = kclusterer.cluster(encoded_img, assign_clusters=True)

    print_labels(assigned_clusters)
Example #6
def spherical_clustering_from_adjency(A, nb_clusters):
    """
    Spectral clustering with spherical kmeans
    """
    A = scipy.sparse.csr_matrix(A, dtype=np.float64)
    v, w = eigsh(A, nb_clusters, which='LM')
    clusterer = KMeansClusterer(nb_clusters, distance=nltk.cluster.util.cosine_distance, repeats=25)
    clusterer.cluster(np.transpose(w), True)  # train the centroids; assignments are recomputed below
    vectors = [np.transpose(A[i, :].toarray()[0]) for i in range(0, A.shape[1])]
    clusters = [clusterer.classify(vector) for vector in vectors]
    return clusters
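The "spherical" part is the cosine distance: it depends only on a vector's direction, so k-means effectively operates on the unit sphere. A quick illustration:

import numpy as np
from nltk.cluster.util import cosine_distance

u = np.array([1.0, 2.0, 3.0])
print(cosine_distance(u, 5.0 * u))  # 0.0: same direction, magnitude is ignored
print(cosine_distance(u, -u))       # 2.0: opposite directions are maximally far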
Example #7
def get_cluster(tfidf_arr, k):
    """
    K-means聚类
    :param tfidf_arr:
    :param k:
    :return:
    """
    kmeans = KMeansClusterer(num_means=k,
                             distance=cosine_distance,
                             avoid_empty_clusters=True)  # 分成k类,使用余弦相似分析
    kmeans.cluster(tfidf_arr)

    # Get the cluster assignment for each row
    kinds = pd.Series([kmeans.classify(i) for i in tfidf_arr])
    with open('/you_filed_algos/prod_kudu_data/ClusterText.txt',
              'a+',
              encoding='utf-8') as fw:
        for i, v in kinds.items():
            fw.write(str(i) + '\t' + str(v) + '\n')
Example #8
    def Kmeans(self, volcabulary, vectors, n_cluster):
        """K-means clustering based on cosine similarity of word2vec vectors."""
        kclusterer = KMeansClusterer(
            n_cluster,
            distance=nltk.cluster.util.cosine_distance,
            repeats=10,
            avoid_empty_clusters=True)
        assigned_clusters = kclusterer.cluster(vectors, assign_clusters=True)
        # Group words by their assigned cluster id.
        dic = defaultdict(list)
        for c, w in zip(assigned_clusters, volcabulary):
            dic[c].append(w)
        return assigned_clusters, dic
Example #9
def new_cluster(filepath):
    NUM_CLUSTERS = 4
    data = get_data(filepath)
    kclusterer = KMeansClusterer(NUM_CLUSTERS,
                                 # Chebyshev distance (max absolute coordinate
                                 # difference); a bare np.max(a - b) can go negative
                                 distance=lambda a, b: np.max(np.abs(a - b)),
                                 repeats=1000)
    labels = kclusterer.cluster(data, assign_clusters=True)
    print("Showing the cluster results")
    for cluster_id in range(NUM_CLUSTERS):
        for i in range(len(data)):
            if labels[i] == cluster_id:
                print("Joint : ", i + 1, " Joint Values: ", data[i],
                      " Cluster Id: ", cluster_id)
Example #10
def ClusterItems(data_file, items_bias_file, index_file, clusters_file,
                 centroids_file):

    data = np.genfromtxt(data_file)
    popular_items = np.genfromtxt(index_file).astype('int')
    data = data[popular_items]
    items_bias = np.genfromtxt(items_bias_file)
    important_items = np.where(np.abs(items_bias[popular_items]) < 0.2)[0]  # keep items with small bias
    kclusterer = KMeansClusterer(NUM_CLUSTERS, distance=cosine_distance)
    print(NUM_CLUSTERS, important_items.shape)
    print("end", data.shape)
    clusters = kclusterer.cluster(data[important_items], assign_clusters=True)
    np.savetxt(centroids_file, kclusterer.means())
    np.savetxt(clusters_file, clusters)
Example #11
def main():
    getFiles()
    tf_idf()
    num_clusters = int(sys.argv[2])
    kclusterer = KMeansClusterer(num_clusters,
                                 distance=nltk.cluster.util.cosine_distance,
                                 repeats=25)
    assigned_clusters = kclusterer.cluster(wordvec, assign_clusters=True)
    clustersDict = {}
    for i in range(num_clusters):
        clustersDict[i] = []
    for i in range(len(assigned_clusters)):
        clustersDict[assigned_clusters[i]].append(fileList[i])
    printClustersInFormat(clustersDict)
Example #12
class KMeansClusters(BaseEstimator, TransformerMixin):
    def __init__(self, k=7) -> None:
        self.k = k
        self.distance = nltk.cluster.cosine_distance
        self.model = KMeansClusterer(self.k,
                                     self.distance,
                                     avoid_empty_clusters=True)

    def fit(self, data, labels=None):
        return self

    def transform(self, data):
        return self.model.cluster(data, assign_clusters=True)
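A minimal usage sketch for this transformer; the random data is only for illustration:

import numpy as np

docs = np.random.rand(20, 5)                           # 20 "documents", 5 features each
labels = KMeansClusters(k=3).fit(docs).transform(docs)
print(labels)                                          # one cluster id per row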
Example #13
def cluster(folderName, vectorsize, clusterType):
    corpus = loadXES.get_doc_XES_tagged(folderName + '.xes')
    print('Data Loading finished, ', str(len(corpus)), ' traces found.')

    model = gensim.models.Doc2Vec.load('output/' + folderName + 'T2VVS' +
                                       str(vectorsize) + '.model')

    vectors = []
    NUM_CLUSTERS = 5
    print("inferring vectors")
    for doc_id in range(len(corpus)):
        inferred_vector = model.infer_vector(corpus[doc_id].words)
        vectors.append(inferred_vector)
    print("done")

    if (clusterType == "KMeans"):
        kclusterer = KMeansClusterer(
            NUM_CLUSTERS,
            distance=nltk.cluster.util.cosine_distance,
            repeats=25)
        assigned_clusters = kclusterer.cluster(vectors, assign_clusters=True)
    elif (clusterType == "HierWard"):
        ward = AgglomerativeClustering(n_clusters=NUM_CLUSTERS,
                                       linkage='ward').fit(vectors)
        assigned_clusters = ward.labels_
    elif clusterType == "OCSVM":
        ocsvm = OneClassSVM()
        assigned_clusters = ocsvm.fit_predict(vectors)

    else:
        print(
            clusterType,
            " is not a predefined cluster type. Please use 'KMeans', 'HierWard', or 'OCSVM', or create a definition for ",
            clusterType)
        return
    trace_list = loadXES.get_trace_names(folderName + ".xes")
    clusterResult = {}
    for doc_id in range(len(corpus)):
        clusterResult[trace_list[doc_id]] = assigned_clusters[doc_id]

    resultFile = open(
        'output/' + folderName + 'T2VVS' + str(vectorsize) + clusterType +
        '.csv', 'w')
    for doc_id in range(len(corpus)):
        resultFile.write(trace_list[doc_id] + ',' +
                         str(assigned_clusters[doc_id]) + "\n")

    resultFile.close()
    print("done with ", clusterType, " on event log ", folderName)
Example #14
def cluster(clusterType, vectors, y):
    if (clusterType == "KMeans"):
        kclusterer = KMeansClusterer(
            NUM_CLUSTERS,
            distance=nltk.cluster.util.cosine_distance,
            repeats=25)
        assigned_clusters = kclusterer.cluster(vectors, assign_clusters=True)

    elif (clusterType == "GMM"):
        GMM = GaussianMixture(n_components=NUM_CLUSTERS)
        assigned_clusters = GMM.fit_predict(vectors)

    elif (clusterType == "SVM"):
        classifier = SVC(kernel='rbf', gamma='auto', random_state=0)
        #cross-validation
        assigned_clusters = cross_validation(classifier, vectors, y)

    elif (clusterType == "T2VH"):
        ret = hierarchical.ward_tree(vectors, n_clusters=NUM_CLUSTERS)
        children = ret[0]
        n_leaves = ret[2]
        assigned_clusters = hierarchical._hc_cut(NUM_CLUSTERS, children,
                                                 n_leaves)

    elif (clusterType == "RandomForest"):
        classifier = RandomForestClassifier()
        #cross-validation
        assigned_clusters = cross_validation(classifier, vectors, y)
        # classifier.fit(vectors, y)
        # assigned_clusters=classifier.predict(vectors)

    elif (clusterType == "DecisionTree"):
        classifier = DecisionTreeClassifier()
        #cross-validation
        assigned_clusters = cross_validation(classifier, vectors, y)
        # classifier.fit(vectors, y)
        # assigned_clusters=classifier.predict(vectors)

    elif (clusterType == "LogisticRegression"):
        classifier = sklearn.linear_model.LogisticRegression()
        #cross-validation
        assigned_clusters = cross_validation(classifier, vectors, y)
        # classifier.fit(vectors, y)
        # assigned_clusters=classifier.predict(vectors)

    else:
        print(clusterType, " is not a predefined cluster type.")
        return
    return assigned_clusters
Example #15
def clustering(dataframe, repeats, myStopwords):
    num_clusters = 5
    # define vectorizer parameters
    tfidf_vectorizer = TfidfVectorizer(stop_words=myStopwords)
    # Only process the content, not the title
    tfidf_matrix = tfidf_vectorizer.fit_transform(dataframe["Content"])
    # Convert it to an array
    tfidf_matrix_array = tfidf_matrix.toarray()
    # Run K-means with cosine distance as the metric
    kclusterer = KMeansClusterer(num_clusters,
                                 distance=cosine_distance,
                                 repeats=repeats)
    # Output to assigned_clusters
    assigned_clusters = kclusterer.cluster(tfidf_matrix_array,
                                           assign_clusters=True)
    # cluster_size counts how many elements each cluster contains
    cluster_size = [0, 0, 0, 0, 0]
    # Create a 5x5 array and fill it with zeros
    matrix = [[0 for x in range(5)] for y in range(5)]
    # For every category
    for category in categories:
        # For every article
        for row in range(0, len(assigned_clusters)):
            # Compare the cluster number with the category number
            if assigned_clusters[row] == categories.index(category):
                ind = categories.index(dataframe.iloc[row, 4])
                matrix[categories.index(category)][ind] += 1
    # Count how many elements each cluster contains
    for row in range(0, len(assigned_clusters)):
        cluster_size[assigned_clusters[row]] += 1
    for x in range(5):
        for y in range(5):
            # Calculate frequency
            matrix[x][y] /= cluster_size[x]
            # Only keep the 2 first decimal digits
            matrix[x][y] = format(matrix[x][y], '.2f')
    # Output to a .csv file
    with open("output/clustering_KMeans.csv", 'w') as out_file:
        wr = csv.writer(out_file, delimiter="\t")
        newCategories = list(categories)  # copy, so the shared list is not mutated
        newCategories.insert(0, "\t")
        wr.writerow(newCategories)
        for x in range(5):
            newMatrix = matrix[x]
            newMatrix.insert(0, "Cluster " + str(x + 1))
            wr.writerow(newMatrix)
Example #16
def recluster(df, cl, clusters, n_clusters):
    lbls = cl.labels_
    mask = np.zeros(len(lbls), dtype=bool)
    for c in clusters:
        mask |= lbls == c
    subpipe, results = data_pipeline(df[mask])

    # Use cosine similarity via the NLTK clustering implementation;
    # a KMeans cluster object is kept as a carrier for consistency.
    subcl = cluster(results, n_clusters)
    kclusterer = KMeansClusterer(n_clusters, distance=nltk.cluster.util.cosine_distance, repeats=50)
    assigned_clusters = kclusterer.cluster(results, assign_clusters=True)
    #assign new cluster labels and cluster centroids
    subcl.labels_ = np.array(assigned_clusters)
    subcl.cluster_centers_ = np.array(kclusterer.means())
    
    return subpipe, subcl, results, df[mask]
Example #17
    def cluster_docs(self):

        vectors = []
        used_lines = []

        for doc, id in self.es_docs():
            tokens = text_cleaner.clean_tokens(doc)
            if tokens != 'NC' and len(tokens) > 200:
                used_lines.append(tokens)
                vectors.append(self.model.infer_vector(tokens))

        kclusterer = KMeansClusterer(
            NUM_CLUSTERS,
            distance=nltk.cluster.util.cosine_distance,
            repeats=25)
        assigned_clusters = kclusterer.cluster(vectors, assign_clusters=True)

        print("done")
Example #18
class kmeans_cosine(object):
    def __init__(self,k):
        self.k = k
        self.model = KMeansClusterer(k, distance=nltk.cluster.util.cosine_distance, repeats=25)

    def build(self, X, p):
        """Cluster X and return a mask of the rows that share p's cluster."""
        data = scipy.sparse.csr_matrix(X).toarray()
        kclusters = np.array(self.model.cluster(data, assign_clusters=True))
        prediction = self.model.classify(p)
        cluster_id = kclusters == prediction
        return cluster_id, prediction

    def save(self, filename="model2.pkl"):
        """Persist the fitted clusterer with pickle."""
        with open(filename, 'wb') as f:  # pickle requires binary mode
            pickle.dump(self.model, f)
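A minimal usage sketch with random data (the shapes are assumptions; classify expects a single vector):

import numpy as np

km = kmeans_cosine(3)
mask, label = km.build(np.random.rand(30, 4), np.random.rand(4))
print(label, int(mask.sum()))  # predicted cluster id and the size of that cluster
km.save()                      # writes model2.pkl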
Example #19
# create counter and idf vectors

count_vect = TfidfVectorizer(stop_words=stop_words)
count_vect.fit(df['Content'])  # 12266
X_train_counts = count_vect.transform(df['Content'])

# reduce size of vector with LSI

svd = TruncatedSVD(n_components=5)
X_train_counts = svd.fit_transform(X_train_counts)


# Clustering

kclusterer = KMeansClusterer(num_means=5, distance=cosine_distance, repeats=25, avoid_empty_clusters=True)
clusters = kclusterer.cluster(X_train_counts, assign_clusters=True)
# print "Clusters:\n " , clusters
# print "Means" , kclusterer.means()


# Prepare results Matrix

categories_map={
'Politics': 0,
'Business': 1,
'Film': 2,
'Technology': 3,
'Football': 4
}

labels_map={
Example #20
# Part 1 - Compute Kmeans using Cosine distance with 5 clusters

# Use KMeans Clusterer from Natural Language Toolkit Package as this allows Cosine distance to be used as distance measure

# Import packages from NLTK

from nltk.cluster.kmeans import KMeansClusterer
from nltk.cluster.util import cosine_distance

# In[65]:

# Perform clustering using Cosine distance
km_cos = KMeansClusterer(5,
                         distance=cosine_distance,
                         avoid_empty_clusters=True)
km_cos_cl = km_cos.cluster(X, assign_clusters=True)

# In[103]:

# Part 2 - Calculate Adjusted Rand Score and MI Score
print("Adjusted Rand Score (Cosine Distance): " +
      str(met.adjusted_rand_score(true_Labels, km_cos_cl)))
print("\nAdjusted Mutual Information Score (Cosine Distance): " +
      str(met.adjusted_mutual_info_score(true_Labels, km_cos_cl)))

# Clustering using Cosine distance produces far better results on both the Adjusted Rand and Adjusted MI scores.  As with Euclidean distance, attempting clustering after performing PCA reduced both scores (see Appendix).

# In[222]:

# Part 3 - Run 50 random initializations
Example #21
    lines = open(datacfg).readlines()
    images = []
    for line in lines:
        if (line.split(' ')[0] == 'train'):
            valid_path = line.strip().split(' ')[-1]
            if (valid_path[0] != '/'):
                valid_path = workspace + valid_path
            lists = open(valid_path).readlines()
            images = [x.strip() for x in lists]

    bboxes = []
    for image in images:
        label = image.replace('.jpg', '.txt')
        lines = open(label).readlines()
        for line in lines:
            splitline = line.split(' ')
            # bboxes.append([float(x)*13. for x in splitline[-2:]])
            bboxes.append([float(splitline[-2])*1., float(splitline[-1])*1.])
    print(len(bboxes))
    # samples = random.sample(bboxes, 15000)
    # print(len(samples))
    bboxes = np.array(bboxes)
    # samples = np.array(samples)
    # print(samples.shape)

    kclusterer = KMeansClusterer(5, negIoU, repeats=1)
    # clusters = kclusterer.cluster(samples, True)
    clusters = kclusterer.cluster(bboxes, True)
    centroids = kclusterer.means()
    print(np.array(centroids))
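negIoU is defined elsewhere in this project; a plausible sketch, assuming it returns 1 - IoU for two (width, height) boxes sharing a common center, the usual distance for anchor-box clustering:

def negIoU(box_a, box_b):
    # With a shared center, the intersection is the smaller extent per axis.
    inter = min(box_a[0], box_b[0]) * min(box_a[1], box_b[1])
    union = box_a[0] * box_a[1] + box_b[0] * box_b[1] - inter
    return 1.0 - inter / union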
Example #22
        if score < 0.7:
            break
        try:
            arr = numpy.append(arr,
                               numpy.reshape(model.wv.word_vec(phrase),
                                             (1, 100)),
                               axis=0)
        except KeyError:
            pass
        else:
            embedded_phrases.append(phrase)

    print('number of sample points:', len(embedded_phrases))

    kmeans = KMeansClusterer(6, nltk.cluster.util.cosine_distance)
    clusters = kmeans.cluster(arr, assign_clusters=True)
    centers = kmeans.means()

    result = {0: [], 1: [], 2: [], 3: [], 4: [], 5: []}
    for i in range(len(clusters)):
        result[clusters[i]].append([
            nltk.cluster.util.cosine_distance(centers[clusters[i]], arr[i]),
            embedded_phrases[i]
        ])
    for k in result:
        sorted_result = sorted(result[k], reverse=True)
        final_result = '\n'.join(
            ['%.10f' % x[0] + '\t' + x[1] for x in sorted_result])
        with open('cluster' + str(k) + '.txt', 'w+') as f:  # close the handle
            f.write(final_result)
Example #23
from nltk.cluster.util import cosine_distance
from sklearn import manifold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from assignment_2.data.data_reader import get_speeches
from assignment_2.data.president import Trump

speeches = get_speeches()

speeches_sum = speeches.groupby(['president'])[['script']].sum()
tfidf = TfidfVectorizer(stop_words='english')
X = tfidf.fit_transform(speeches_sum.script).toarray()
tfidf_matrix = X  # same transform; no need to refit
kmeans = KMeansClusterer(27, distance=cosine_distance, repeats=25)
clusters = kmeans.cluster(X, assign_clusters=True)

speakers = speeches_sum.index.tolist()

cluster_df = pd.DataFrame(speakers, clusters)  # values: presidents, index: assigned cluster ids
cluster_df = cluster_df.reset_index()
cluster_df.columns = ['president_index', 'president']
print(clusters)

trump_index = cluster_df[cluster_df['president'] ==
                         Trump]['president_index'].iloc[0]

similar_speakers = [
    speakers[i] for i, c in enumerate(clusters)
    if (c == trump_index) and (i != 8)
]
Example #24
# #Plot the clusters obtained using k means
# fig = plt.figure()
# ax = fig.add_subplot(111)

# scatter = ax.scatter(big_data_copy['Accounting'],big_data_copy['3D Printing'],
#                       c=kmeans[0],s=50)

# plt.colorbar(scatter)

# this one is not working out...dataframe might not be correct format
NUM_CLUSTERS = 10
kclusterer = KMeansClusterer(NUM_CLUSTERS,
                             distance=nltk.cluster.util.cosine_distance,
                             repeats=25)
assigned_clusters = kclusterer.cluster(big_data_copy, assign_clusters=True)
'''NEW PLAN, thanks Evan
    ONE-HOT ENCODING BUT WITH ADDED-UP VECTORS
    ex:
            Math 1  Art 2  Math 3  CS 50
    Joe       1       0      0       0
    Bob       0       0      1       0
    Smith     1       0      0       0
    Bob       0       1      0       0
    Smith     0       0      0       1

    groupByIndividual: alphabetical is fine, probably; we just
    want rows for the same name next to each other

            Math 1  Art 2  Math 3  CS 50
    Joe       1       0      0       0
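A small pandas sketch of that plan (the 'name' and 'course' column names are assumptions): one-hot encode each enrollment row, then sum the rows per person.

import pandas as pd

df = pd.DataFrame({
    'name':   ['Joe', 'Bob', 'Smith', 'Bob', 'Smith'],
    'course': ['Math 1', 'Math 3', 'Math 1', 'Art 2', 'CS 50'],
})
one_hot = pd.get_dummies(df['course'])   # one 0/1 row per enrollment
one_hot['name'] = df['name']
vectors = one_hot.groupby('name').sum()  # one summed course vector per person
print(vectors)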
Example #25
    def cluster(self, docs_repr):
        kclusterer = KM(self.n_clusters, distance=cosine_distance, repeats=25,
                        avoid_empty_clusters=True)
        assigned_clusters = kclusterer.cluster(docs_repr, assign_clusters=True)
        return assigned_clusters
Example #26
articals = []
for cutword in Red_df.cutword:
    articals.append(" ".join(cutword))
## Build the corpus and compute the document-term TF-IDF matrix
vectorizer = CountVectorizer()
transformer = TfidfVectorizer()
tfidf = transformer.fit_transform(articals)

## tfidf is stored as a sparse matrix; convert it to a dense document-term array
dtm = tfidf.toarray()

## K-means clustering with cosine distance
kmeans = KMeansClusterer(num_means=2,       # number of clusters
                         distance=nltk.cluster.util.cosine_distance,  # cosine distance
                         )
kmeans.cluster(dtm)

## Cluster label assigned to each document
labpre = [kmeans.classify(i) for i in dtm]
kmeanlab = Red_df[["ChapName", "Chapter"]].copy()  # copy to avoid SettingWithCopyWarning
kmeanlab["cosd_pre"] = labpre
kmeanlab


## Count how many chapters fall into each cluster
count = kmeanlab.groupby("cosd_pre").count()

## Visualize the cluster sizes
count.plot(kind="barh", figsize=(6, 5))
for xx, yy, s in zip(count.index, count.ChapName, count.ChapName):
    plt.text(y=xx - 0.1, x=yy + 0.5, s=s)
Example #27
    # Intersection rectangle of two boxes given as (center x, center y, w, h).
    x_a = torch.max(x1 - w1 / 2.0, x2 - w2 / 2.0)
    y_a = torch.max(y1 - h1 / 2.0, y2 - h2 / 2.0)
    x_b = torch.min(x1 + w1 / 2.0, x2 + w2 / 2.0)  # right edge uses width, not height
    y_b = torch.min(y1 + h1 / 2.0, y2 + h2 / 2.0)  # nearer bottom edge bounds the overlap

    intersection = torch.clamp(x_b - x_a, min=0) * torch.clamp(y_b - y_a,
                                                               min=0)
    union = w1 * h1 + w2 * h2 - intersection

    return intersection / (union + 1e-6)


kclusterer = KMeansClusterer(args.num_bbox,
                             distance=nltk.cluster.util.cosine_distance,
                             repeats=25)
assigned_clusters = kclusterer.cluster(data, assign_clusters=True)

kmeans_wh = KMeans(n_clusters=args.num_bbox)
kmeans_wh.fit(train_wh)
bbox_priors = kmeans_wh.cluster_centers_
np.save('priors.npy', bbox_priors)
bbox_priors = torch.from_numpy(bbox_priors).cuda()

# Set up the network

features = DenseNet(growth_rate=8,
                    block_config=(4, 8, 16, 32),
                    activation=nn.LeakyReLU(inplace=True),
                    input_channels=3)

classifier = YOLOClassifier(features.output_channels, bbox_priors,
Example #28
print("inferring vectors")
duplicate_dict = {}
used_lines = []
for i, t in enumerate(lines):
    if i % 2 == 0 and t not in duplicate_dict:
        duplicate_dict[t] = True
        used_lines.append(t)
        vectors.append(model.infer_vector(preprocess_document(t)))

print("done")

kclusterer = KMeansClusterer(NUM_CLUSTERS,
                             distance=nltk.cluster.util.cosine_distance,
                             repeats=25)
assigned_clusters = kclusterer.cluster(vectors, assign_clusters=True)

# clustersizes = []
#
# def distanceToCentroid():
#     for i in range(0,NUM_CLUSTERS):
#         clustersize = 0
#         for j in range(0,len(assigned_clusters)):
#             if (assigned_clusters[j] == i):
#                 clustersize+=1
#         clustersizes.append(clustersize)
#         dist = 0.0
#         centr = means[i]
#         for j in range(0,len(assigned_clusters)):
#             if (assigned_clusters[j] == i):
#                 dist += pow(nltk.cluster.util.cosine_distance(vectors[j], centr),2)/clustersize
Example #29
def main(argv):
    # Parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input_file', help='input file', required=True)
    parser.add_argument('-s', '--step', help='step', required=True)
    parser.add_argument('-ik', '--init_k', help='K initial', required=True)
    parser.add_argument('-fk', '--final_k', help='K final', required=True)
    parser.add_argument('-od',
                        '--distortion_out_file',
                        help='elbow distortion graph file',
                        required=True)
    parser.add_argument('-os',
                        '--silhouette_out_file',
                        help='elbow silhouette graph',
                        required=True)
    parser.add_argument('-pca', '--pca', help='with pca', action='store_true')
    parser.add_argument('-k_pca', '--k_pca', help='k pca')
    ARGS = parser.parse_args()

    descriptors = load_dataset(ARGS.input_file)
    if ARGS.pca:
        print("With pca")
        pca = PCA(n_components=int(ARGS.k_pca))
        descriptors = pca.fit_transform(descriptors)

    ks = []
    distortions = []
    silhouettes = []

    for k in range(int(ARGS.init_k), int(ARGS.final_k), int(ARGS.step)):
        # kmeanModel = KMeans(n_clusters=k, init='k-means++')
        # kmeanModel.fit(descriptors)
        # predictions = kmeanModel.predict(descriptors)
        # cluster_centers_ = kmeanModel.cluster_centers_

        kclusterer = KMeansClusterer(
            k, distance=nltk.cluster.util.cosine_distance)
        predictions = kclusterer.cluster(descriptors, assign_clusters=True)
        predictions = np.array(predictions)
        cluster_centers_ = np.array(kclusterer.means())

        distortion = sum(
            np.min(distance.cdist(descriptors, cluster_centers_, 'cosine'),
                   axis=1)) / descriptors.shape[0]

        silhouette_score = metrics.silhouette_score(descriptors,
                                                    predictions,
                                                    metric='cosine')

        distortions.append(distortion)
        silhouettes.append(silhouette_score)
        ks.append(k)

        print("k:", k, "distortion:", distortion, "Silhouette Coefficient",
              silhouette_score)

    # Plot the elbow with distortion
    fig = plt.figure()
    plt.plot(ks, distortions, 'bx-')
    plt.grid()
    plt.xlabel('k')
    plt.ylabel('Distortion')
    plt.title('The Elbow Method')
    fig.savefig(ARGS.distortion_out_file)

    # Plot the silhouette scores
    fig = plt.figure()
    plt.plot(ks, silhouettes, 'bx-')
    plt.grid()
    plt.xlabel('k')
    plt.ylabel('Silhouette Score')
    plt.title('Silhouette Score analysis')
    fig.savefig(ARGS.silhouette_out_file)
Example #30
def k_means_cluster():
    print("K means")

    # file = open("Subtree/sports.team.pro_athlete.txt","r")
    # op_pos = open("K_means/sports.team.pro_athlete_pos.txt","w")
    # op_neg = open("K_means/sports.team.pro_athlete_neg.txt","w")

    file = open("Test/test_subtree.txt", "r")
    op_pos = open("Test/test_sub_pos.txt", "w")
    op_neg = open("Test/test_sub_neg.txt", "w")

    new_list = file.readlines()
    new = []
    filtered_list = []
    temp_list = []

    for item in new_list:
        new.append(item)
        if (item == "\n"):
            temp_list.append(new)
            new = []

    for item in temp_list:
        if (len(item) == 1):
            continue
        else:
            filtered_list.append(item)

    data_matrix = random_sample.data_mat()
    new = filtered_list
    NUM_CLUSTERS = 5
    print("Start assigning clusters")
    kclusterer = KMeansClusterer(NUM_CLUSTERS,
                                 distance=nltk.cluster.util.cosine_distance,
                                 repeats=100,
                                 avoid_empty_clusters=True)
    assigned_clusters = kclusterer.cluster(data_matrix, assign_clusters=True)
    #print("Assigned clusters ",assigned_clusters)
    # Count how many items landed in each of the five clusters.
    counts = [assigned_clusters.count(c) for c in range(NUM_CLUSTERS)]

    print("Cluster grouping")
    # Items in the two largest clusters are written to op_pos; everything
    # else goes to op_neg. Ties are broken by the lower cluster id.
    ranked = sorted(range(NUM_CLUSTERS), key=lambda c: -counts[c])
    positive_clusters = set(ranked[:2])

    for item in range(len(assigned_clusters)):
        target = op_pos if assigned_clusters[item] in positive_clusters else op_neg
        print(new[item], file=target)

    op_pos.close()
    op_neg.close()