Example #1
    def fit(self, X, y):
        if self.inferStds:
            # compute stds from data
            self.centers, self.stds = km.k_means(X, self.k)
        else:
            # use a fixed std
            self.centers, _ = km.k_means(X, self.k)
            dMax = max([
                np.abs(c1 - c2) for c1 in self.centers for c2 in self.centers
            ])
            self.stds = np.repeat(dMax / np.sqrt(2 * self.k), self.k)

        # training
        for epoch in range(self.epochs):
            for i in range(X.shape[0]):
                # forward pass
                a = np.array([
                    self.rbf(X[i], c, s)
                    for c, s in zip(self.centers, self.stds)
                ])
                F = a.T.dot(self.w) + self.b

                loss = (y[i] - F).flatten()**2
                # print('Loss: {0:.2f}'.format(loss[0]))

                # backward pass
                error = -(y[i] - F).flatten()

                # online update
                #chu = np.dot(a , error)
                self.w = self.w - self.lr * a * error
                self.b = self.b - self.lr * error
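
The `fit` method above calls `self.rbf`, which is not shown in the snippet. A minimal sketch of the Gaussian basis function it most likely implements (an assumption, since the class body is truncated), for scalar inputs and centers:

import numpy as np

def gaussian_rbf(x, c, s):
    # Gaussian radial basis: exp(-(x - c)^2 / (2 s^2)); a guess at what self.rbf computes
    return np.exp(-((x - c) ** 2) / (2 * s ** 2))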
Example #2
def compare_dist(iters=100):
	pts = gen_data()
	cs = []
	dist = []
	for i in range(iters):
		c,_ = kmeans.k_means(pts, 3)
		cpp,_ = kmeans.k_means(pts, 3, kmeans.k_means_pp_initialization)
		d = kmeans.compute_avg2_distortion(pts, c)
		dpp = kmeans.compute_avg2_distortion(pts, cpp)
		dist.append([d,dpp])
		cs.append(np.array([c,cpp]))
	return np.array(dist), np.array(cs)
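
`compare_dist` returns an `(iters, 2)` distortion array with random-init results in column 0 and k-means++ results in column 1, so the two schemes can be compared with column means. A hypothetical summary, assuming the `gen_data` and `kmeans` helpers this example imports are available:

dist, cs = compare_dist(iters=100)
mean_rand, mean_pp = dist.mean(axis=0)  # average distortion per initialization scheme
print('random init: %.3f, k-means++ init: %.3f' % (mean_rand, mean_pp))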
Example #3
def validar_agrupamento(df):
    # Run k-means for K=2, K=3, K=4
    for i in range(2, 5):
        # Drop the last column, since it is the class label; the remaining columns feed the k-means algorithm
        data_sem_classe = df.drop(df.columns[-1], axis=1, inplace=False)

        # Call the function that runs k-means
        data = k_means(data_sem_classe, i)

        # Add the "Classe" column back to the DataFrame
        data['Classe'] = df[df.columns[-1]]

        # Group by 'Classe' and 'Grupo'
        grouped_df = data.groupby(["Classe", "Grupo"])

        # Convert the object to a DataFrame and compute the count in each group
        data_contagem = pd.DataFrame(
            grouped_df.size().reset_index(name="Contagem"))

        # Collect every class label, dropping duplicates caused by classes spanning more than one group
        classeList = sorted(set(data_contagem['Classe'].values.tolist()))

        # Build a DataFrame structure for computing the purity
        data_estrutura = montar_estrutura(data_contagem, classeList, i)

        # Compute the purity
        data_pureza = calcula_pureza(data_estrutura, i)
Example #4
def main():
    clusters = args.clusters

    if clusters not in [4, 6, 8, 10]:
        raise ValueError("Number of clusters must be 4, 6, 8, or 10!")

    data_path = './data/Mall_Customers.csv'
    attr = ['Gender', 'Age', 'Annual Income (k$)', 'Spending Score (1-100)']

    f1, f2 = 'Age', 'Spending Score (1-100)'

    df = pd.read_csv(data_path)
    data = df[[f1, f2]].to_numpy()

    if f1 == 'Gender':
        gender = data[:, 0]

        gender[gender == 'Male'] = 5.
        gender[gender == 'Female'] = 10.
        print(gender)
        data[:, 0] = gender

    print(f'Shape of the data: {data.shape}')

    km = k_means(num_clusters=clusters, tol=1e-4)
    est_centroid, history_centroids, predict_labels, loss, num_iter = km.get_cluster(
        data)

    plotting(predict_labels, data, clusters, est_centroid, f1, f2)
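
When `f1` is set to `'Gender'`, the snippet relabels the object array in a loop. A vectorized alternative with `pandas.Series.map` (keeping the same hypothetical 5./10. encoding used above) would be:

# Sketch: vectorized replacement for the manual Gender relabeling above
df['Gender'] = df['Gender'].map({'Male': 5., 'Female': 10.})
data = df[[f1, f2]].to_numpy(dtype=float)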
Example #5
def outer_criteria(X, k_range, reference, iterations=200):
    n = len(X)
    RS = dict()  # Rand Statistic
    FM = dict()  # Fowlkes-Mallows

    for k in k_range:
        TP, FN, FP, TN = (0, ) * 4
        if k < 2:
            continue

        _, clusters = kmeans.k_means(X, k, iterations)
        # Compute TP, FN, FP, TN.
        for i in range(n):
            for j in range(i + 1, n):
                if clusters[i] == clusters[j] and reference[i] == reference[j]:
                    TP += 1
                elif clusters[i] != clusters[j] and reference[i] == reference[j]:
                    FN += 1
                elif clusters[i] == clusters[j] and reference[i] != reference[j]:
                    FP += 1
                else:
                    TN += 1
        RS[k] = (TP + TN) / (TP + TN + FP + FN)  # normalize by the number of point pairs
        precision = TP / (TP + FP)
        recall = TP / (TP + FN)
        FM[k] = np.sqrt(precision * recall)

    best_rs = max(RS, key=RS.get)
    best_fm = max(FM, key=FM.get)

    return RS, best_rs, FM, best_fm
Example #6
def k_means_test():
    X = np.concatenate((np.random.normal(5, 30,
                                         100), np.random.normal(150, 44, 100),
                        np.random.normal(300, 20, 100)))
    Y = np.concatenate((np.random.normal(5, 30,
                                         100), np.random.normal(150, 44, 100),
                        np.random.normal(300, 20, 100)))
    data = np.column_stack((X, Y))

    clusters, result, centroids = k_means(data, 3, euclidean_dist)

    cluster_points = []
    for cluster in clusters:
        Xaux = [X[i] for i in cluster]
        Yaux = [Y[i] for i in cluster]
        cluster_points.append((Xaux, Yaux))

    plt.figure()
    colors = [
        'green', 'red', 'blue', 'yellow', 'gray', 'lightblue', 'pink', 'black'
    ]
    i = 0
    for cluster in cluster_points:
        X, Y = cluster
        plt.scatter(X, Y, c=colors[i])
        i = (i + 1) % len(colors)

    plt.show()
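
`k_means_test` passes a `euclidean_dist` callable that is not defined in the snippet; a minimal sketch of the helper it presumably expects:

import numpy as np

def euclidean_dist(a, b):
    # Straight-line distance between two points
    return np.linalg.norm(np.asarray(a) - np.asarray(b))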
Example #7
def spectral_cluster(W, k):
    L = get_laplacian(W)
    Lambda, v = np.linalg.eig(L)
    length = len(Lambda)
    eigen_pairs = [(Lambda[i], v[:, i]) for i in range(length)]
    eigen_pairs = sorted(eigen_pairs, key=lambda pair: pair[0])
    temp = np.column_stack([eigen_pairs[i][1] for i in range(k)])
    result = kmeans.k_means(temp.T, 2, 5)

    return result
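
`get_laplacian` is external to this snippet. Given that the code sorts eigenpairs in ascending order, it presumably returns the unnormalized graph Laplacian; a sketch under that assumption:

import numpy as np

def get_laplacian(W):
    # Unnormalized graph Laplacian L = D - W, with D the diagonal degree matrix
    D = np.diag(np.asarray(W).sum(axis=1))
    return D - W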
Example #8
def main():

    print "\n>> Since matplotlib has inconsistent behaviour, I am saving all the generated plots in the directory './images'\n"

    if not os.path.exists('./images'):
        os.mkdir('./images')
    
    print "> Running K-Means for blob_data\n\n"
    blob_data = np.genfromtxt('./hw5_blob.csv', delimiter=',')
    for k in [2, 3, 5]:
        clusters = k_means(blob_data, k, "blob", True)
    
    print "> Running K-Means for circle_data\n\n"
    circle_data = np.genfromtxt('./hw5_circle.csv', delimiter=',')
    for k in [2, 3, 5]:
        clusters = k_means(circle_data, k, "circle", True)
    
    print "> Running Kernel K-Means for circle_data\n\n"
    kernel_kmeans.main()

    print "> Running EM Algorithm for blob_data\n\n"
    em_algorithm.main()
Example #9
def compress_image(im_path, k, init_f=uniform_mode_dist_init):
    """
	returns the original and compressed version of an image together
	with time profile data

	Arguments:
	im_path: string
	k: int
	init_f: function of 2d array x 1d array x int x function -> 2d array

	Output:
	image: numpy 2d numerical array
	compressed_image: numpy 2d numerical array
	mse: float
	aid: float
	time_profile: dict of string -> float
	"""
    # Read image
    image = cv2.imread(im_path)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    # Store original shape
    original_shape = image.shape
    # Reshape into a numpy 2d array
    image = image.reshape([image.shape[0] * image.shape[1], 3])

    # Run k-means
    t0 = time.time()
    c_means, clusters, mse, time_profile = k_means(image, k, rgb_distance,
                                                   init_f, pixel_to_str,
                                                   str_to_pixel)
    t1 = time.time()
    time_profile['k_means'] = t1 - t0

    # Create compressed image
    compressed_image = np.zeros(image.shape, dtype=np.uint8)
    for i in range(compressed_image.shape[0]):
        compressed_image[i] = c_means[clusters[i]].astype(np.uint8)

    # Return to original shape
    image = image.reshape(original_shape).astype(np.uint8)
    compressed_image = compressed_image.reshape(original_shape).astype(
        np.uint8)
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    compressed_image = cv2.cvtColor(compressed_image, cv2.COLOR_RGB2BGR)

    aid = ptp_idm(image, compressed_image)

    return image, compressed_image, mse, aid, time_profile
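
A hypothetical call to `compress_image`, assuming a local `sample.png` and that the `k_means` and `uniform_mode_dist_init` helpers from this example's module are importable:

import cv2

original, compressed, mse, aid, profile = compress_image('sample.png', k=16)
print('MSE: %.2f, k-means took %.2fs' % (mse, profile['k_means']))
cv2.imwrite('sample_compressed.png', compressed)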
Example #10
def main(argv):
    config_path = args.conf
    num_anchors = args.anchors

    with open(config_path) as config_buffer:
        config = json.loads(config_buffer.read())

    data = {
        'train': {
            'image_folder': config['train']['train_image_folder'],
            'annot_folder': config['train']['train_annot_folder'],
        },
        'valid': {
            'image_folder': config['valid']['valid_image_folder'],
            'annot_folder': config['valid']['valid_annot_folder'],
        }
    }

    video_folder_list, video_annot_list = data_preparation(data['train'],
                                                           FOR_YOLO=True)

    grid_w = config['model']['input_size'] / 32
    grid_h = config['model']['input_size'] / 32

    cell_w = 1280.0 / grid_w
    cell_h = 720.0 / grid_h

    # run k_mean to find the anchors
    annotation_dims = []
    for video_annot in video_annot_list:
        labels = np.loadtxt(video_annot, delimiter=',')
        for label in labels:
            relative_w = label[2] / cell_w
            relative_h = label[3] / cell_h
            if math.isnan(relative_w) or math.isnan(relative_h):
                # print("NaN annotations! {}".format(basename(video_annot)))
                pass
            else:
                annotation_dims.append(map(float, (relative_w, relative_h)))
    annotation_dims = np.array(annotation_dims)

    centroids, cluster_assignment = k_means(annotation_dims, num_anchors)

    # write anchors to file
    print '\naverage IOU for', num_anchors, 'anchors:', '%0.2f' % avg_IOU(
        annotation_dims, centroids)
    print_anchors(centroids)
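
`avg_IOU` and `print_anchors` are defined elsewhere. Anchor-generation scripts like this one usually score a box against a centroid by overlapping them at a common center, so only widths and heights matter; a sketch of that convention (an assumption, not this repo's confirmed code):

import numpy as np

def iou_wh(box, centroid):
    # IOU of two (w, h) boxes aligned at a shared center
    w = min(box[0], centroid[0])
    h = min(box[1], centroid[1])
    inter = w * h
    return inter / (box[0] * box[1] + centroid[0] * centroid[1] - inter)

def avg_IOU(dims, centroids):
    # Mean IOU between each annotation and its best-matching centroid
    return np.mean([max(iou_wh(d, c) for c in centroids) for d in dims])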
Example #11
def main():
    #data, label = datasets.load_iris().data, datasets.load_iris().target
    #data = load_data('F:\machine learning\PythonCode\dataset\cluster\spectral_cluster_data.txt')
    data, label = load_data_with_label('F:\machine learning\PythonCode\dataset\cluster\Spiral.txt')
    data = normal_data(data)
    # Show the original data distribution
    plt.ion()
    #kmeans.mat_plot_sample(data)
    mat_plot_cluster_sample(data, label, 'original sample')
    # k-means clustering, as a comparison experiment
    k_dist1, k_centers1, cluster_group = kmeans.k_means(data, 3)
    kmeans.mat_plot_k_means_sample(k_dist1, k_centers1)
    # Spectral clustering; the third argument selects the algorithm: 'NJW', 'self-tuning', or 'sl' (default 'sl')
    spectral_cluster(data, 3)
    spectral_cluster(data, 3, 'NJW')
    spectral_cluster(data, 3, 'self-tuning')
    plt.ioff()
    plt.show()
Example #12
def training_model(number_of_templates, using_kmeans, read_feature_from_file=0):
    """
    train the model using DTW to recognize the recordings
    :param number_of_templates: int, number of templates used to train the model
    :param using_kmeans: 1 to run k-means on the templates to generate states
    :param read_feature_from_file: 1 to read the MFCC features from file, 0 to compute them from the recordings
    :return: DTW_obj: a DTW object, the trained model
    :return: inputs: list; every element is a template used for testing
    """
    templates = []
    inputs = []
    for digit in xrange(0, 10):
        temp = get_templates_and_inputs(digit, number_of_templates, read_feature_from_file)
        # print 'temp',temp[0]
        # print 'k_means(temp[0], number_of_states=5)',kmeans.k_means(temp[0], number_of_states=5)
        templates.extend(
            kmeans.k_means(temp[0], number_of_states=5)[0] if using_kmeans and number_of_templates == 5 else temp[0])
        inputs.extend(temp[1])
    DTW_obj = DTW.DTW(templates)
    return DTW_obj, inputs
Example #13
def spectral_clustering(data, k, gamma=0.1):
    print("Spectral clustering...")
    # Number of samples
    n = data.shape[0]

    # Adjacency matrix
    W = rbf_kernel_gram_matrix(data, gamma)

    # Build Degree matrix
    D = compute_degrees(W)

    # Graph Laplacian
    L = D - W

    # Normalized Cut
    D_inv_sqrt = scipy.linalg.fractional_matrix_power(D, -1 / 2)
    L_sym = D_inv_sqrt.dot(L).dot(D_inv_sqrt)

    # Compute the eigenvectors of L_sym; np.linalg.eig does not sort,
    # so order them by ascending eigenvalue before taking the first k.
    eig_values, eig_vectors = LA.eig(L_sym)
    order = np.argsort(eig_values.real)

    # T contains the first k eigenvectors of the normalized Laplacian
    T = eig_vectors.real[:, order[:k]]

    # Substitute back to get matrix H and normalize by its rows.
    H = D_inv_sqrt.dot(T)
    H /= LA.norm(H, axis=1, ord=2)[:, np.newaxis]

    # Cluster data points in eigenspace
    centroids, cluster_assignment = k_means(H, k)

    # Show the data points at the same point in eigenspace.
    # discrete_h = np.zeros_like(H)
    # c, discrete_h[:, 0] = k_means(H[:, 0].reshape(-1, 1), k)
    # c, discrete_h[:, 1] = k_means(H[:, 1].reshape(-1, 1), k)
    # centroids, cluster_assignment = k_means(discrete_h, k)

    return cluster_assignment
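
`rbf_kernel_gram_matrix` and `compute_degrees` are not shown; minimal sketches of the standard definitions they imply (affinity W[i, j] = exp(-gamma * ||x_i - x_j||^2) and the diagonal degree matrix):

import numpy as np

def rbf_kernel_gram_matrix(data, gamma):
    # Pairwise RBF affinities via the squared-distance expansion
    sq = np.sum(data ** 2, axis=1)
    d2 = np.clip(sq[:, None] + sq[None, :] - 2 * data @ data.T, 0, None)
    return np.exp(-gamma * d2)

def compute_degrees(W):
    # Diagonal degree matrix holding each node's total affinity
    return np.diag(W.sum(axis=1))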
Example #14
def inner_criteria(X, k_range, iterations=200):
    DB = dict()  # Davies-Bouldin
    CH = dict()  # Calinski-Harabasz

    for k in k_range:
        if k < 2:
            continue
        centroids, clusters = kmeans.k_means(X, k, iterations)
        DB[k] = DaviesBouldin(X, centroids, clusters)
        CH[k] = CHIndex(X, centroids, clusters)

    ch_list = list(CH.items())

    db_best = min(DB, key=DB.get)
    ch_best = 0
    delta = sys.maxsize
    for k in range(1, len(CH) - 1):
        temp = ch_list[k + 1][1] - 2 * ch_list[k][1] + ch_list[k - 1][1]
        if temp < delta:
            delta = temp
            ch_best = ch_list[k][0]

    return DB, db_best, CH, ch_best
Example #15
def training_model(number_of_templates,
                   using_kmeans,
                   read_feature_from_file=0):
    """
    train the model using DTW to recognize the recordings
    :param number_of_templates: int, number of templates used to train the model
    :param using_kmeans: 1 to run k-means on the templates to generate states
    :param read_feature_from_file: 1 to read the MFCC features from file, 0 to compute them from the recordings
    :return: DTW_obj: a DTW object, the trained model
    :return: inputs: list; every element is a template used for testing
    """
    templates = []
    inputs = []
    for digit in xrange(0, 10):
        temp = get_templates_and_inputs(digit, number_of_templates,
                                        read_feature_from_file)
        # print 'temp',temp[0]
        # print 'k_means(temp[0], number_of_states=5)',kmeans.k_means(temp[0], number_of_states=5)
        templates.extend(
            kmeans.k_means(temp[0], number_of_states=5)[0]
            if using_kmeans and number_of_templates == 5 else temp[0])
        inputs.extend(temp[1])
    DTW_obj = DTW.DTW(templates)
    return DTW_obj, inputs
Example #16
    f = open( 'docs.json', 'r' )
    for doc in f.readlines():
        #Remove the '\n' at the end of doc
        doc = doc.strip('\n')
        doc_dict = json.loads( doc )
        Doc_List.append( doc_dict )
        #obtain Title_List
        Title_List.append( doc_dict[ 'title' ] )       
    f.close()
    
    #Calculate tf-idf of each word in each doc:
    #doc_collection's structure: [doc_vector1,doc_vector2,...]
    #doc_vector's structure:{word1:tf-idf val, word2:tf-idf val,...}
    doc_collection = TF_IDF()
    #this block will generate value_collection. Its structure is: [[1,2,0,1,2,...],[0,2,1,3,0...],...]
    value_collection = []
    for doc_vector in doc_collection:
        doc_value = []
        for word in doc_vector:
            doc_value.append( doc_vector[ word ] )
        value_collection.append( doc_value )

    #print value_collection
    #K Means is coming!
    k = raw_input( "Please input a value for k: " )
    k = int( k )
    
    KMeans = kmeans.k_means( k )
    
    KMeans.K_MEANS( value_collection, Title_List )
Example #17
def serial_kmeans(data, k, dist):
  return kmeans.k_means(data,k,dist)
Example #18
dataSet = []
# Open the data file
fileIn = open('testSet.txt')
# Read every line, splitting on the tab in the middle
for line in fileIn.readlines():
    # Split
    lineArr = line.strip().split('\t')
    # Append to the (ordered) data list, converting to float; each line splits into two fields on the tab, accessed by index
    dataSet.append([float(lineArr[0]), float(lineArr[1])])
# --------------------------

# **************************
# step 2: clustering...
print "step 2: clustering..."
# Use the mat function to convert the list into a matrix
dataSet = mat(dataSet)
# k is the number of cluster centers
k = 4
# Run the k_means function from the kmeans module, passing the matrix and k, and unpack the results
centroids, clusterAssment = kmeans.k_means(dataSet, k)
# **************************

# ++++++++++++++++++++++++++
# step 3: show the result...
print "step 3: show the result..."
# Call the plotting function from the kmeans module
kmeans.showCluster(dataSet, k, centroids, clusterAssment)
# ++++++++++++++++++++++++++
Example #19
        reader = csv.reader(f)  #iterator object
        vote_topic = next(reader)  #acc. to csv data
        headers = next(reader)  #acc. to csv data
        for person, state, district, vote, name, party in reader:
            senator = Senator(name, party, state)
            accumulated_record[senator].append(vote_value[vote])
pprint(accumulated_record, width=500)

record = {
    senator: tuple(votes)
    for senator, votes in accumulated_record.items()
}  #type: Dict[Senator,VoteHistory]
pprint(record, width=500)

#use kmeans to locate the cluster centroids from patterns of votes, assign each senator to the nearest cluster
centroids = k_means(data=record.values(), num_centroids=3)
clustered_votes = assign_data(centroids=centroids, data=record.values())
pprint(clustered_votes)

#build a reverse mapping from a vote history to a list of senators that voted that way
votes_to_senators = defaultdict(
    list)  #type: DefaultDict[VoteHistory, List[Senator]]
for senator, vote_history in record.items():
    votes_to_senators[vote_history].append(senator)

#we know that some senators might have the same voting history,
#also check if all 100 are present even after rearranging by using assertion
assert sum([len(cluster)
            for cluster in votes_to_senators.values()]) == NUM_SENATORS

#display clusters and members of each clusters
Example #20
def cluster_k(db_path, file_name, file_name_p, n_clusters):
    score = kmeans.k_means(dump_path=db_path, file_name=file_name, file_name_p=file_name_p, n_clusters=int(n_clusters))    
    print '{0}:{1}'.format(n_clusters, score)
Example #21
vote_value = {'Nay': -1, 'Not Voting': 0, 'Yea': 1} # type: Dict[str, VoteValue]
accumulated_record = defaultdict(list)              # type: DefaultDict[Senator, List[VoteValue]]
for filename in glob.glob('congress_data/*.csv'):
    with open(filename) as f:
        reader = csv.reader(f)
        vote_topic = next(reader)
        headers = next(reader)
        for person, state, district, vote, name, party in reader:
            senator = Senator(name, party, state)
            accumulated_record[senator].append(vote_value[vote])

# Transform record into plain dict mapping a senator to a tuple of vote values
record = {senator: tuple(votes) for senator, votes in accumulated_record.items()} # type: Dict[Senator, Tuple[VoteValue, ...]]

# Use k-means to locate the cluster centroids and assign senators to the nearest cluster
centroids = k_means(record.values(), k=3, iterations=50)
clustered_votes = assign_data(centroids, record.values())

# Build a reverse mapping from a pattern of votes to senators who voted that way
votes_to_senators = defaultdict(list)   # type: DefaultDict[Tuple[VoteValue, ...], List[Senator]]
for senator, votes in record.items():
    votes_to_senators[votes].append(senator)
assert sum(map(len, clustered_votes.values())) == 100

# Display the clusters and the members of each cluster
for i, votes_in_cluster in enumerate(clustered_votes.values(), start=1):
    print(f'=========== Voting Cluster #{i} ===========')
    party_totals = Counter()            # type: Counter
    for votes in set(votes_in_cluster):
        for senator in votes_to_senators[votes]:
            party_totals[senator.party] += 1
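
Several of the Senate-voting examples in this collection call external `k_means` and `assign_data` helpers with this same calling convention. A compact sketch of what they plausibly do (an approximation for readability, not the verbatim originals):

from collections import defaultdict
from math import dist, fsum
from random import sample

def assign_data(centroids, data):
    # Group each point under its nearest centroid
    groups = defaultdict(list)
    for point in data:
        closest = min(centroids, key=lambda c: dist(c, point))
        groups[closest].append(point)
    return groups

def k_means(data, k=2, iterations=50):
    # Plain Lloyd's algorithm; empty clusters are silently dropped in this sketch
    data = list(data)
    centroids = sample(data, k)
    for _ in range(iterations):
        groups = assign_data(centroids, data)
        centroids = [tuple(fsum(col) / len(col) for col in zip(*points))
                     for points in groups.values()]
    return centroids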
Example #22
for i in range(0, len(test_dataset_np)):
    if test_dataset_np[i][0] == 1:
        test_count[0] += 1
    if test_dataset_np[i][0] == 4:
        test_count[1] += 1
    if test_dataset_np[i][0] == 8:
        test_count[2] += 1

#print(train_count)
train_dataset_np_order = train_dataset_np[
    train_dataset_np[:, 0].argsort()]  #The number:1:1005, 4:652, 8:542
test_dataset_np_order = test_dataset_np[test_dataset_np[:, 0].argsort()]

#print(train_dataset_np_order)

result = k_means(train_dataset_np_order, 3, 1000)
print(result[1])

#A method to assign class labels to each of your clusters:
acc = Cal_Accuracy_testdata(result[1], train_count)
print("The acc of training is :", acc)

#Predict labels on the zip.test data:
labels = testdata_classify(test_dataset_np_order, result[2])
print("Print labels", labels)

#Acc:
acc_test = Cal_Accuracy_testdata(labels, test_count)
print("The acc of test is :", acc_test)

#PCA:
Example #23
def get_transform_relationship_FSM_table(filename, states_in_each_word=5, using_continuous_feature=0, **kwargs):
    """
    From the finite state machine, obtain the template, the state-transition list, the start states, and the emitting states attached to each non-emitting state (contiguous in the template)
    :param filename: finite state machine file name
    :param states_in_each_word: how many states each word consists of
    :param using_continuous_feature: 1 to take the digit templates from continuous speech instead of running k-means
    :param kwargs:
    :return: template, the template of states
             transform_list, whose i-th element is a list of the states from which state i can be entered
             begin_list, the states that can be entered at t=0
             word_in_template, whose i-th element is the word represented by the i-th template entry
    """
    start_states, terminal_states, nonemitting_transform_list = get_info_FSM_table(filename)
    number_of_nonemitting_states = len(nonemitting_transform_list)
    template = []
    digit_template = []  # the i-th entry is the template for digit i
    number_of_frames_in_each_state = []  # the i-th entry is the number of frames in state i
    number_of_frames_in_each_state_for_digit = []  # the i-th entry holds the per-state frame counts for digit i
    covariance_matrix_in_each_state = []
    mean_in_each_state = []
    covariance_matrix_for_each_digit = []
    mean_for_each_digit = []
    word_in_template = []  # the i-th entry is the word represented by the i-th template entry
    if using_continuous_feature:
        digit_template = SR.get_digit_feature_from_continuous_speech()
    for digit in xrange(0, 10):
        temp = SR.get_templates_and_inputs(digit, number_of_templates=10)
        if not using_continuous_feature:
            result = kmeans.k_means(temp[0], number_of_states=5)
            digit_template.extend(result[0])
            covariance_matrix_for_each_digit.append(result[2])
            mean_for_each_digit.append(result[3])
            temp_number_of_frames_in_each_state = [0 for i in xrange(states_in_each_word)]
            for number_of_frames_in_each_state_in_each_template in result[5]:
                for i in xrange(states_in_each_word):
                    temp_number_of_frames_in_each_state[i] += number_of_frames_in_each_state_in_each_template[i]
            number_of_frames_in_each_state_for_digit.append(temp_number_of_frames_in_each_state)
    # print number_of_frames_in_each_state
    # Compute number_of_emitting_states_begin_from_nonemitting_states
    number_of_emitting_states_begin_from_nonemitting_states = []  # the i-th entry counts the states on edges leaving the i-th non-emitting state
    for nonemitting_state in nonemitting_transform_list:
        # nonemitting_state looks like {1: [2, 3, 4, 5, 6, 7, 8, 9], 3: []}
        temp = []
        for edge_list in nonemitting_state.values():
            temp.extend(edge_list)
        for i in temp:
            # Build the template
            covariance_matrix_in_each_state.extend(covariance_matrix_for_each_digit[i])
            mean_in_each_state.extend(mean_for_each_digit[i])
            number_of_frames_in_each_state.extend(number_of_frames_in_each_state_for_digit[i])
            template.extend(digit_template[i])
            word_in_template.append(i)
        number_of_emitting_states = len(set(temp))
        number_of_emitting_states_begin_from_nonemitting_states.append(number_of_emitting_states)
    # Compute begin_states_index_for_each_nonemitting_state
    begin_states_index_for_each_nonemitting_state = []  # the i-th entry counts how many states precede those on edges leaving the i-th non-emitting state
    for i in xrange(number_of_nonemitting_states):
        begin_states_index_for_each_nonemitting_state.append(
            sum(number_of_emitting_states_begin_from_nonemitting_states[:i]))
    # Compute emitting_out_list; the i-th element is a list of the states the i-th non-emitting state can enter
    emitting_out_list = [[] for i in xrange(number_of_nonemitting_states)]
    for i, nonemitting_state in enumerate(nonemitting_transform_list):
        # nonemitting_state looks like {1: [2, 3, 4, 5, 6, 7, 8, 9], 3: []}
        begin_index = begin_states_index_for_each_nonemitting_state[i]
        number_of_edges = sum(map(len, nonemitting_state.values()))
        for j in xrange(number_of_edges):
            emitting_out_list[i].append(states_in_each_word * (begin_index + j))
    changed = True
    # Handle the unconditional-jump case
    while changed:
        new_emitting_out_list = emitting_out_list[:]
        for i, nonemitting_state in enumerate(nonemitting_transform_list):
            for key in nonemitting_state.keys():
                if nonemitting_state[key] == []:
                    new_emitting_out_list[i].extend(new_emitting_out_list[key])
        changed = (new_emitting_out_list != emitting_out_list)
    emitting_out_list = new_emitting_out_list
    # Compute emitting_in_list; the i-th element is a list of the states from which the i-th non-emitting state can be entered
    emitting_in_list = [[] for i in xrange(number_of_nonemitting_states)]
    for i, nonemitting_state in enumerate(nonemitting_transform_list):
        # nonemitting_state looks like {1: [2, 3, 4, 5, 6, 7, 8, 9], 3: []}
        begin_index = begin_states_index_for_each_nonemitting_state[i]
        j = 0
        for key in nonemitting_state.keys():
            for value in nonemitting_state[key]:
                emitting_in_list[key].append(states_in_each_word * (begin_index + j))
                j += 1
    changed = True
    # Handle the unconditional-jump case
    while changed:
        new_emitting_in_list = emitting_in_list[:]
        for i, nonemitting_state in enumerate(nonemitting_transform_list):
            for key in nonemitting_state.keys():
                if nonemitting_state[key] == []:
                    new_emitting_in_list[key].extend(new_emitting_in_list[i])
        changed = (new_emitting_in_list != emitting_in_list)
    emitting_in_list = new_emitting_in_list
    # Compute transform_list; each element is a list of the states that can enter the corresponding state
    transform_list = [[] for i in
                      xrange(sum(number_of_emitting_states_begin_from_nonemitting_states) * states_in_each_word)]
    for i in xrange(number_of_nonemitting_states):
        for next_state in emitting_out_list[i]:
            for cur_state in emitting_in_list[i]:
                transform_list[next_state].append(cur_state + 4)
    for i, element in enumerate(transform_list):
        element.append(i)
        if i % states_in_each_word > 0:
            element.append(i - 1)
        if i % states_in_each_word > 1:
            element.append(i - 2)
    begin_states = emitting_out_list[0]
    return template, transform_list, begin_states, word_in_template, number_of_frames_in_each_state, covariance_matrix_in_each_state, mean_in_each_state
Example #24
vote_value: Dict[str, VoteValue] = {'Nay': -1, 'Not Voting': 0, 'Yea': 1}
accumulated_record: DefaultDict[Senator, List[VoteValue]] = defaultdict(list)
for filename in glob.glob('congress_data/*.csv'):
    with open(filename, encoding='utf-8') as f:
        reader = csv.reader(f)
        vote_topic = next(reader)
        headers = next(reader)
        for person, state, district, vote, name, party in reader:
            senator = Senator(name, party, state)
            accumulated_record[senator].append(vote_value[vote])

# Transform record into plain dict mapping a senator to a tuple of vote values
record: Dict[Senator, Tuple[VoteValue, ...]] = {senator: tuple(votes) for senator, votes in accumulated_record.items() }

# Use k-means to locate the cluster centroids and assign senators to the nearest cluster
centroids = k_means(record.values(), k=3, iterations=50)
clustered_votes = assign_data(centroids, record.values())

# Build a reverse mapping from a pattern of votes to senators who voted that way
votes_to_senators: DefaultDict[Tuple[VoteValue, ...], List[Senator]] = defaultdict(list)
for senator, votes in record.items():
    votes_to_senators[votes].append(senator)
assert sum(len(cluster) for cluster in clustered_votes.values()) == 100

# Display the clusters and the members of each cluster
for i, votes_in_cluster in enumerate(clustered_votes.values(), start=1):
    print(f'=========== Voting Cluster #{i} ===========')
    party_totals: Counter = Counter()
    for votes in set(votes_in_cluster):
        for senator in votes_to_senators[votes]:
            party_totals[senator.party] += 1
Example #25
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from skimage import io
#import imp
#kmean = imp.load_source('./kmeans.py')
#import kmean
import kmeans as keman

pic = io.imread('./data/bird_small.png') / 255.
#io.imshow(pic)
data = pic.reshape(128 * 128, 3)

#k-mean
C, centroids, cost = keman.k_means(pd.DataFrame(data), 16, epoch=10, n_init=3)
compressed_pic = centroids[C].reshape(
    (128, 128, 3))  # each element of C indexes a row of centroids

#sklearn KMeans
#from sklearn.cluster import KMeans
#model = KMeans(n_clusters=16, n_init=100, n_jobs=-1)
#model.fit(data)
#centroids = model.cluster_centers_
#C = model.predict(data)
#compressed_pic = centroids[C].reshape((128,128,3))

fig, ax = plt.subplots(1, 2)
ax[0].imshow(pic)
ax[1].imshow(compressed_pic)
plt.show()
Example #26
NC = list(range(3, 4))  # numOfModels+1  # list of numbers of clusters
accuracies = np.zeros((numOfEPS, len(NC)))
clusteringResult = {}
for numOfClusters in NC:
    clusteringResult[numOfClusters] = []

with open("clustering_result.txt", "w") as fp:
    for numOfClusters in NC:
        # clustering into c groups
        print("Clustering: {} clusters".format(numOfClusters))
    #    kmeans = KMeans(n_clusters=numOfClusters, random_state=0).fit(predVec)
    #    for c in range(numOfClusters):
    #        clusteringResult[numOfClusters].append(np.where(kmeans.labels_ == c)[0])
    #        print(np.where(kmeans.labels_ == c)[0])

        assignments = k_means(predVec, numOfClusters, "L2", "ZerosFarAway")
        fp.write("## number of clusters: "+str(numOfClusters)+"\n")
        for c in range(numOfClusters):
            cluster = np.where(assignments==c)[0]
            clusteringResult[numOfClusters].append(cluster)
            print(cluster)
            fp.write("\t"+str(cluster)+"\n")
        fp.write("\n")

def vote1(participants):
    '''
        Input:
            participants: a list of opinions. Each element in the list is a numpy array, N X 2.
                            N is the number of events. The second dimension contains (opinion/label, confidence)
        Output:
            voteResult  : a numpy array N X 2 representing opinion and confidence across N events 
Example #27
        reader = csv.reader(f)
        vote_topic = next(reader)
        header = next(reader)
        for person, state, district, vote, name, party in reader:
            senator = Senator(name, party, state)
            accumulated_record[senator].append(vote_value[vote])

# Transform the record into a plain dict that maps to a tuple of votes.
record = {
    senator: tuple(votes)
    for senator, votes in accumulated_record.items()
}  # type: Dict[Senator, VoteHistory]

# Use k-means to locate the cluster centroids from pattern of votes, assign
# each senator to the nearest cluster.
centroids = k_means(record.values(), k=3)
clustered_votes = assign_data(centroids, record.values())

# Build a reverse mapping from a vote history to a list of senators who voted
# that way.
votes_to_senators: DefaultDict[VoteHistory, List[Senator]] = defaultdict(list)
for senator, votehistory in record.items():
    votes_to_senators[votehistory].append(senator)
assert sum(len(cluster)
           for cluster in votes_to_senators.values()) == NUM_SENATORS

# Display the clusters and the members (senators) of each cluster.
for i, votes_in_cluster in enumerate(clustered_votes.values(), start=1):
    print(f"==================== Voting Cluster #{i} ====================")
    party_totals: Counter[str] = Counter()
    for votes in set(votes_in_cluster):
Example #28
if __name__ == "__main__":
    data = sio.loadmat('ExtYaleB10.mat')
    train_sample = data['train']
    test_sample = data['test']
    # np.column_stack needs a sequence, not a bare generator, on recent NumPy
    x_train_full = np.column_stack([train_sample[0, i][:, :, j].reshape(192 * 168, 1)
                                    for i in range(10) for j in range(50)])
    x_test_full = np.column_stack([test_sample[0, i][:, :, j].reshape(192 * 168, 1)
                                   for i in range(10) for j in range(14)])
    I = np.identity(10)
    y_train = np.column_stack([I[:, i] for i in range(10) for j in range(50)])
    y_test = np.column_stack([I[:, i] for i in range(10) for j in range(14)])

    print("spectral_clustering computing")
    v = spectral.spectral_clustering(x_train_full, 10, 1, 10)
    for i in range(v.shape[1]):
        v[:, i] = v[:, i] / (la.norm(v[:, i]))
    final_c, final_z = km.k_means(np.asarray(v.T), 10, 10)
    y_train = np.zeros(500, )
    cost = 0
    for i in range(10):
        for j in range(50):
            y_train[i * 50 + j] = i
    for i in range(500):
        for j in range(10):
            if final_z[i, j] == 1:
                if y_train[i] != j:
                    cost += 1
    print("Errors of spectral clustering", cost)
Example #29
    with open(filename) as f:
        reader = csv.reader(f)
        vote_topic = next(reader)
        headers = next(reader)
        for person, state, district, vote, name, party in reader:
            senator = Senator(name, party, state)
            accumulated_record[senator].append(vote_value[vote])

# Transform record into a plain dict that maps a senator to a tuple of vote values
record = {
    senator: tuple(votes)
    for senator, votes in accumulated_record.items()
}

# Show off our talent with k-means: advanced machine learning like Skynet or the HAL5000 or Deep Blue
centroids = kmeans.k_means(list(record.values()), k=2, iterations=50)
clustered_votes = kmeans.assign_data(centroids, list(
    record.values()))  # type: Dict[Tuple[VoteValue], List[Tuple[VoteValue]]]

# Build a reverse mapping.  Given a voting record, get the list of senators who voted that way
votes_to_senators = collections.defaultdict(list)
for senator, votes in record.items():
    votes_to_senators[votes].append(senator)

# Display the clusters and the people who voted that way
for i, votes_in_clusters in enumerate(clustered_votes.values(), start=1):
    print(
        f'======================= Voting Cluster #{i} ========================='
    )
    for votes in set(votes_in_clusters):
        for senator in votes_to_senators[votes]:
Example #30
if __name__ == '__main__':
    data_dir = 'data'
    results_dir = 'results'
    os.makedirs(results_dir, exist_ok=True)
    clusters_range = range(2, 10)

    # Inner criteria block.
    image_path = os.path.join(data_dir, 'policemen.jpg')
    image = np.array(Image.open(image_path), dtype=np.uint8)
    new_image = image.reshape(image.shape[0] * image.shape[1], image.shape[2])
    db, best_db, ch, best_ch = inner_criteria(new_image, clusters_range)
    best_inner = (best_db + best_ch) // 2

    # Save the clustered image.
    centroids, clusters = kmeans.k_means(new_image, best_inner, iterations=200)
    new_image = np.vstack([centroids[i] for i in clusters
                           ]).astype(np.uint8).reshape(image.shape)
    Image.fromarray(new_image).save(
        os.path.join(results_dir, '%d-clusters-policemen.jpg' % best_inner))

    # Outer criteria block.
    outer_criterias_input = os.path.join(data_dir, 'outer_criterias_input.txt')
    data = np.loadtxt(outer_criterias_input, delimiter=' ')
    reference, points = data[:, 0], data[:, 1:]
    rs, best_rs, fm, best_fm = outer_criteria(points, clusters_range,
                                              reference)

    # Draw the results.
    fig, ax = plt.subplots(nrows=2, ncols=2)
    ax1, ax2, ax3, ax4 = ax.flatten()
Example #31
def main():
    r = 5  # number of random initial
    d = 2  # reduced dimension
    k = 2  # number of cluster
    N = 200  # sample size
    part = input("Input part A/B \n")
    number = int(input("Input number 1~6 \n"))

    if (part == 'A'):
        data = sio.loadmat('HW3_Data/dataset1.mat')
        Y = data['Y']
        if (number == 1):
            plt.close()
            x1 = Y[0, :]
            y1 = Y[1, :]
            plt.scatter(x1, y1)
            plt.show()
            plt.close()
        if (number == 2):
            plt.close()
            x1 = Y[0, :]
            y1 = Y[2, :]
            plt.scatter(x1, y1)
            plt.show()
        if (number == 3):
            plt.close()
            u, y_reduced = pca.pca(Y, 2)
            x = y_reduced[0, :]
            y = y_reduced[1, :]
            plt.scatter(np.asarray(x), np.asarray(y))
            plt.show()
        if (number == 4):
            plt.close()
            result = kmeans.k_means(np.matrix(Y), k, r)
            x1 = []
            y1 = []
            x2 = []
            y2 = []
            U, y_2d = pca.pca(Y, d)
            for i in range(N):
                if (result[i] == 0):
                    x1.append(y_2d[0, i])
                    y1.append(y_2d[1, i])
                else:
                    x2.append(y_2d[0, i])
                    y2.append(y_2d[1, i])

            plt.scatter(x1, y1, color='red')
            plt.scatter(x2, y2, color='blue')
            plt.show()
        if (number == 5):
            plt.close()
            U, y_2d = pca.pca(Y, d)
            result = kmeans.k_means(y_2d, k, r)

            x1 = []
            y1 = []
            x2 = []
            y2 = []

            for i in range(200):
                if (result[i] == 0):
                    x1.append(y_2d[0, i])
                    y1.append(y_2d[1, i])
                else:
                    x2.append(y_2d[0, i])
                    y2.append(y_2d[1, i])

            plt.scatter(x1, y1, color='red')
            plt.scatter(x2, y2, color='blue')
            plt.show()

    if (part == 'B'):
        data = sio.loadmat('HW3_Data/dataset2.mat')
        Y = data['Y']
        if (number == 1):
            x1 = Y[0, :]
            y1 = Y[1, :]
            plt.scatter(x1, y1)
            plt.show()

        if (number == 2):

            x1 = Y[0, :]
            y1 = Y[2, :]
            plt.scatter(x1, y1)
            plt.show()
        if (number == 3):

            u, y_reduced = pca.pca(Y, 2)
            x = y_reduced[0, :]
            y = y_reduced[1, :]
            plt.scatter(np.asarray(x), np.asarray(y))
            plt.show()

        if (number == 4):

            U, y_2d = pca.pca(Y, d)
            result = kmeans.k_means(np.matrix(y_2d), k, r)
            x1 = []
            y1 = []
            x2 = []
            y2 = []

            for i in range(N):
                if (result[i] == 0):
                    x1.append(y_2d[0, i])
                    y1.append(y_2d[1, i])
                else:
                    x2.append(y_2d[0, i])
                    y2.append(y_2d[1, i])

            plt.scatter(x1, y1, color='red')
            plt.scatter(x2, y2, color='blue')
            plt.show()

        if (number == 5):
            kernel = pca.get_kernel(Y)
            u = pca.kernel_pca(kernel, d)
            y_reduced = np.matrix(kernel * u)
            result = kmeans.k_means(y_reduced.T, k, r)

            x1 = []
            y1 = []
            x2 = []
            y2 = []

            #U, y_2d = pca.pca(Y, d)
            y_2d = y_reduced.T
            for i in range(N):
                if (result[i] == 0):
                    x1.append(y_2d[0, i])
                    y1.append(y_2d[1, i])
                else:
                    x2.append(y_2d[0, i])
                    y2.append(y_2d[1, i])

            plt.scatter(x1, y1, color='red')
            plt.scatter(x2, y2, color='blue')
            plt.show()

        if (number == 6):
            W = np.matrix(spectral.get_w_matrix(Y, 5, 1))
            result = spectral.spectral_cluster(W, 2)

            x1 = []
            y1 = []
            x2 = []
            y2 = []

            U, x = pca.pca(Y, 2)
            y_2d = x
            for i in range(200):
                if (result[i] == 0):
                    x1.append(y_2d[0, i])
                    y1.append(y_2d[1, i])
                else:
                    x2.append(y_2d[0, i])
                    y2.append(y_2d[1, i])

            plt.scatter(x1, y1, color='red')
            plt.scatter(x2, y2, color='blue')
            plt.show()
Example #32
        else:
            idf[j] = math.log(idf[j])
for i in range(len(urls)):
    for j in range(len(taglist)):
        tf[i][j] *= idf[j]

t = np.array(tf)
t = t.transpose()
mat,trans = pca_min.pca(t,3)
plot([trans[:9],trans[9:17],trans[17:]])
#print mat
#print trans
##m,r,a = kmeans.k_means(np.array(tf),2)
##print r,a
##c0 = np.array([[trans[i][0],trans[i][1],trans[i][2]] for i in range(len(trans)) if a[i]==0])
##c1 = np.array([[trans[i][0],trans[i][1],trans[i][2]] for i in range(len(trans)) if a[i]==1])
##plot([c0,c1])
m,r,a = kmeans.k_means(np.array(trans),3)
for i in range(10):
    m1,r1,a1 = kmeans.k_means(np.array(trans),3)
    print r1,a1
    if(r1<r):
        m = m1
        r = r1
        a = a1
print r,a
c0 = np.array([[trans[i][0],trans[i][1],trans[i][2]] for i in range(len(trans)) if a[i]==0])
c1 = np.array([[trans[i][0],trans[i][1],trans[i][2]] for i in range(len(trans)) if a[i]==1])
c2 = np.array([[trans[i][0],trans[i][1],trans[i][2]] for i in range(len(trans)) if a[i]==2])
plot([c0,c1,c2])
Example #33
import itertools
import os
import time

import numpy as np
from scipy import cluster
from sklearn.datasets import load_iris
import kmeans
from matplotlib import pyplot as plt
from os import mkdir

iris = load_iris()
data = iris.data
names = iris.feature_names

np.random.shuffle(data)
k = 3

t = time.time()
centers, center_steps = kmeans.k_means(data, k=k, distance='e')
print(time.time() - t)

for step in range(len(center_steps)):
    print("{0}/{1}".format(step + 1, len(center_steps)))
    for combination in itertools.combinations(range(data.shape[1]), 2):
        step_centers = np.array(center_steps[step])
        assignment, cdist = cluster.vq.vq(data[:, combination], 
                                          step_centers[:, combination])

        if not os.path.exists(names[combination[0]] + "-" 
            + names[combination[1]]):
            mkdir(names[combination[0]] + "-" + names[combination[1]])
        
        for i in range(k):
            data = np.append(data, [step_centers[i]], axis=0)
Example #34
k = 2

edgelist = pd.read_csv(path + file[0], delimiter=' ', skiprows=1, header=None)
G = nx.from_pandas_edgelist(edgelist,
                            source=0,
                            target=1,
                            create_using=nx.Graph())
adj_matr = nx.to_pandas_adjacency(G, dtype=np.float64)
laplacian = sparse.csgraph.laplacian(adj_matr.values, normed=True)

eigenvalues, eigenvectors = np.linalg.eig(laplacian)
eigenvectors = eigenvectors.astype(np.float64)

centroids, clusters = k_means(eigenvectors,
                              k,
                              random_seed=1,
                              num_iters=10,
                              plot=False)

print(centroids)

cut_edges = 0
for i in range(edgelist.shape[0]):
    if (clusters[edgelist[0][i]] != clusters[edgelist[1][i]]):
        cut_edges += 1

counter = collections.Counter(clusters)
smallest = 100000
for key, value in counter.items():
    if (value < smallest):
        smallest = value
Example #35
 plt.subplot(121)
 plt.axis('off')
 plt.imshow(img)
 plt.subplot(122)
 c = np.loadtxt('data/0.txt')
 plt.axis('off')
 plt.imshow(c)
 plt.show()
 pos = np.zeros((img.shape[0], img.shape[1], 2))
 for i in range(pos.shape[0]):
     for j in range(pos.shape[1]):
         pos[i, j, :] = [i, j]
 pos = pos.reshape((-1, 2))
 data = np.hstack((pos, np.reshape(img, (-1, 3))))
 # k_means
 res = k_means(data, 3, iter_times=20, dist_func=dist)
 tag = res[:, -1]
 tag = np.reshape(tag, c.shape)
 plt.figure(2)
 plt.imshow(tag)
 plt.axis('off')
 plt.show()
 # gmm
 res = gmm(data, 3, iter_times=20)
 tag = res[:, -1]
 tag = np.reshape(tag, c.shape)
 plt.figure(3)
 plt.imshow(tag)
 plt.axis('off')
 plt.show()
 # dbscan
Example #36
train_data = data[[x_axis, y_axis]].values.reshape((n_data, 2))

#Normalizing data columns so everything lies between -1 and 1
train_data_mean = np.mean(np.abs(train_data), axis=0)
for j in range(train_data.shape[1]):
    train_data[:, j] -= train_data_mean[j]
    train_data_scale = np.max(train_data, axis=0)
for j in range(train_data.shape[1]):
    train_data[:, j] /= train_data_scale[j]

# Enter k_means inputs.
clusters = 1  # Number of clusters into which we want to split our training dataset.
iterations = 50  # maximum number of training iterations.

# Init k_means instance.
k_means = k_means(train_data, clusters)

# Train k_means instance.
#(centroids, nearest_centroid) = k_means.train(iterations)
(centroids, nearest_centroid) = k_means.kmeansOpt()

# Denormalizing column vectors
for j in range(train_data.shape[1]):
    train_data[:, j] *= train_data_scale[j]
    centroids[:, j] *= train_data_scale[j]
for j in range(train_data.shape[1]):
    train_data[:, j] += train_data_mean[j]
    centroids[:, j] += train_data_mean[j]

# Plot actual clusters for reference
plt.subplot(2, 2, 3)
Example #37
print("took: ", time2 - time1)
print("constructing the cost-matrix")
dist_matrix = get_dist_matrix(x, y, z)
time3 = time.time()
print("took: ", time3 - time2)
#print (dist_matrix)

number_of_clusters = 3
show_3d_data(A,
             cmap="viridis",
             savename="data_k_" + str(number_of_clusters) + "_b_" +
             str(A.shape[0]))

random_start = torch.randint(A.shape[0], [number_of_clusters], device=device)

cluster_assign, data = k_means(A.reshape((A.shape[0], x * y * z)), dist_matrix,
                               number_of_clusters, x, y, z, regularizer)
print("assignment to clusters: ", cluster_assign)
labels = [
    "cluster_start_" + str(random_start[i].item())
    for i in range(0, data.shape[0])
]

# time_loc_start = time.time()
# wasbary_1 = compute_wasbary(A, verbose=True, method='pot',numItermax=maxiter, reg= reg)
# time_loc_end = time.time()
# print ("computation of barycenter with pot took ", time_loc_end-time_loc_start)

#time_loc_start = time.time()
#wasbary_1 = compute_wasbary(A, verbose=True, method='ipot',numItermax=10,reg=reg)
#time_loc_end = time.time()
#print ("computation of barycenter with ipot took ", time_loc_end-time_loc_start)