def main():
    # NOTE(Jovan): Load data
    data = pd.read_csv("data/skincancer.csv", delimiter=',', index_col=0)
    mort = data.Mort.values
    lat = data.Lat.values
    lon = data.Long.values

    # NOTE(Jovan): Init LinearRegression and predict
    lin_reg = LinearRegression(lat, mort)
    hawaii = lin_reg.predict(20)
    print("Prediction for hawaii[lat=20]:", hawaii)

    # NOTE(Jovan): Init KMeans and add lat and long points
    k_means = KMeans()
    for i, j in zip(lat, lon):
        k_means.points.append(Point(i, j))
    k_means.split(2, 0.01)

    # NOTE(Jovan): Plot clusters
    fig = plt.figure()
    ax = fig.add_axes([0, 0, 1, 1])

    # NOTE(Jovan): First cluster
    for p in k_means._clusters[0].points:
        ax.scatter(p.x, p.y, c="#ff0000")

    # NOTE(Jovan): Second cluster
    for p in k_means._clusters[1].points:
        ax.scatter(p.x, p.y, c="#00ff00")

    # NOTE(Jovan): Plot cluster centers
    center1 = k_means._clusters[0].center
    center2 = k_means._clusters[1].center
    ax.scatter(center1.x, center1.y, marker="P", c="#ff0000")
    ax.scatter(center2.x, center2.y, marker="P", c="#00ff00")
    plt.show()
def spectral_inner(W, k):
    m = numpy.size(W, 1)
    # Degree matrix: row sums of the similarity matrix W on the diagonal,
    # so that L = D - W is the unnormalized graph Laplacian
    D = numpy.diag(numpy.asarray(W).sum(axis=1))
    L = D - W
    eigenvalues, eigenvectors = numpy.linalg.eig(L)
    indices = numpy.argsort(eigenvalues)
    eigen_sorted = eigenvalues[indices]

    # Calculate the optimal k from the largest gap between consecutive
    # sorted eigenvalues (eigengap heuristic); this overrides the k passed in
    eigen_diff = []
    for i in range(len(eigen_sorted) - 1):
        eigen_diff.append(eigen_sorted[i + 1] - eigen_sorted[i])
    k = eigen_diff.index(max(eigen_diff)) + 1
    print("%d, %d" % (len(W[0]), k))

    # Embed the points in the first k eigenvectors and keep the real parts
    Ut = eigenvectors[:, indices[0:k]]
    f = numpy.vectorize(lambda x: x.real)
    Ut = f(Ut)

    # Cluster the embedded rows and map each point back to its row index
    C = kmeans([Point(x) for x in Ut], k)
    A = []
    for c in C:
        cluster = []
        for x in c.points:
            cluster.append(Ut.tolist().index(x.coords.tolist()))
        A.append(cluster)
    return A
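# --- Hypothetical driver sketch for spectral_inner ---
# A minimal sketch assuming numpy and this module's kmeans/Point are available:
# a tiny block-diagonal similarity matrix with two obvious groups, so the
# eigengap heuristic inside spectral_inner is expected to favour two clusters.
# The matrix values below are made up purely for illustration.
if __name__ == '__main__':
    W_demo = numpy.array([
        [0.0, 0.9, 0.8, 0.0, 0.0],
        [0.9, 0.0, 0.7, 0.0, 0.0],
        [0.8, 0.7, 0.0, 0.1, 0.0],
        [0.0, 0.0, 0.1, 0.0, 0.9],
        [0.0, 0.0, 0.0, 0.9, 0.0],
    ])
    # The second argument is only an initial guess; spectral_inner
    # re-estimates k from the largest eigengap.
    print(spectral_inner(W_demo, 2))  # e.g. [[0, 1, 2], [3, 4]]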
def get_data():
    file_name = 'Sales_Transactions_Dataset_Weekly.csv'
    with open(file_name) as f:
        header = f.readline()
        points = []
        for line in f:
            items = line.strip().split(',')
            r = [float(item) for item in items[1:]]
            points.append(Point(r))
    random.shuffle(points)
    return points
def get_iris_data():
    file_name = 'datasets/iris.csv'
    with open(file_name) as f:
        header = f.readline()
        points = []
        for line in f:
            items = line.strip().split(',')
            r = (float(items[0]), float(items[1]), float(items[2]),
                 float(items[3]), items[4])
            points.append(Point(r))
    random.shuffle(points)
    return points
def get_s1():
    file_name = 'datasets/s1.txt'
    with open(file_name) as f:
        #header = f.readline()
        points = []
        for line in f:
            items = line.split()  # split on any whitespace
            r = [float(items[0]), float(items[1])]
            points.append(Point(r))
    #random.shuffle(points)
    return points
def parse_input_file(input_file):
    """
    Parses the input file based on the following format.

    First line contains two integers (m and n, space separated), indicating
    the number of commuters and cabs.
    Next m lines contain commuter locations.
    Next n lines contain cab locations.
    Finally, the last line contains the destination location.
    Locations are in the format: x,y

    Returns a tuple of commuters, cabs and the destination location.
    """
    commuters = []
    cabs = []
    destination = None

    try:
        f = open(input_file, 'r')
    except (OSError, IOError) as e:
        if e.errno == 2:
            print(ERR_INPUT_FILE_NOT_FOUND)
        else:
            print(e.strerror)
        sys.exit(1)

    lines_list = f.read().splitlines()
    lines_len = len(lines_list)

    try:
        if lines_len == 0:
            raise Exception(ERR_INPUT_FILE_EMPTY)

        # Parse m and n values
        try:
            m = int(lines_list[0].split(' ')[0])
            n = int(lines_list[0].split(' ')[1])
        except (ValueError, IndexError):
            raise Exception(ERR_COMMUTERS_CABS_STR % lines_list[0])

        # Fewer commuter locations than expected
        if lines_len < (m + 1):
            raise Exception(ERR_LESS_COMMUTER_LOCATIONS % (m, lines_len - 1))
        # Fewer cab locations than expected
        if lines_len < (m + n + 1):
            raise Exception(ERR_LESS_CAB_LOCATIONS % (n, lines_len - m - 1))
        # Destination location missing
        if lines_len < (m + n + 2):
            raise Exception(ERR_MISSING_DEST_LOCATION)

        # Parse commuter locations
        for i in range(m):
            coords_str = lines_list[i + 1]
            coords = Point.get_coords_from_str(coords_str)
            commuter = Commuter(coords)
            commuters.append(commuter)

        # Parse cab locations
        for i in range(n):
            coords_str = lines_list[i + 1 + m]
            coords = Point.get_coords_from_str(coords_str)
            cab = Cab(coords)
            cabs.append(cab)

        # Parse destination location
        coords_str = lines_list[1 + m + n]
        coords = Point.get_coords_from_str(coords_str)
        destination = Point(coords)
    except Exception as e:
        print(e)
        sys.exit(1)

    return (commuters, cabs, destination)
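# --- Hypothetical usage sketch for parse_input_file ---
# Illustrates the input format documented in the docstring: m=2 commuters,
# n=1 cab, destination on the last line. The file name and the coordinates
# below are made up purely for illustration.
if __name__ == '__main__':
    sample = "2 1\n0,0\n3,4\n1,1\n10,10\n"
    with open("sample_input.txt", "w") as f:
        f.write(sample)
    commuters, cabs, destination = parse_input_file("sample_input.txt")
    print(len(commuters), len(cabs), destination)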
import json

from kmeans import Point, run

if __name__ == "__main__":
    sortedPoints = lambda ps: sorted(ps, key=lambda p: (p.x, p.y))

    with open("../points.json") as f:
        points = [Point(x[0], x[1]) for x in json.loads(f.read())]

    result = run(points, 10)
    for k in sortedPoints(result.keys()):
        print("==\n# %s #" % k)
        print('\n'.join(" " + str(p) for p in sortedPoints(result[k])))
# Md Lutfar Rahman
# [email protected]
# DataMining Assignment 4

from kmeans import Point, Cluster, KMeans
import random
from UserMatrix import UserMatrix

userMat = UserMatrix()
points = userMat.userpoints
fet = list(range(len(userMat.movieIds)))
k = 3
#print(fet)
Point.set_features(*fet)

model = KMeans(points, k, 0.001)
model.cluster()
#print("clustering>>ended")

print('')
model.getIntraCentriodDensity()
print('')
model.getInterCentroidDensity()
from kmeans import Point

if __name__ == '__main__':
    p = Point([1, 2, 3, 4, 5])
        x.append([a, b, c])
    X.extend(x)
    X = array(X)[:N]
    return X


if DISTRIB == 'RANDOM':
    set_x, set_y = [random.choice(chemical_symbols) for i in range(N)], \
                   [random.choice(chemical_symbols) for i in range(N)]
    set_z = [round(random.uniform(0.1, 15.0), 2) for i in range(N)]
    data, ref = [], []
    for i in range(N):
        formula = set_x[i] + set_y[i]
        set_x[i] = get_element_group(chemical_symbols.index(set_x[i]))
        set_y[i] = get_element_group(chemical_symbols.index(set_y[i]))
        data.append(Point([set_x[i], set_y[i], set_z[i]], formula))
        ref.append([set_x[i], set_y[i], set_z[i]])
else:
    nte = len(chemical_symbols)
    G = gaussian_distribution(N, k_from_n(N))
    # list comprehensions instead of map() so the results are indexable under Python 3
    set_x = (G[:, 0] + 1) / 2 * nte
    set_x = [int(math.floor(x)) for x in set_x.tolist()]
    set_y = (G[:, 1] + 1) / 2 * nte
    set_y = [int(math.floor(x)) for x in set_y.tolist()]
    set_z = (G[:, 2] + 1) / 2 * 15
    set_z = [round(x, 2) for x in set_z.tolist()]
    return sum(s)/len(s)


#-----------------------------------------------------------
def get_s1():
    file_name = 'datasets/s1.txt'
    with open(file_name) as f:
        #header = f.readline()
        points = []
        for line in f:
            items = line.split()  # split on any whitespace
            r = [float(items[0]), float(items[1])]
            points.append(Point(r))
    #random.shuffle(points)
    return points


#-----------------------------------------------------------
points = get_s1()
#print(points)
Point.set_features(0, 1)

for k in range(3, 4):
    # NB: the cluster count is hardcoded to 15; k only labels the report line
    model = KMeans(points, 15, 0.01)
    model.cluster()
    # model.show()
    print("Done")
    print('k = ', k, 'silhouette = ', silhouette(model.points, model.clusters))
            .join(model.Structure, model.Atom.struct_id == model.Structure.struct_id) \
            .join(model.Electrons, model.Electrons.checksum == model.Structure.checksum) \
            .join(model.Struct_ratios, model.Electrons.checksum == model.Struct_ratios.checksum) \
            .join(model.Energy, model.Electrons.checksum == model.Energy.checksum) \
            .join(emin_query, and_(
                model.Energy.total == emin_query.c.emin,
                model.Struct_ratios.chemical_formula == emin_query.c.chemical_formula,
                model.Struct_ratios.formula_units == emin_query.c.formula_units
            )) \
            .filter(model.Struct_ratios.nelem == 2, model.Electrons.gap > 0) \
            .order_by(model.Electrons.gap) \
            .all():
    i += 1
    collected.append(get_element_group(elnum))
    if not i % 2:
        collected.sort()
        data.append(Point(collected + [gap], reference=formula))
        collected = []

with open(points_file, "w") as s:
    s.write("x,y,z,label\n")
    for n, pnt in enumerate(data):
        # list(map(...)) so the stringified coords concatenate under Python 3
        s.write(",".join(list(map(str, pnt.coords)) + [pnt.reference]) + "\n")

clusters = kmeans(data, k_from_n(len(data)))

with open(cluster_file, "w") as s:
    s.write("x,y,z,label\n")
    for n, cluster in enumerate(clusters, 1):
        for pnt in cluster.points:
            s.write(",".join(list(map(str, pnt.coords)) + [pnt.reference]) + "\n")
        s.write("-,-,-,-\n")
from sklearn.datasets import load_digits
from sklearn.metrics import fowlkes_mallows_score
from sklearn.cluster import AgglomerativeClustering, AffinityPropagation

from kmeans import kmeans, Point, predict

data, target = load_digits(return_X_y=True)

# K-Means
kmeans_data = [Point(val) for val in data]
k_means = kmeans(kmeans_data, 10)

labels = []
for point in data:
    labels.append(predict(k_means, Point(point)))

target = [int(num) for num in target]

# Contingency table: rows are true digits, columns are predicted clusters
results = [[0 for _ in range(10)] for __ in range(10)]
for i, val in enumerate(labels):
    results[target[i]][val] += 1

# Map each true digit to the cluster it most often lands in
conversion = {}
for t_i, targ in enumerate(results):
    max_cluster = None
    for c_i, cluster in enumerate(targ):
        if max_cluster is None or cluster > targ[max_cluster]:
            max_cluster = c_i
    conversion[t_i] = max_cluster
dfrm = dfrm[dfrm['Units'] == 'eV']
dfrm = dfrm[(dfrm['Bandgap'] > 0) & (dfrm['Bandgap'] < 20)]

avgbgfrm = dfrm.groupby('Formula')['Bandgap'].mean().to_frame().reset_index().rename(
    columns={'Bandgap': 'AvgBandgap'})
dfrm = dfrm.merge(avgbgfrm, how='outer', on='Formula')
dfrm.drop_duplicates('Formula', inplace=True)
dfrm.sort_values('Formula', inplace=True)

fitdata, export_data = [], []
for n, row in dfrm.iterrows():
    groupA, groupB = \
        get_element_group(chemical_symbols.index(row['Elements'][0])), \
        get_element_group(chemical_symbols.index(row['Elements'][1]))
    fitdata.append(
        Point(sorted([groupA, groupB]) + [round(row['AvgBandgap'], 2)],
              reference=row['Formula']))

clusters = kmeans(fitdata, k_from_n(len(fitdata)))

for cluster_n, cluster in enumerate(clusters, start=1):
    for pnt in cluster.points:
        export_data.append(pnt.coords + [pnt.reference] + [cluster_n])

export = MPDSExport.save_plot(
    export_data, ['groupA', 'groupB', 'bandgap', 'compound', 'cluster'],
    'plot3d')
print(export)
#-----------------------------------------------------------
def get_iris_data():
    file_name = 'datasets/iris.csv'
    with open(file_name) as f:
        header = f.readline()
        points = []
        for line in f:
            items = line.strip().split(',')
            r = [float(items[0]), float(items[1]), float(items[2]),
                 float(items[3]), items[4]]
            points.append(Point(r))
    random.shuffle(points)
    return points


#-----------------------------------------------------------
points = get_iris_data()
Point.set_features(0, 1, 2, 3)

for k in range(2, 10):
    model = KMeans(points, k, 0.01)
    model.cluster()
    # model.show()
    print('k = ', k, 'silhouette = ', silhouette(model.points, model.clusters))
def cluster(doc_ids, lexicon, r, num_clusters, verbose=True):
    words = {}
    t = r.terms()
    if verbose is True:
        print("extracting unique words in the document list")
    i = 0
    while t.next():
        if verbose is True:
            print("word " + str(i) + " is " + str(t.term().text()))
        I = lexicon[str(t.term().text())]  # extract the lexicon of the term
        for doc_id, doc_feat in I:  # for every document that carries the term
            if int(doc_id) in doc_ids:
                words[i] = str(t.term().text())
                break
        i = i + 1

    if verbose is True:
        print("creating doc space")
    # create a zero featurespace
    doc_space = OrderedDict()
    for doc in doc_ids:
        doc_space[doc] = [0] * len(words.keys())

    # create a featurespace of data in dictionaries
    w = 0
    for key in words.keys():
        I = lexicon[words[key]]
        for doc_id, doc_feat in I:
            if int(doc_id) in doc_ids:
                doc_space[int(doc_id)][w] = doc_feat
        w = w + 1

    num_points = len(doc_ids)
    dimensions = len(words.keys())
    if verbose is True:
        print("creating featurespace")
    opt_cutoff = 0.5
    points = [Point(doc_space[doc], doc) for doc in doc_ids]

    # Cluster those data!
    clusters = kmeans(points, num_clusters, opt_cutoff, verbose=verbose)

    for i, c in enumerate(clusters):
        for p in c.points:
            print(" cluster: ", i, "\t document [", p.id, "]")

    # document ids per cluster (one bucket per requested cluster)
    clus_list = OrderedDict((c, []) for c in range(num_clusters))

    # Print our clusters
    word_hists = OrderedDict()
    clus = 0
    for i, c in enumerate(clusters):
        word_hists[clus] = {}
        for p in c.points:
            clus_list[clus].append(p.id)
        clus = clus + 1

    for key in words.keys():
        I = lexicon[words[key]]  # extract the lexicon of the term
        if verbose is True:
            print("word is : " + words[key])
        for doc_id, doc_feat in I:  # for every document that carries the term
            hit = False
            if int(doc_id) in doc_space.keys():
                for clus in clus_list.keys():
                    if (int(doc_id) in clus_list[clus]) and hit is False:
                        if not words[key] in word_hists[clus].keys():
                            word_hists[clus][words[key]] = doc_feat
                        else:
                            word_hists[clus][words[key]] += doc_feat
                        hit = True

    # words ranked by accumulated weight within each cluster
    clus_sort = {}
    for clus in clus_list.keys():
        clus_sort[clus] = sorted(word_hists[clus], key=word_hists[clus].get)

    return clusters