def main(n_clusters=40,
         labeled_csv_path=r'C:\users\andrew_woizesko\desktop\knn.csv',
         centers_path=r'C:\users\andrew_woizesko\desktop\centers.csv'):
    """
    Cluster crime locations with k-means and plot the result.

    This function will
    -load data from the csv named by settings["crime_data"]
    -impute missing X/Y values with the column's mean
    -perform k-means clustering on the X/Y columns
    -write the labelled rows and the cluster centers to disk
    -produce an html scatter plot

    Args:
        n_clusters: number of k-means groups to fit (default 40, the value
            that used to be hard-coded in the body).
        labeled_csv_path: destination of the crime data with the added
            "labels" column. Defaults to the original author's desktop
            location, so calling main() with no arguments is unchanged.
        centers_path: destination of the cluster centers written with
            np.savetxt. Defaults to the original author's desktop location.

    Returns:
        None. The results are the two files above plus the html plot.
    """
    # Sentinel used to mark missing cells so the imputer can find them again.
    # Any genuine -999 in the X/Y columns is therefore treated as missing too.
    missing_marker = -999

    # Load data from a CSV to a dataframe.
    # NOTE(review): DataFrame.from_csv is deprecated in later pandas releases;
    # it is kept because its defaults (first column as index, date parsing)
    # differ from read_csv, so swapping it would change the loaded frame.
    with open(settings["crime_data"]) as in_data:
        crime_data = pd.DataFrame.from_csv(in_data, sep=',')

    crime_data = crime_data.fillna(value=missing_marker)

    # Only the coordinate columns are clustered; every other column
    # (e.g. the offense column) is excluded.
    as_array = np.asfarray(crime_data[["X", "Y"]])

    # Replace the sentinel with each column's mean. This only affects the
    # array handed to k-means: crime_data itself keeps the sentinel values,
    # and that is what ends up in the labelled CSV below.
    imputer = Imputer(missing_values=missing_marker, strategy="mean")
    patched = imputer.fit_transform(as_array)

    # Cluster data.
    cluster = KMeans(n_clusters=n_clusters)
    cluster.fit(patched)

    # Attach each row's cluster label to the crime data, then regroup it for
    # plotting.
    # NOTE(review): create_ordered_dict is defined elsewhere; presumably it
    # returns a mapping keyed by label -- confirm against its definition.
    crime_data["labels"] = cluster.labels_
    pdict = create_ordered_dict(crime_data, "labels")

    # Persist the labelled rows and the fitted cluster centers.
    crime_data.to_csv(labeled_csv_path)
    np.savetxt(centers_path, cluster.cluster_centers_)

    # Location of output graph; time_stamp() makes the file name unique
    # per run.
    file_name = os.path.join(
        "..", 'tests', "kmeans_clusters_{0}.html".format(time_stamp()))
    output_file(file_name)

    # Create our graph.
    TOOLS = "pan,wheel_zoom,box_zoom,reset"
    scatter = Scatter(pdict.values(), title="Crime Clusters",
                      filename=file_name, tools=TOOLS)
    scatter.show()
from bokeh.sampledata.iris import flowers
from bokeh.charts import Scatter

# Keep the two petal measurements plus the species column, then group the
# rows by species. The groupby object itself is what gets plotted below.
df = flowers[["petal_length", "petal_width", "species"]]
g = df.groupby("species")
xyvalues = g

# Alternative input form: an ordered mapping of species -> (x, y) pairs,
# where x and y come from the first two columns of each group.
pdict = OrderedDict()
for i in g.groups:
    group = g.get_group(i)
    labels = group.columns
    xname, yname = labels[0], labels[1]
    x = getattr(group, xname)
    y = getattr(group, yname)
    pdict[i] = zip(x, y)

# Per the original author's note, each of the commented-out assignments below
# is also a valid input for Scatter:
#xyvalues = pdict
#xyvalues = pd.DataFrame(xyvalues)
#xyvalues = xyvalues.values()
#xyvalues = np.array(xyvalues.values())

TOOLS = "resize,crosshair,pan,wheel_zoom,box_zoom,reset,previewsave"

# Render the chart to iris_scatter.html and display it.
scatter = Scatter(
    xyvalues,
    filename="iris_scatter.html",
    tools=TOOLS,
    ylabel='petal_width',
)
scatter.show()