コード例 #1
0
ファイル: cluster.py プロジェクト: jhli973/dc-crimebusters
def main(n_clusters=40,
         knn_path=r'C:\users\andrew_woizesko\desktop\knn.csv',
         centers_path=r'C:\users\andrew_woizesko\desktop\centers.csv'):
    """
    Cluster crime locations with k-means and plot the clusters.

    This function will
        -load data from the csv named by settings["crime_data"]
        -impute missing X/Y values with the column's mean
        -perform kmeans clustering on the X/Y columns
        -write the labelled data and the cluster centers to disk
        -produce an html scatter plot

    Parameters:
        n_clusters: number of groups KMeans is asked to fit.
        knn_path: destination csv for the crime data plus its "labels" column.
        centers_path: destination text file for the fitted cluster centers.
    """

    #placeholder written into empty cells so the imputer can locate them
    missing_marker = -999

    #load data from a CSV to a dataframe. read_csv with index_col=0 and
    #parse_dates=True is the documented replacement for the deprecated
    #DataFrame.from_csv
    with open(settings["crime_data"]) as in_data:
        crime_data = pd.read_csv(in_data, sep=',', index_col=0, parse_dates=True)

    #NOTE(review): every column is filled, and the imputed values below are
    #never written back, so the marker is what ends up in the saved csv --
    #confirm that is intended
    crime_data = crime_data.fillna(value=missing_marker)

    #load the coordinate columns into a float array. The offense column from
    #the crime data is excluded
    as_array = np.asfarray(crime_data[["X", "Y"]])

    #Correct missing data
    imputer = Imputer(missing_values=missing_marker, strategy="mean")
    patched = imputer.fit_transform(as_array)

    #cluster data
    cluster = KMeans(n_clusters=n_clusters)
    cluster.fit(patched)

    #assign the group labels to the crime data
    crime_data["labels"] = cluster.labels_

    #presumably groups the rows by label for plotting -- helper is defined
    #elsewhere, verify against its definition
    pdict = create_ordered_dict(crime_data, "labels")

    crime_data.to_csv(knn_path)
    np.savetxt(centers_path, cluster.cluster_centers_)

    #location of output graph
    file_name = os.path.join("..", 'tests', "kmeans_clusters_{0}.html".format(time_stamp()))

    output_file(file_name)

    #create our graph
    TOOLS = "pan,wheel_zoom,box_zoom,reset"
    scatter = Scatter(pdict.values(), title="Crime Clusters", filename=file_name, tools=TOOLS)
    scatter.show()
コード例 #2
0
from collections import OrderedDict

from bokeh.sampledata.iris import flowers
from bokeh.charts import Scatter

# Build a scatter chart of iris petal length against petal width, one series
# per species.

# we fill a df with the data of interest and create a groupby pandas object
df = flowers[["petal_length", "petal_width", "species"]]
xyvalues = g = df.groupby("species")

# here we only drop that groupby object into a dict, keyed by species, whose
# values are lists of (x, y) pairs
pdict = OrderedDict()

for species in g.groups:
    # fetch the group once instead of once per column lookup
    group = g.get_group(species)
    labels = group.columns
    xname = labels[0]
    yname = labels[1]
    # materialise the pairs: on Python 3 a bare zip() is a one-shot iterator
    # and would be empty after its first consumer
    pdict[species] = list(zip(group[xname], group[yname]))

# any of the following commented-out assignments is also a valid Scatter input
#xyvalues = pdict
#xyvalues = pd.DataFrame(xyvalues)
#xyvalues = xyvalues.values()
#xyvalues = np.array(xyvalues.values())

TOOLS = "resize,crosshair,pan,wheel_zoom,box_zoom,reset,previewsave"
scatter = Scatter(xyvalues,
                  filename="iris_scatter.html",
                  tools=TOOLS,
                  ylabel='petal_width')
scatter.show()