def test_manhattan_other(self):
    """Manhattan distance from stock 0 to a centroid placed on stock 1 is non-zero."""
    data = format_stock_data([self.test_stock, self.test_stock2])
    # centroid located at the OTHER stock (index 1)
    other_centroid = (data['returns'][1], data['dividends'][1])
    distance = manhattan_dist(data['returns'][0], data['dividends'][0], other_centroid)
    # known Manhattan distance between the two test stocks
    self.assertEqual(distance, 38.52092060801694)
def test_k_means_2(self):
    """k=2 clustering of two stocks yields one centroid on each stock."""
    data = format_stock_data([self.test_stock, self.test_stock2])
    clusters = k_means(data, 2)
    # one cluster should sit on each stock; cluster order is not guaranteed
    self.assertIn((-40.938865746148089, 0.0), clusters)
    self.assertIn((-3.6179451381311551, 1.2), clusters)
def test_pearson_other(self):
    """Pearson distance from stock 0 to a centroid placed on stock 1 is non-zero."""
    data = format_stock_data([self.test_stock, self.test_stock2])
    # centroid located at the OTHER stock (index 1)
    other_centroid = (data['returns'][1], data['dividends'][1])
    distance = pearson_dist(data['returns'][0], data['dividends'][0], other_centroid)
    # expected Pearson distance between the two test stocks
    self.assertEqual(distance, 1)
def test_euclidean_self(self):
    """Euclidean distance from a stock to a centroid at its own location is 0."""
    data = format_stock_data([self.test_stock, self.test_stock2])
    # centroid located exactly at stock 0
    self_centroid = (data['returns'][0], data['dividends'][0])
    distance = euclidean_dist(data['returns'][0], data['dividends'][0], self_centroid)
    self.assertEqual(distance, 0)
def test_euclidean_other(self):
    """Euclidean distance from stock 0 to a centroid placed on stock 1 is non-zero."""
    data = format_stock_data([self.test_stock, self.test_stock2])
    # centroid located at the OTHER stock (index 1)
    other_centroid = (data['returns'][1], data['dividends'][1])
    distance = euclidean_dist(data['returns'][0], data['dividends'][0], other_centroid)
    # known Euclidean distance between the two test stocks
    self.assertEqual(distance, 37.34020775290227)
def test_hierarchical_cluster(self):
    """Hierarchical clustering of two stocks merges them into one top-level cluster."""
    data = format_stock_data([self.test_stock, self.test_stock2])
    clusters = hierarchical_clustering(data)
    # the top-level cluster combines stock 0 and stock 1
    self.assertEqual(clusters[0][0], 0.0)
    self.assertEqual(clusters[0][1], 1.0)
    # the merge distance is the Euclidean distance between the two members
    self.assertEqual(clusters[0][2], 37.34020775290227)
def test_assign_centroids(self):
    """Each stock is assigned to the centroid that was created nearest to it."""
    data = format_stock_data([self.test_stock, self.test_stock2])
    # centroids placed a small offset away from each stock
    near_first = (data['returns'][0] + 2.0, data['dividends'][0] + 1.0)
    near_second = (data['returns'][1] + 2.5, data['dividends'][1] + 1.2)
    assignments = assign_to_centroids(data, [near_first, near_second], euclidean_dist)
    # each centroid should receive exactly the stock it was placed next to
    self.assertEqual(assignments[near_first], [0])
    self.assertEqual(assignments[near_second], [1])
def kmeans_cluster_stocks(k):
    """
    API endpoint to cluster stocks using k-means.

    :param k: number of clusters
    :return: JSON payload with the list of cluster centroids, HTTP 200
    """
    stock_data = list(mongo.db.stocks.find())
    # find() returns a cursor (never None), so test for an empty result
    # instead of the previous dead `is None` check
    if not stock_data:
        abort(404)
    clusters = k_means(format_stock_data(stock_data), k)
    return jsonify({"centroids": clusters}), 200
def test_reassign_centroids(self):
    """After adjustment, each single-member centroid moves onto its member stock."""
    data = format_stock_data([self.test_stock, self.test_stock2])
    # centroids placed a small offset away from each stock
    near_first = (data['returns'][0] + 2.0, data['dividends'][0] + 1.0)
    near_second = (data['returns'][1] + 2.5, data['dividends'][1] + 1.2)
    assignments = assign_to_centroids(data, [near_first, near_second], euclidean_dist)
    moved = adjust_centroids(data, [near_first, near_second], assignments)
    # each centroid had one member, so it lands on that individual stock's location
    self.assertEqual(moved[0], (-3.6179451381311551, 1.2))
    self.assertEqual(moved[1], (-40.938865746148089, 0.0))
def hierarchy_cluster_stocks():
    """
    API endpoint to cluster stocks using hierarchical (agglomerative) clustering.

    :return: JSON payload with the list of hierarchical clusters, HTTP 200
    """
    stock_data = list(mongo.db.stocks.find())
    # find() returns a cursor (never None), so test for an empty result
    # instead of the previous dead `is None` check
    if not stock_data:
        abort(404)
    # format data and do clustering
    cluster_data = format_stock_data(stock_data)
    clusters = hierarchical_clustering(cluster_data)
    return jsonify({"clusters": clusters}), 200
def hierarchy_cluster_stocks_plot():
    """
    API endpoint to generate a plot of the hierarchical clustering of the stocks.

    :return: JSON payload with the URL of the generated plot, HTTP 200
    """
    stock_data = list(mongo.db.stocks.find())
    # find() returns a cursor (never None), so test for an empty result
    # instead of the previous dead `is None` check
    if not stock_data:
        abort(404)
    # format data and build the hierarchy plot
    cluster_data = format_stock_data(stock_data)
    res = plot_hierarchy(cluster_data)
    return jsonify({"plot_url": res}), 200
def get_stocks():
    """
    Render the template for the stock page of the site.

    Uses Jinja to dynamically display assignment data (clusters, plots and
    expected returns per stock).

    :return: rendered HTML for stocks.html
    """
    # look up all stocks in mongo db
    stock_data = list(mongo.db.stocks.find())
    # list(...) is never None, so test for an empty result instead of the
    # previous dead `is None` check
    if not stock_data:
        abort(404)
    # extract features and build the two regression models
    features, targets = extract_feature_data(stock_data)
    svm = create_support_vector_regression(features, targets)
    dtree = create_decision_tree_regression(features, targets)
    # clustering results and plotly URLs for the page
    formatted_data = format_stock_data(stock_data)
    clusters = k_means(formatted_data, 3)
    cluster_assignments = get_cluster_assignments(formatted_data, clusters)
    plots = {
        'kmeans': plot_clusters(formatted_data, clusters),
        'hierarchy': plot_hierarchy(formatted_data),
        'svm': plot_support_vector_regression(features, targets, svm),
        'dtree': plot_decision_tree_regression(features, targets, dtree),
    }
    # expected return per stock: average of the SVM and decision-tree predictions
    stock_rets = {}
    for stock in stock_data:
        features_stock, _ = extract_feature_data([stock])
        if len(features_stock) == 0:
            # no usable feature data for this stock; report a neutral 0.0
            stock_rets[stock['symbol']] = 0.0
        else:
            prediction = (svm.predict(features_stock)[0] + dtree.predict(features_stock)[0]) / 2
            stock_rets[stock['symbol']] = prediction
    # render the HTML
    return render_template('stocks.html', stocks=stock_data, plots=plots,
                           clusters=cluster_assignments, stock_rets=stock_rets)
def test_k_means_1(self):
    """k=1 clustering places the single centroid midway between the two stocks."""
    data = format_stock_data([self.test_stock, self.test_stock2])
    clusters = k_means(data, 1)
    self.assertEqual(clusters[0], (-22.278405442139622, 0.59999999999999998))
def test_format_cluster_data_multiple(self):
    """format_stock_data builds three parallel lists for two stocks."""
    data = format_stock_data([self.test_stock, self.test_stock2])
    self.assertEqual(data['dividends'], [1.2, 0.0])
    self.assertEqual(data['returns'], [-3.617945138131155, -40.93886574614809])
    self.assertEqual(data['names'], ['LL', 'L2'])
def test_format_cluster_data_single(self):
    """format_stock_data builds three parallel single-entry lists for one stock."""
    data = format_stock_data([self.test_stock])
    self.assertEqual(data['dividends'], [1.2])
    self.assertEqual(data['returns'], [-3.617945138131155])
    self.assertEqual(data['names'], ['LL'])