def compute_graph_lasso_covariance(subject_id, group, session='func1', preprocessing_folder='pipeline_1', plot=True, save=True, save_file=True, msdl=False): """Returns graph lasso covariance for a subject_id """ # load timeseries if msdl: ts = load_dynacomp_msdl_timeseries( subject_id, session=session, preprocessing_folder=preprocessing_folder) roi_names, roi_coords = load_msdl_names_and_coords() else: ts = load_dynacomp_roi_timeseries( subject_id, session=session, preprocessing_folder=preprocessing_folder) # load rois roi_names, roi_coords = load_roi_names_and_coords(subject_id) # compute covariance gl = covariance.GraphLassoCV(verbose=2) gl.fit(ts) if plot: plot_connectivity_matrix(subject_id, group, gl.covariance_, roi_names, 'gl_covariance', session, preprocessing_folder, save, msdl) plot_connectivity_matrix(subject_id, group, gl.precision_, roi_names, 'gl_precision', session, preprocessing_folder, save, msdl) sparsity = (gl.precision_ == 0) plot_connectivity_matrix(subject_id, group, sparsity, roi_names, 'gl_sparsity', session, preprocessing_folder, save, msdl) plot_connectivity_glassbrain(subject_id, group, gl.covariance_, roi_coords, 'gl_covariance', session, preprocessing_folder, save, msdl) if save_file: CONN_DIR = set_data_base_dir('Dynacomp/connectivity') sparsity = (gl.precision_ == 0) if not os.path.isdir(os.path.join(CONN_DIR, subject_id)): os.mkdir(os.path.join(CONN_DIR, subject_id)) output_file = os.path.join( CONN_DIR, subject_id, 'gl_' + session + '_' + preprocessing_folder) if msdl: output_file += '_msdl' np.savez(output_file, covariance=gl.covariance_, precision=gl.precision_, sparsity=sparsity, roi_names=roi_names, roi_coords=roi_coords) return gl, roi_names, roi_coords
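Note that GraphLassoCV was renamed GraphicalLassoCV in scikit-learn 0.20 and the old name was removed in 0.22. Below is a minimal sketch of the covariance/precision/sparsity step used above, with synthetic data standing in for the ROI time series (the loaders and plotting helpers are project-specific and assumed to exist):

import numpy as np
from sklearn.covariance import GraphicalLassoCV  # named GraphLassoCV before scikit-learn 0.20

rng = np.random.RandomState(0)
ts = rng.randn(120, 10)  # 120 time points x 10 regions, stand-in for the real time series

gl = GraphicalLassoCV()
gl.fit(ts)

covariance_matrix = gl.covariance_    # regularized covariance
precision_matrix = gl.precision_      # sparse inverse covariance
sparsity = (precision_matrix == 0)    # zero pattern, saved as 'gl_sparsity' above
print(covariance_matrix.shape, int(sparsity.sum()))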
def affinity_propagation(df_wtarget, df_wotarget, variables):
    '''Takes a dataframe that includes our target variable, a dataframe without
    the target variable, and the list of variables to keep in the output.
    Runs an affinity propagation analysis and writes the resulting cluster
    assignments to an Excel file.
    '''
    # Learn a graphical structure from the correlations
    edge_model = covariance.GraphLassoCV()
    # standardize the time series: using correlations rather than covariance
    # is more efficient for structure recovery
    X = df_wotarget.copy().T
    X /= X.std(axis=0)
    edge_model.fit(X)

    # Cluster using affinity propagation
    _, labels = cluster.affinity_propagation(edge_model.covariance_)
    df_wtarget['score'] = labels + 1
    scores = df_wtarget.filter(items=variables)
    writer = pd.ExcelWriter('results/affinity_propagation.xlsx')
    scores.to_excel(writer)
    writer.save()
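ExcelWriter.save() was removed in pandas 2.0; on current pandas the export step above can be written with a context manager. A small sketch, assuming a scores DataFrame like the one built above:

import pandas as pd

scores = pd.DataFrame({'score': [1, 2, 1, 3]})  # placeholder for the filtered scores
with pd.ExcelWriter('results/affinity_propagation.xlsx') as writer:
    scores.to_excel(writer)  # the file is written and closed when the block exits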
def shrinkage():
    plt.imshow(cov_train_pyr[0])
    plt.colorbar()
    plt.show()
    # use independent copies: binding all four names to `cov_train` directly would
    # alias the same list, so every estimator would overwrite the others' results
    cov_train_lasso = list(cov_train)
    prec_train_lasso = list(cov_train)
    cov_train_oas = list(cov_train)
    corr_lasso = list(cov_train)
    for i in range(len(data_train)):
        cov_train_oas[i] = covariance.OAS().fit(data_train[i]).covariance_
        # plt.imshow(cov_train[i])
        # plt.colorbar()
        # plt.show()
        GLassCV = covariance.GraphLassoCV(cv=5)
        GLassCV.fit(data_train[i])  # fit once, then read both covariance and precision
        cov_train_lasso[i] = GLassCV.covariance_
        prec_train_lasso[i] = GLassCV.precision_
        corr_lasso[i] = cov2corr(prec_train_lasso[i])
        print('sum of correlations: ', np.sum(np.abs(corr_lasso[i]), axis=1))
        myalphas = GLassCV.cv_alphas_
        print(myalphas)
        print(np.mean(GLassCV.grid_scores_, axis=1))
        plt.imshow(corr_lasso[i])
        plt.colorbar()
        plt.show()
        cov_train[i] = covariance.LedoitWolf().fit(data_train[i]).covariance_
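cov2corr is not defined in this snippet (statsmodels ships one in statsmodels.stats.moment_helpers); a plausible stand-in normalizes a covariance or precision matrix by its diagonal:

import numpy as np

def cov2corr(mat):
    # hypothetical helper: scale rows/columns by the square root of the diagonal so
    # the result has unit diagonal and correlation-like off-diagonal entries
    d = np.sqrt(np.diag(mat))
    corr = mat / np.outer(d, d)
    np.fill_diagonal(corr, 1.0)
    return corr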
def find_top_genes_from_pcor_network(log2_df_cell, num_iterations, gene_index_list): alpha = 0.4 keep_genes = [] for iter_genes in range(0, num_iterations): top_pca_matrix, top_pca_genes = return_top_pca_gene( log2_df_cell, range_genes=[ gene_index_list[max(0, iter_genes - 1)], gene_index_list[iter_genes + 1] ]) mean_expr_dict = {} gl = covariance.GraphLassoCV() gene_data = scale(top_pca_matrix.as_matrix()) gl.fit(gene_data) _, labels = cluster.affinity_propagation(gl.covariance_) n_labels = labels.max() names = np.array(top_pca_genes) prec_sp = gl.precision_ matrix1 = -prec_sp + np.diag(np.diagonal(prec_sp)) D = nx.Graph(matrix1) gene_weights_dict = {} for n in names: gene_weights_dict[n] = 0 for x, y in D.edges(): gene1 = names[x] gene2 = names[y] abs_weight = abs(D[x][y]['weight']) gene_weights_dict[gene1] += abs_weight gene_weights_dict[gene2] += abs_weight clust_gene_list = [] avg_wieght_list = [] for i in range(n_labels + 1): clust_id = "cluster_" + str(i + 1) w_list = [gene_weights_dict[g] for g in names[labels == i]] clust_gene_list.append([names[labels == i]]) avg_wieght_list.append(sum(w_list) / len(w_list)) print('Cluster %i: %s' % ((i + 1), ', '.join(names[labels == i]))) if iter_genes == 0: threshold = np.mean(avg_wieght_list) - np.std(avg_wieght_list) for g_list in [ clust_gene_list[i] for i, av in enumerate(avg_wieght_list) if av >= threshold ]: keep_genes.append(np.ravel(g_list)) final_gene_list = list( set([item for sublist in keep_genes for item in sublist])) print(final_gene_list) gene_matrix_log2 = log2_df_cell.T top_matrix = gene_matrix_log2[final_gene_list] top_matrix_cell = top_matrix.T return top_matrix, top_matrix_cell, final_gene_list
def get_sparse_cov(self):
    # calculating sparse LASSO covariance
    flat = self.flatten_cube_spectral(self.Z, normalise=True)
    model = skcv.GraphLassoCV()
    model.fit(flat)
    cov_ = model.covariance_
    self.corr_mat = cov_
def learnGraphStructure(self):
    self.edge_model = covariance.GraphLassoCV()
    # standardize the time series: using correlations rather than covariance
    # is more efficient for structure recovery
    # data = self.getReturns(self.prices.as_frame())
    fx = FXClas.fxData()
    self._X = fx.getCurrencyBasketFromDB(currencies=None,
                                         periodicity='monthly',
                                         fxRisk=None)
    # self.X = data['returns']
    self._X /= self._X.std(axis=0)
    self.names = self._X.columns
    print(self)
    print(self._X)
    # fillna(..., inplace=True) returns None, so fill the missing values first
    # and pass the resulting frame to fit()
    self.edge_model.fit(self._X.fillna(np.mean(self._X)))
def make_graphlasso(top_matrix, name_list):
    gl = covariance.GraphLassoCV()
    gene_data = scale(top_matrix.as_matrix())
    gl.fit(gene_data)
    names = np.array(name_list)
    node_position_model = manifold.LocallyLinearEmbedding(n_components=2,
                                                          eigen_solver='dense',
                                                          n_neighbors=7)
    embedding = node_position_model.fit_transform(gene_data.T).T
    return gl, embedding, names
def discover_clusters(var):
    from sklearn import cluster, covariance
    # Learn a graphical structure from the correlations
    edge_model = covariance.GraphLassoCV()
    edge_model.fit(var)
    # Cluster using affinity propagation
    _, labels = cluster.affinity_propagation(edge_model.covariance_)
    n_labels = labels.max()
    for i in xrange(n_labels + 1):
        print 'Cluster %i: %s' % (i, ', '.join(var.columns[labels == i]))
    del cluster, covariance
    return labels, edge_model.precision_.copy()
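Several of the snippets below turn the returned precision matrix into partial correlations before plotting. A sketch of that normalization (the plotting code keeps only absolute values, so it skips the conventional sign flip on the off-diagonal entries):

import numpy as np

def precision_to_partial_corr(precision):
    # scale by 1/sqrt(diag); the textbook partial correlation negates the
    # off-diagonal entries, which the |value|-based plots below do not need
    d = 1.0 / np.sqrt(np.diag(precision))
    partial = -precision * d * d[:, np.newaxis]
    np.fill_diagonal(partial, 1.0)
    return partial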
def cluster_industry(industry_return_df, start_date, end_date):
    industry_return_df = industry_return_df[industry_return_df.index.map(
        lambda x: x >= start_date and x <= end_date)]
    edge_model = covariance.GraphLassoCV()
    # edge_model.fit(industry_return_df.values)
    # centers, labels = cluster.affinity_propagation(edge_model.covariance_)
    kmeans_clf = cluster.KMeans(40)
    labels = kmeans_clf.fit_predict(industry_return_df.T.values)
    cluster_res_df = pd.DataFrame({
        'industry': industry_return_df.columns,
        'label': labels
    })
    for label, tmp_df in cluster_res_df.groupby('label'):
        industry_names = " ".join(tmp_df['industry'].values)
        print("label %s: %s" % (label, industry_names))
def correlation_between_states_2(crude, states, threshold): correlations = [] # We retrieve the correlation coefficent for every keyword, across every year we explored. diff = pd.DataFrame(crude.set_index('Year').groupby('LocationAbbr')\ .apply(lambda x: x['Data_Value'].diff())) d = diff.stack().unstack(0) if crude.shape[0] > 500: d.index = d.index.droplevel(level=1) # Learn a graphical structure from the correlations edge_model = covariance.GraphLassoCV() # standardize the time series: using correlations rather than covariance # is more efficient for structure recovery d = d.loc[1998:] corr = d.values X = corr.copy() X /= X.std(axis=0) edge_model.fit(X) # Transform it in a links data frame (3 columns only): corr = pd.DataFrame(edge_model.covariance_, index=states, columns=states) links = corr.stack().reset_index() links.columns = ['var1', 'var2', 'value'] # Keep only correlation over a threshold and remove self correlation (cor(A,A)=1) links_filtered = links.loc[(links['value'] > threshold) & (links['var1'] != links['var2'])] # Build your graph G = nx.from_pandas_dataframe(links_filtered, 'var1', 'var2') #pivoting to print data links = links.pivot(index='var1', columns='var2')['value'] #computing the graph partition l = nx.laplacian_matrix(G=G) L = spr.csr_matrix(l).toarray() eig, vec = np.linalg.eig(L) sort = np.argsort(np.diff(np.abs(eig))) i = 1 v = vec[:, sort[-i]].real / sum(vec[:, sort[-i]].real) label = v.astype(int) return (links, G, label)
def analysis(): now = datetime.datetime.now() ############################################################################### d1 = datetime.datetime(2016, now.month, 1) d2 = datetime.datetime(2016, now.month, 30) symbol_dict = stocks symbols, names = np.array(list(symbol_dict.items())).T quotes = [quotes_historical_yahoo(symbol, d1, d2, asobject=True) for symbol in symbols] open = np.array([q.open for q in quotes]).astype(np.float) close = np.array([q.close for q in quotes]).astype(np.float) # The daily variations of the quotes are what carry most information variation = close - open ############################################################################### # Learn a graphical structure from the correlations edge_model = covariance.GraphLassoCV() # standardize the time series: using correlations rather than covariance # is more efficient for structure recovery X = variation.copy().T X /= X.std(axis=0) edge_model.fit(X) ############################################################################### # Cluster using affinity propagation _, labels = cluster.affinity_propagation(edge_model.covariance_) n_labels = labels.max() message = '' for i in range(n_labels + 1): message += 'Cluster %i: %s\r\n' % ((i + 1), ', '.join(names[labels == i])) return message
def computeCovar(bed, shrinkMethod, fitIndividuals): eigen = dict([]) if (shrinkMethod in ['lw', 'oas', 'l1', 'cv']): import sklearn.covariance as cov t0 = time.time() print 'Estimating shrunk covariance using', shrinkMethod, 'estimator...' if (shrinkMethod == 'lw'): covEstimator = cov.LedoitWolf(assume_centered=True, block_size=5 * bed.val.shape[0]) elif (shrinkMethod == 'oas'): covEstimator = cov.OAS(assume_centered=True) elif (shrinkMethod == 'l1'): covEstimator = cov.GraphLassoCV(assume_centered=True, verbose=True) elif (shrinkMethod == 'cv'): shrunkEstimator = cov.ShrunkCovariance(assume_centered=True) param_grid = {'shrinkage': [0.01, 0.1, 0.3, 0.5, 0.7, 0.9, 0.99]} covEstimator = sklearn.grid_search.GridSearchCV( shrunkEstimator, param_grid) else: raise Exception('unknown covariance regularizer') covEstimator.fit(bed.val[fitIndividuals, :].T) if (shrinkMethod == 'l1'): alpha = covEstimator.alpha_ print 'l1 alpha chosen:', alpha covEstimator2 = cov.GraphLasso(alpha=alpha, assume_centered=True, verbose=True) else: if (shrinkMethod == 'cv'): shrinkEstimator = clf.best_params_['shrinkage'] else: shrinkEstimator = covEstimator.shrinkage_ print 'shrinkage estimator:', shrinkEstimator covEstimator2 = cov.ShrunkCovariance(shrinkage=shrinkEstimator, assume_centered=True) covEstimator2.fit(bed.val.T) XXT = covEstimator2.covariance_ * bed.val.shape[1] print 'Done in %0.2f' % (time.time() - t0), 'seconds' else: print 'Computing kinship matrix...' t0 = time.time() XXT = symmetrize(blas.dsyrk(1.0, bed.val, lower=1)) print 'Done in %0.2f' % (time.time() - t0), 'seconds' try: shrinkParam = float(shrinkMethod) except: shrinkParam = -1 if (shrinkMethod == 'mylw'): XXT_fit = XXT[np.ix_(fitIndividuals, fitIndividuals)] sE2R = (np.sum(XXT_fit**2) - np.sum(np.diag(XXT_fit)**2)) / (bed.val.shape[1]**2) #temp = (bed.val**2).dot((bed.val.T)**2) temp = symmetrize( blas.dsyrk(1.0, bed.val[fitIndividuals, :]**2, lower=1)) sER2 = (temp.sum() - np.diag(temp).sum()) / bed.val.shape[1] shrinkParam = (sER2 - sE2R) / (sE2R * (bed.val.shape[1] - 1)) if (shrinkParam > 0): print 'shrinkage estimator:', 1 - shrinkParam XXT = (1 - shrinkParam) * XXT + bed.val.shape[ 1] * shrinkParam * np.eye(XXT.shape[0]) return XXT
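In the 'cv' branch above, clf.best_params_ refers to a name that is never defined; presumably it should read the best shrinkage from the fitted grid search (covEstimator). The snippet also imports the old sklearn.grid_search module, which was removed in scikit-learn 0.20. A hedged sketch of the same idea with the current API, on placeholder data:

import numpy as np
from sklearn.covariance import ShrunkCovariance
from sklearn.model_selection import GridSearchCV  # replaces sklearn.grid_search

X = np.random.RandomState(0).randn(200, 20)  # stand-in for bed.val[fitIndividuals, :].T

search = GridSearchCV(ShrunkCovariance(assume_centered=True),
                      {'shrinkage': [0.01, 0.1, 0.3, 0.5, 0.7, 0.9, 0.99]},
                      cv=5)
search.fit(X)  # covariance estimators expose a log-likelihood score, so CV works directly
best_shrinkage = search.best_params_['shrinkage']
final_cov = ShrunkCovariance(shrinkage=best_shrinkage, assume_centered=True).fit(X)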
symbol_dict = json.loads(f.read()) symbols, names = np.array(list(symbol_dict.items())).T quotes = [quotes_yahoo(symbol, start_date, end_date, asobject=True) for symbol in symbols] # Extract opening and closing quotes opening_quotes = np.array([quote.open for quote in quotes]).astype(np.float) closing_quotes = np.array([quote.close for quote in quotes]).astype(np.float) # The daily fluctuations of the quotes delta_quotes = closing_quotes - opening_quotes # Build a graph model from the correlations edge_model = covariance.GraphLassoCV() # Standardize the data X = delta_quotes.copy().T X /= X.std(axis=0) # Train the model with np.errstate(invalid='ignore'): edge_model.fit(X) # Build clustering model using affinity propagation _, labels = cluster.affinity_propagation(edge_model.covariance_) num_labels = labels.max() # Print the results of clustering for i in range(num_labels + 1):
def contingencyTableChi2andPOISpaceStructure(dataBunch, pred, class_mapping, dbLabel, savefigFn): '''独立性检验''' mergingData = np.hstack( (pred.reshape(-1, 1), dataBunch.target.reshape(-1, 1))) #水平组合聚类预测值和行业分类类标 targetStack = [] for i in range(len( np.array(class_mapping)[..., 0])): #按行业类标重新组织数据,每行对应行业类标所有的聚类预测值 targetStack.append(mergingData[mergingData[..., -1] == int( np.array(class_mapping)[..., 0][i])]) clusterFrequency = {} for p in targetStack: #按行业类标计算每类所有点所属聚类簇的数量(频数) clusterFrequency[(p[..., -1][0])] = [(j, np.sum(p[..., 0] == int(j)) + 1) for j in dbLabel if j != -1] #独立性检验值不能为零,因此将所有值+1 # print(clusterFrequency) CTableTarget = list(clusterFrequency.keys()) CTableIdx = np.array(list(clusterFrequency.values())) CTable = CTableIdx[..., 1] #建立用于独立性分析的列联表,横向为行业类所属聚类簇频数,纵向为行业类标 totalIndependence = chi2_contingency(CTable) #列联表的独立性检验 g, p, dof, expctd = totalIndependence #提取卡方值g,p值,自由度dof和与元数据数组同维度的对应理论值。此次实验计算p=0.00120633349692,小于0.05,因此行业分类与聚类簇相关。 print(g, p, dof) '''poi的空间分布结构。参考官方案例Visualizing the stock market structure:http://scikit-learn.org/stable/auto_examples/applications/plot_stock_market.html#sphx-glr-auto-examples-applications-plot-stock-market-py''' #A-协方差逆矩阵(精度矩阵)。The matrix inverse of the covariance matrix, often called the precision matrix, is proportional to the partial correlation matrix. It gives the partial independence relationship. In other words, if two features are independent conditionally on the others, the corresponding coefficient in the precision matrix will be zero。来自官网说明摘录 edge_model = covariance.GraphLassoCV( ) #稀疏逆协方差估计器GraphLassoCV(),翻译有待数学专业确认。官网解释:http://scikit-learn.org/stable/modules/covariance.html#sparse-inverse-covariance X = CTable.copy().T print(X, X.shape) X = X / X.std(axis=0) #标准化。可以自行实验小规模数组,查看变化,分析结果,获取结论。 print(X) edge_model.fit(X) print("******************************************************************") print(edge_model.covariance_.shape) #B-affinity_propagation(AP)聚类算法是基于数据点间"信息传递"的一种聚类算法,不用预先给出cluster簇数。聚类协方差矩阵 _, labels = cluster.affinity_propagation(edge_model.covariance_) n_labels = labels.max() print(labels) #C-Manifold中的降维方法可以能够处理数据中的非线性结构信息。具体可以查看官网http://scikit-learn.org/stable/modules/manifold.html#locally-linear-embedding。降维的目的是降到2维,作为xy坐标值,在二维图表中绘制为点。 node_position_model = manifold.LocallyLinearEmbedding(n_components=2, eigen_solver='dense', n_neighbors=6) embedding = node_position_model.fit_transform(X.T).T print(embedding.shape) '''图表可视化poi空间分布结构''' plt.figure(1, facecolor='w', figsize=(10, 8)) plt.clf() ax = plt.axes( [0., 0., 1., 1.] 
) #可以参考官方示例程序 http://matplotlib.org/examples/pylab_examples/axis_equal_demo.html plt.axis('off') # Display a graph of the partial correlations/偏相关分析:在多要素所构成的系统中,当研究某一个要素对另一个要素的影响或相关程度时,把其他要素的影响视作常数(保持不变),即暂时不考虑其他要素影响,单独研究两个要素之间的相互关系的密切程度,所得数值结果为偏相关系数。在多元相关分析中,简单相关系数可能不能够真实的反映出变量X和Y之间的相关性,因为变量之间的关系很复杂,它们可能受到不止一个变量的影响。这个时候偏相关系数是一个更好的选择。 partial_correlations = edge_model.precision_.copy() print(partial_correlations.shape) # print(partial_correlations) d = 1 / np.sqrt( np.diag(partial_correlations)) #umpy.diag()返回一个矩阵的对角线元素,计算该元素平方根的倒数。 partial_correlations *= d partial_correlations *= d[:, np.newaxis] non_zero = (np.abs(np.triu(partial_correlations, k=1)) > 0.02 ) #np.triu()返回矩阵的上三角矩阵。 # Plot the nodes using the coordinates of our embedding plt.scatter(embedding[0], embedding[1], s=300 * d**2, c=labels, cmap=plt.cm.Spectral) #簇类标用于定义节点的颜色,降维后数据作为点坐标 # Plot the edges start_idx, end_idx = np.where( non_zero ) #numpy.where(condition[, x, y])这里x,y是可选参数,condition是条件,这三个输入参数都是array_like的形式;而且三者的维度相同。当conditon的某个位置的为true时,输出x的对应位置的元素,否则选择y对应位置的元素;如果只有参数condition,则函数返回为true的元素的坐标位置信息; segments = [[embedding[:, start], embedding[:, stop]] for start, stop in zip(start_idx, end_idx)] values = np.abs(partial_correlations[non_zero]) print( "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^" ) print(len(segments)) print(len(values)) cm = plt.cm.get_cmap( 'OrRd' ) #具体的`matplotlib.colors.Colormap'实例可以查看matplotlib官网 http://matplotlib.org/users/colormaps.html,替换不同色系 lc = LineCollection(segments, zorder=0, cmap=cm, norm=plt.Normalize(0, .7 * values.max())) lc.set_array(values) lc.set_linewidths(15 * values) #定义边缘的强度。 ax.add_collection(lc) # Add a label to each node. The challenge here is that we want to position the labels to avoid overlap with other labels,添加行业分类标签,并避免标签重叠。 names = [i[-1] for i in class_mapping] for index, (name, label, (x, y)) in enumerate(zip(names, labels, embedding.T)): dx = x - embedding[0] dx[index] = 1 dy = y - embedding[1] dy[index] = 1 this_dx = dx[np.argmin(np.abs(dy))] this_dy = dy[np.argmin(np.abs(dx))] if this_dx > 0: horizontalalignment = 'left' x = x + .002 else: horizontalalignment = 'right' x = x - .002 if this_dy > 0: verticalalignment = 'bottom' y = y + .002 else: verticalalignment = 'top' y = y - .002 plt.text(x, y, name, size=10, horizontalalignment=horizontalalignment, verticalalignment=verticalalignment, bbox=dict(facecolor='w', edgecolor=plt.cm.Spectral(label / float(n_labels)), alpha=.6)) plt.xlim( embedding[0].min() - .15 * embedding[0].ptp(), embedding[0].max() + .10 * embedding[0].ptp(), ) #numpy.ptp()极差函数返回沿轴的值的范围(最大值-最小值)。 plt.ylim(embedding[1].min() - .03 * embedding[1].ptp(), embedding[1].max() + .03 * embedding[1].ptp()) plt.savefig(os.path.join(savingFig, savefigFn)) #保存打印的图表 plt.show() return CTable, np.array(partial_correlations)
def dailyStockClusters(): import datetime import os import numpy as np import pandas.io.data as web from pandas import DataFrame from matplotlib import pylab as pl from matplotlib import finance from matplotlib.collections import LineCollection from sklearn import cluster, covariance, manifold ######################################################################## ### ### This example employs several unsupervised learning techniques to ### extract the stock market structure from variations in historical quotes. ### The quantity that we use is the daily variation in quote price: ### quotes that are linked tend to co-fluctuate during a day. ### ### stocks used are all Nasdaq 100 stocks that have one year of history ### from the current date. ### ### adopted from example at: ### http://scikit-learn.org/0.14/auto_examples/applications/plot_stock_market.html ### ######################################################################## # Retrieve the data from Internet # Choose a time period reasonnably calm (not too long ago so that we get # high-tech firms, and before the 2008 crash) today = datetime.datetime.now() d1 = datetime.datetime(today.year - 1, today.month, today.day) d2 = datetime.datetime(today.year, today.month, today.day) # input symbols and company names from text file companyName_file = os.path.join(os.getcwd(), "symbols", "companyNames.txt") with open(companyName_file, "r") as f: companyNames = f.read() print "\n\n\n" companyNames = companyNames.split("\n") ii = companyNames.index("") del companyNames[ii] companySymbolList = [] companyNameList = [] symbol_dict = {} for iname, name in enumerate(companyNames): name = name.replace("amp;", "") testsymbol, testcompanyName = name.split(";") companySymbolList.append(format(testsymbol, 's')) companyNameList.append(format(testcompanyName, 's')) if testsymbol != "CASH": symbol_dict[testsymbol] = format(testcompanyName, 's') print " ... symbol_dict = ", symbol_dict symbols = companySymbolList[:] names = companyNameList[:] all_data = {} for ticker in symbols: try: all_data[ticker] = web.get_data_yahoo(ticker, d1, d2) qclose = DataFrame( {tic: data['Close'] for tic, data in all_data.iteritems()}) qopen = DataFrame( {tic: data['Open'] for tic, data in all_data.iteritems()}) except: print "Cant find ", ticker symbols_edit = [] names_edit = [] for i, ticker in enumerate(symbols): if True in np.isnan(np.array(qclose[ticker])).tolist(): print ticker, " nans found, ticker removed" del qclose[ticker] del qopen[ticker] else: symbols_edit.append(ticker) names_edit.append(names[i]) # The daily variations of the quotes are what carry most information variation = qclose - qopen variation[np.isnan(variation)] = 0. ############################################################################### # Learn a graphical structure from the correlations edge_model = covariance.GraphLassoCV() # standardize the time series: using correlations rather than covariance # is more efficient for structure recovery X = variation.copy() #X = variation.copy().T X /= X.std(axis=0) edge_model.fit(X) ############################################################################### # Cluster using affinity propagation _, labels = cluster.affinity_propagation(edge_model.covariance_) n_labels = labels.max() for i in range(n_labels + 1): print "Cluster " + str(i) + ":" for j in range(len(labels)): if labels[j] == i: print " ... 
" + names_edit[j] #print('Cluster %i: %s' % ((i + 1), ', '.join(names_edit[labels == i]))) for i in range(n_labels + 1): print "Cluster " + str(i) + ":" for j in range(len(labels)): if labels[j] == i: print " ... " + names_edit[j] figure7path = 'Clustered_companyNames.png' # re-set to name without full path figure7_htmlText = "\n<br><h3>Daily stock clustering analyis. Based on one year performance correlations.</h3>\n" figure7_htmlText = figure7_htmlText + "\nClustering based on daily variation in Nasdaq 100 quotes.\n" figure7_htmlText = figure7_htmlText + '''<br><img src="''' + figure7path + '''" alt="PyTAAA by DonaldPG" width="850" height="500"><br>\n''' ############################################################################### # Find a low-dimension embedding for visualization: find the best position of # the nodes (the stocks) on a 2D plane # We use a dense eigen_solver to achieve reproducibility (arpack is # initiated with random vectors that we don't control). In addition, we # use a large number of neighbors to capture the large-scale structure. node_position_model = manifold.LocallyLinearEmbedding(n_components=2, eigen_solver='dense', n_neighbors=6) embedding = node_position_model.fit_transform(X.T).T ############################################################################### # Visualization pl.figure(1, facecolor='w', figsize=(10, 8)) pl.clf() ax = pl.axes([0., 0., 1., 1.]) pl.axis('off') # Display a graph of the partial correlations partial_correlations = edge_model.precision_.copy() d = 1 / np.sqrt(np.diag(partial_correlations)) partial_correlations *= d partial_correlations *= d[:, np.newaxis] non_zero = (np.abs(np.triu(partial_correlations, k=1)) > 0.02) # Plot the nodes using the coordinates of our embedding pl.scatter(embedding[0], embedding[1], s=100 * d**2, c=labels, cmap=pl.cm.spectral) # Plot the edges start_idx, end_idx = np.where(non_zero) #a sequence of (*line0*, *line1*, *line2*), where:: # linen = (x0, y0), (x1, y1), ... (xm, ym) segments = [[embedding[:, start], embedding[:, stop]] for start, stop in zip(start_idx, end_idx)] values = np.abs(partial_correlations[non_zero]) lc = LineCollection(segments, zorder=0, cmap=pl.cm.hot_r, norm=pl.Normalize(0, .7 * values.max())) lc.set_array(values) lc.set_linewidths(15 * values) ax.add_collection(lc) # Add a label to each node. The challenge here is that we want to # position the labels to avoid overlap with other labels for index, (name, label, (x, y)) in enumerate(zip(names, labels, embedding.T)): dx = x - embedding[0] dx[index] = 1 dy = y - embedding[1] dy[index] = 1 this_dx = dx[np.argmin(np.abs(dy))] this_dy = dy[np.argmin(np.abs(dx))] if this_dx > 0: horizontalalignment = 'left' x = x + .002 else: horizontalalignment = 'right' x = x - .002 if this_dy > 0: verticalalignment = 'bottom' y = y + .002 else: verticalalignment = 'top' y = y - .002 pl.text(x, y, name, size=10, horizontalalignment=horizontalalignment, verticalalignment=verticalalignment, bbox=dict(facecolor='w', edgecolor=pl.cm.spectral(label / float(n_labels)), alpha=.6)) pl.xlim( embedding[0].min() - .15 * embedding[0].ptp(), embedding[0].max() + .10 * embedding[0].ptp(), ) pl.ylim(embedding[1].min() - .03 * embedding[1].ptp(), embedding[1].max() + .03 * embedding[1].ptp()) pl.savefig(os.path.join(os.getcwd(), "pyTAAA_web", "Clustered_companyNames.png"), format='png') return figure7_htmlText
def relation_plot(self, df, good_list): close_price_list = [ df[df.code == code].close.tolist() for code in good_list ] close_prices = np.vstack(close_price_list) open_price_list = [ df[df.code == code].open.tolist() for code in good_list ] open_prices = np.vstack(open_price_list) # the daily variations of the quotes are what carry most information variation = (close_prices - open_prices) * 100 / open_prices logger.info("get variation succeed") # ############################################################################# # learn a graphical structure from the correlations edge_model = covariance.GraphLassoCV() # standardize the time series: using correlations rather than covariance is more efficient for structure recovery X = variation.copy().T X /= X.std(axis=0) edge_model.fit(X) logger.info("mode compute succeed") # ############################################################################# # cluster using affinity propagation _, labels = cluster.affinity_propagation(edge_model.covariance_) n_labels = labels.max() code_list = np.array(good_list) industry_dict = dict() industry_df_info = IndustryInfo.get() for index, name in industry_df_info.name.iteritems(): content = industry_df_info.loc[index]['content'] a_code_list = json.loads(content) for code in a_code_list: industry_dict[code] = name cluster_dict = dict() for i in range(n_labels + 1): cluster_dict[i] = code_list[labels == i] name_list = [ CStockInfo.get(code, 'name') for code in code_list[labels == i] ] logger.info('cluster code %i: %s' % ((i + 1), ', '.join(name_list))) cluster_info = dict() for group, _code_list in cluster_dict.items(): for code in _code_list: iname = industry_dict[code] if group not in cluster_info: cluster_info[group] = set() cluster_info[group].add(iname) logger.info('cluster inustry %i: %s' % ((i + 1), ', '.join(list(cluster_info[group])))) # ############################################################################# # find a low-dimension embedding for visualization: find the best position of # the nodes (the stocks) on a 2D plane # we use a dense eigen_solver to achieve reproducibility (arpack is # initiated with random vectors that we don't control). In addition, we # use a large number of neighbors to capture the large-scale structure. node_position_model = manifold.LocallyLinearEmbedding( n_components=2, eigen_solver='dense', n_neighbors=6) embedding = node_position_model.fit_transform(X.T).T # ############################################################################# # visualizatio plt.figure(1, facecolor='w', figsize=(10, 8)) plt.clf() ax = plt.axes([0., 0., 1., 1.]) plt.axis('off') # display a graph of the partial correlations partial_correlations = edge_model.precision_.copy() d = 1 / np.sqrt(np.diag(partial_correlations)) partial_correlations *= d partial_correlations *= d[:, np.newaxis] non_zero = (np.abs(np.triu(partial_correlations, k=1)) > 0.02) # plot the nodes using the coordinates of our embedding plt.scatter(embedding[0], embedding[1], s=100 * d**2, c=labels, cmap=plt.cm.nipy_spectral) # plot the edges start_idx, end_idx = np.where(non_zero) # a sequence of (*line0*, *line1*, *line2*), where:: linen = (x0, y0), (x1, y1), ... (xm, ym) segments = [[embedding[:, start], embedding[:, stop]] for start, stop in zip(start_idx, end_idx)] values = np.abs(partial_correlations[non_zero]) lc = LineCollection(segments, zorder=0, cmap=plt.cm.hot_r, norm=plt.Normalize(0, .7 * values.max())) lc.set_array(values) lc.set_linewidths(15 * values) ax.add_collection(lc) # add a label to each node. 
The challenge here is that we want to position the labels to avoid overlap with other labels for index, (name, label, (x, y)) in enumerate(zip(code_list, labels, embedding.T)): dx = x - embedding[0] dx[index] = 1 dy = y - embedding[1] dy[index] = 1 this_dx = dx[np.argmin(np.abs(dy))] this_dy = dy[np.argmin(np.abs(dx))] if this_dx > 0: horizontalalignment = 'left' x = x + .002 else: horizontalalignment = 'right' x = x - .002 if this_dy > 0: verticalalignment = 'bottom' y = y + .002 else: verticalalignment = 'top' y = y - .002 plt.text(x, y, name, size=10, horizontalalignment=horizontalalignment, verticalalignment=verticalalignment, bbox=dict(facecolor='w', edgecolor=plt.cm.nipy_spectral(label / float(n_labels)), alpha=.6)) plt.xlim( embedding[0].min() - .15 * embedding[0].ptp(), embedding[0].max() + .10 * embedding[0].ptp(), ) plt.ylim(embedding[1].min() - .03 * embedding[1].ptp(), embedding[1].max() + .03 * embedding[1].ptp()) plt.savefig('/tmp/relation.png', dpi=1000)
def ClusterAnalyses(dfs, names): close_prices = np.vstack([q['Close'] for q in dfs]) open_prices = np.vstack([q['Open'] for q in dfs]) variation = close_prices - open_prices # ######################## # Learn a graphical structure from the correlations edge_model = covariance.GraphLassoCV() # standardize the time series: using correlations rather than covariance # is more efficient for structure recovery X = variation.copy().T X /= X.std(axis=0) edge_model.fit(X) # ######################### # Cluster using affinity propagation _, labels = cluster.affinity_propagation(edge_model.covariance_) n_labels = labels.max() for i in range(n_labels + 1): print('Cluster %i: %s' % ((i + 1), ', '.join(names[labels == i]))) # ######################### # Find a low-dimension embedding for visualization: find the best position of # the nodes (the stocks) on a 2D plane # We use a dense eigen_solver to achieve reproducibility (arpack is # initiated with random vectors that we don't control). In addition, we # use a large number of neighbors to capture the large-scale structure. node_position_model = manifold.LocallyLinearEmbedding( n_components=2, eigen_solver='dense', n_neighbors=6) embedding = node_position_model.fit_transform(X.T).T # ######################### # Visualization plt.figure(1, facecolor='w', figsize=(10, 8)) plt.clf() ax = plt.axes([0., 0., 1., 1.]) plt.axis('off') # Display a graph of the partial correlations partial_correlations = edge_model.precision_.copy() d = 1 / np.sqrt(np.diag(partial_correlations)) partial_correlations *= d partial_correlations *= d[:, np.newaxis] non_zero = (np.abs(np.triu(partial_correlations, k=1)) > 0.02) # Plot the nodes using the coordinates of our embedding plt.scatter(embedding[0], embedding[1], s=100 * d ** 2, c=labels, cmap=plt.cm.spectral) # Plot the edges start_idx, end_idx = np.where(non_zero) # a sequence of (*line0*, *line1*, *line2*), where:: # linen = (x0, y0), (x1, y1), ... (xm, ym) segments = [[embedding[:, start], embedding[:, stop]] for start, stop in zip(start_idx, end_idx)] values = np.abs(partial_correlations[non_zero]) lc = LineCollection(segments, zorder=0, cmap=plt.cm.viridis, norm=plt.Normalize(0, .7 * values.max())) lc.set_array(values) lc.set_linewidths(15 * values) ax.add_collection(lc) # Add a label to each node. The challenge here is that we want to # position the labels to avoid overlap with other labels for index, (name, label, (x, y)) in enumerate( zip(names, labels, embedding.T)): dx = x - embedding[0] dx[index] = 1 dy = y - embedding[1] dy[index] = 1 this_dx = dx[np.argmin(np.abs(dy))] this_dy = dy[np.argmin(np.abs(dx))] if this_dx > 0: horizontalalignment = 'left' x = x + .002 else: horizontalalignment = 'right' x = x - .002 if this_dy > 0: verticalalignment = 'bottom' y = y + .002 else: verticalalignment = 'top' y = y - .002 plt.text(x, y, name, size=10, horizontalalignment=horizontalalignment, verticalalignment=verticalalignment, bbox=dict(facecolor='w', edgecolor=plt.cm.spectral(label / float(n_labels)), alpha=.6)) plt.xlim(embedding[0].min() - .15 * embedding[0].ptp(), embedding[0].max() + .10 * embedding[0].ptp(),) plt.ylim(embedding[1].min() - .03 * embedding[1].ptp(), embedding[1].max() + .03 * embedding[1].ptp()) plt.show()
def plot_graph(settings=None, macro_data_z=None, negate_fields=None): symbols = np.array(settings['data_fieldnames']).T graph_data = macro_data_z[macro_data_z.index > settings['common_start_date'] ][settings['data_fields']].iloc[2:] if negate_fields is not None: graph_data[negate_fields] = -graph_data[negate_fields] graph_data = graph_data.rolling(window=3, center=False).sum() variation = graph_data.values.T ############################################################################### # Learn a graphical structure from the correlations edge_model = covariance.GraphLassoCV() # standardize the time series: using correlations rather than covariance # is more efficient for structure recovery X = variation.copy().T X /= X.std(axis=0) edge_model.fit(X) ############################################################################### # Cluster using affinity propagation _, labels = cluster.affinity_propagation(edge_model.covariance_) n_labels = labels.max() for i in range(n_labels + 1): print('Cluster %i: %s' % ((i + 1), ', '.join(symbols[labels == i]))) ############################################################################### # Find a low-dimension embedding for visualization: find the best position of # the nodes (the stocks) on a 2D plane from sklearn.decomposition import kernel_pca # node_position_model = manifold.LocallyLinearEmbedding( # n_components=2, eigen_solver='dense', n_neighbors=8) # node_position_model = KernelPCA(kernel='rbf', # fit_inverse_transform=True, # gamma=10, # n_components=2) node_position_model = manifold.SpectralEmbedding(n_components=2, n_neighbors=6) # node_position_model = PCA(n_components=2) embedding = node_position_model.fit_transform(X.T).T # embedding = components[[0, 1]].values.T f1 = 0 f2 = 1 ############################################################################### # Visualization plt.figure(1, facecolor='w', figsize=(12, 6)) plt.clf() ax = plt.axes([0., 0., 1., 1.]) # plt.axis('off') # ax.set_axis_bgcolor('k') # Display a graph of the partial correlations partial_correlations = edge_model.precision_.copy() d = 1 / np.sqrt(np.diag(partial_correlations)) partial_correlations *= d partial_correlations *= d[:, np.newaxis] non_zero = (np.abs(np.triu(partial_correlations, k=1)) > 0.02) # Plot the nodes using the coordinates of our embedding plt.scatter(embedding[f1], embedding[f2], s=100 * d ** 2, c=labels, cmap=plt.cm.coolwarm) # Plot the edges start_idx, end_idx = np.where(non_zero) segments = [[embedding[[f1, f2], start], embedding[[f1, f2], stop]] for start, stop in zip(start_idx, end_idx)] values = np.abs(partial_correlations[non_zero]) lc = LineCollection(segments, zorder=0, cmap=plt.cm.coolwarm, norm=plt.Normalize(0, .7 * np.sqrt(values.max()))) lc.set_array(np.sqrt(values)) lc.set_linewidths(15 * np.sqrt(values)) ax.add_collection(lc) # Add a label to each node. 
The challenge here is that we want to # position the labels to avoid overlap with other labels label_offset = 0.002 for index, (name, label, (f_1, f_2)) in enumerate( zip(symbols, labels, embedding.T)): if f1 == 0: x = f_1 if f1 == 1: x = f_2 if f2 == 0: y = f_1 if f2 == 1: y = f_2 dx = x - embedding[f1] dx[index] = 1 dy = y - embedding[f2] dy[index] = 1 this_dx = dx[np.argmin(np.abs(dy))] this_dy = dy[np.argmin(np.abs(dx))] if this_dx > 0: horizontalalignment = 'left' x += label_offset else: horizontalalignment = 'right' x -= label_offset if this_dy > 0: verticalalignment = 'bottom' y += label_offset else: verticalalignment = 'top' y -= label_offset plt.text(x, y, name, size=10, horizontalalignment=horizontalalignment, verticalalignment=verticalalignment, bbox=dict(facecolor='w', edgecolor=plt.cm.spectral(label / float(n_labels)), alpha=.6)) plt.xlim(embedding[f1].min() - .15 * embedding[f1].ptp(), embedding[f1].max() + .10 * embedding[f1].ptp(),) plt.ylim(embedding[f2].min() - .03 * embedding[f2].ptp(), embedding[f2].max() + .03 * embedding[f2].ptp()) plt.show() plt.savefig('figures/macro_graph.png', facecolor='w', edgecolor='w', transparent=True)
def kluster(form): try: tickerA = web.DataReader(form.tickerA + '.sa', data_source='yahoo')[-252:] tickerB = web.DataReader(form.tickerB + '.sa', data_source='yahoo')[-252:] tickerC = web.DataReader(form.tickerC + '.sa', data_source='yahoo')[-252:] tickerD = web.DataReader(form.tickerD + '.sa', data_source='yahoo')[-252:] tickerE = web.DataReader(form.tickerE + '.sa', data_source='yahoo')[-252:] barchart = [tickerA, tickerB, tickerC, tickerD, tickerE] names = [ form.tickerA, form.tickerB, form.tickerC, form.tickerD, form.tickerE ] quotes = [] for item in barchart: portfolio = pd.DataFrame(item) quotes.append(portfolio) names = pd.DataFrame(names).T opening_quotes = np.array([quote.Open for quote in quotes]).astype(np.float) closing_quotes = np.array([quote.Close for quote in quotes]).astype(np.float) delta_quotes = closing_quotes - opening_quotes edge_model = covariance.GraphLassoCV() X = delta_quotes.copy().T X /= X.std(axis=0) with np.errstate(invalid='ignore'): edge_model.fit(X) from sklearn import cluster _, labels = cluster.affinity_propagation(edge_model.covariance_) num_labels = labels.max() k = [] for i in range(num_labels + 1): try: cluster = (i + 1, ', '.join(names.T[0][labels == i])) k.append(cluster) except Exception: pass # or you could use 'continue' kluster = pd.DataFrame(list(k)) kluster.columns = ['Cluster', 'Ticker'] kluster = kluster.to_html(index=False, columns=['Cluster', 'Ticker']) except Exception: return render_to_response('project/apologies.html') return render_to_response('cluster.html', context={'kluster': kluster})
def getStockMarketStructure(symbol_dict): # Choose a time period reasonnably calm (not too long ago so that we get # high-tech firms, and before the 2008 crash) d1 = datetime.datetime(2009, 1, 1) d2 = datetime.datetime(2011, 1, 1) #d1 = datetime.datetime.now() - timedelta(days=365*2) #d2 = datetime.datetime.now()- timedelta(days=1) # kraft symbol has now changed from KFT to MDLZ in yahoo symbols, names = np.array(list(symbol_dict.items())).T quotes = [finance.quotes_historical_yahoo(symbol, d1, d2, asobject=True) for symbol in symbols] open = np.array([q.open for q in quotes]).astype(np.float) close = np.array([q.close for q in quotes]).astype(np.float) # The daily variations of the quotes are what carry most information variation = close - open ############################################################################### # Learn a graphical structure from the correlations edge_model = covariance.GraphLassoCV() # standardize the time series: using correlations rather than covariance # is more efficient for structure recovery X = variation.copy().T X /= X.std(axis=0) edge_model.fit(X) ############################################################################### # Cluster using affinity propagation _, labels = cluster.affinity_propagation(edge_model.covariance_) n_labels = labels.max() for i in range(n_labels + 1): print('Cluster %i: %s' % ((i + 1), ', '.join(names[labels == i]))) ############################################################################### # Find a low-dimension embedding for visualization: find the best position of # the nodes (the stocks) on a 2D plane # We use a dense eigen_solver to achieve reproducibility (arpack is # initiated with random vectors that we don't control). In addition, we # use a large number of neighbors to capture the large-scale structure. node_position_model = manifold.LocallyLinearEmbedding( n_components=2, eigen_solver='dense', n_neighbors=6) embedding = node_position_model.fit_transform(X.T).T ############################################################################### # Visualization plt.figure(1, facecolor='w', figsize=(10, 8)) plt.clf() ax = plt.axes([0., 0., 1., 1.]) plt.axis('off') # Display a graph of the partial correlations partial_correlations = edge_model.precision_.copy() d = 1 / np.sqrt(np.diag(partial_correlations)) partial_correlations *= d partial_correlations *= d[:, np.newaxis] non_zero = (np.abs(np.triu(partial_correlations, k=1)) > 0.02) # Plot the nodes using the coordinates of our embedding plt.scatter(embedding[0], embedding[1], s=100 * d ** 2, c=labels, cmap=plt.cm.spectral) # Plot the edges start_idx, end_idx = np.where(non_zero) #a sequence of (*line0*, *line1*, *line2*), where:: # linen = (x0, y0), (x1, y1), ... (xm, ym) segments = [[embedding[:, start], embedding[:, stop]] for start, stop in zip(start_idx, end_idx)] values = np.abs(partial_correlations[non_zero]) lc = LineCollection(segments, zorder=0, cmap=plt.cm.hot_r, norm=plt.Normalize(0, .7 * values.max())) lc.set_array(values) lc.set_linewidths(15 * values) ax.add_collection(lc) # Add a label to each node. 
The challenge here is that we want to # position the labels to avoid overlap with other labels for index, (name, label, (x, y)) in enumerate( zip(names, labels, embedding.T)): dx = x - embedding[0] dx[index] = 1 dy = y - embedding[1] dy[index] = 1 this_dx = dx[np.argmin(np.abs(dy))] this_dy = dy[np.argmin(np.abs(dx))] if this_dx > 0: horizontalalignment = 'left' x = x + .002 else: horizontalalignment = 'right' x = x - .002 if this_dy > 0: verticalalignment = 'bottom' y = y + .002 else: verticalalignment = 'top' y = y - .002 plt.text(x, y, name, size=10, horizontalalignment=horizontalalignment, verticalalignment=verticalalignment, bbox=dict(facecolor='w', edgecolor=plt.cm.spectral(label / float(n_labels)), alpha=.6)) plt.xlim(embedding[0].min() - .15 * embedding[0].ptp(), embedding[0].max() + .10 * embedding[0].ptp(),) plt.ylim(embedding[1].min() - .03 * embedding[1].ptp(), embedding[1].max() + .03 * embedding[1].ptp()) #plt.show() filename_1 = id_generator()+'.svg' plt.savefig(filename_1) return filename_1
print("-- Loading raw data ({0:d}) and masking ...".format(subject_n)) regions_img = nisl.datasets.load_harvard_oxford("cort-maxprob-thr25-2mm", symmetric_split=True) print("-- Computing confounds ...") # Compcor on full image hv_confounds = nisl.image.high_variance_confounds(filename) mvt_confounds = np.loadtxt(confound_file, skiprows=1) confounds = np.hstack((hv_confounds, mvt_confounds)) print("-- Computing region signals ...") region_ts, _ = nisl.region.img_to_signals_labels(filename, regions_img) region_ts = nisl.signal.clean(region_ts, low_pass=None, detrend=True, standardize=True, confounds=confounds, t_r=2.5, high_pass=0.01) print("-- Computing covariance matrices ...") estimator = covariance.GraphLassoCV() estimator.fit(region_ts) plot_matrices(estimator.covariance_, -estimator.precision_, title="Graph Lasso CV ({0:.3f})".format(estimator.alpha_), subject_n=subject_n) pl.show()
def stock_structure_demo(): start_date = datetime(2005, 1, 1).date() end_date = datetime(2008, 1, 1).date() symbol_dict = { 'NYSE:TOT': 'Total', 'NYSE:XOM': 'Exxon', 'NYSE:CVX': 'Chevron', 'NYSE:COP': 'ConocoPhillips', 'NYSE:VLO': 'Valero Energy', 'NASDAQ:MSFT': 'Microsoft', 'NYSE:IBM': 'IBM', 'NYSE:TWX': 'Time Warner', 'NASDAQ:CMCSA': 'Comcast', 'NYSE:CVC': 'Cablevision', 'NASDAQ:YHOO': 'Yahoo', 'NASDAQ:DELL': 'Dell', 'NYSE:HPQ': 'HP', 'NASDAQ:AMZN': 'Amazon', 'NYSE:TM': 'Toyota', 'NYSE:CAJ': 'Canon', 'NYSE:SNE': 'Sony', 'NYSE:F': 'Ford', 'NYSE:HMC': 'Honda', 'NYSE:NAV': 'Navistar', 'NYSE:NOC': 'Northrop Grumman', 'NYSE:BA': 'Boeing', 'NYSE:KO': 'Coca Cola', 'NYSE:MMM': '3M', 'NYSE:MCD': 'McDonald\'s', 'NYSE:PEP': 'Pepsi', 'NYSE:K': 'Kellogg', 'NYSE:UN': 'Unilever', 'NASDAQ:MAR': 'Marriott', 'NYSE:PG': 'Procter Gamble', 'NYSE:CL': 'Colgate-Palmolive', 'NYSE:GE': 'General Electrics', 'NYSE:WFC': 'Wells Fargo', 'NYSE:JPM': 'JPMorgan Chase', 'NYSE:AIG': 'AIG', 'NYSE:AXP': 'American express', 'NYSE:BAC': 'Bank of America', 'NYSE:GS': 'Goldman Sachs', 'NASDAQ:AAPL': 'Apple', 'NYSE:SAP': 'SAP', 'NASDAQ:CSCO': 'Cisco', 'NASDAQ:TXN': 'Texas Instruments', 'NYSE:XRX': 'Xerox', 'NYSE:WMT': 'Wal-Mart', 'NYSE:HD': 'Home Depot', 'NYSE:GSK': 'GlaxoSmithKline', 'NYSE:PFE': 'Pfizer', 'NYSE:SNY': 'Sanofi-Aventis', 'NYSE:NVS': 'Novartis', 'NYSE:KMB': 'Kimberly-Clark', 'NYSE:R': 'Ryder', 'NYSE:GD': 'General Dynamics', 'NYSE:RTN': 'Raytheon', 'NYSE:CVS': 'CVS', 'NYSE:CAT': 'Caterpillar', 'NYSE:DD': 'DuPont de Nemours', 'NYSE:ABB': 'ABB' } symbols, names = np.array(sorted(symbol_dict.items())).T # retry is used because quotes_historical_google can temporarily fail # for various reasons (e.g. empty result from Google API). quotes = [] for symbol in symbols: print('Fetching quote history for %r' % symbol, file=sys.stderr) quotes.append( retry(quotes_historical_google)(symbol, start_date, end_date)) close_prices = np.vstack([q['close'] for q in quotes]) open_prices = np.vstack([q['open'] for q in quotes]) # The daily variations of the quotes are what carry most information variation = close_prices - open_prices # ############################################################################# # Learn a graphical structure from the correlations edge_model = covariance.GraphLassoCV() # standardize the time series: using correlations rather than covariance # is more efficient for structure recovery X = variation.copy().T X /= X.std(axis=0) edge_model.fit(X) # ############################################################################# # Cluster using affinity propagation _, labels = cluster.affinity_propagation(edge_model.covariance_) n_labels = labels.max() for i in range(n_labels + 1): print('Cluster %i: %s' % ((i + 1), ', '.join(names[labels == i]))) # ############################################################################# # Find a low-dimension embedding for visualization: find the best position of # the nodes (the stocks) on a 2D plane # We use a dense eigen_solver to achieve reproducibility (arpack is # initiated with random vectors that we don't control). In addition, we # use a large number of neighbors to capture the large-scale structure. 
node_position_model = manifold.LocallyLinearEmbedding(n_components=2, eigen_solver='dense', n_neighbors=6) embedding = node_position_model.fit_transform(X.T).T # ############################################################################# # Visualization plt.figure(1, facecolor='w', figsize=(10, 8)) plt.clf() ax = plt.axes([0., 0., 1., 1.]) plt.axis('off') # Display a graph of the partial correlations partial_correlations = edge_model.precision_.copy() d = 1 / np.sqrt(np.diag(partial_correlations)) partial_correlations *= d partial_correlations *= d[:, np.newaxis] non_zero = (np.abs(np.triu(partial_correlations, k=1)) > 0.02) # Plot the nodes using the coordinates of our embedding plt.scatter(embedding[0], embedding[1], s=100 * d**2, c=labels, cmap=plt.cm.spectral) # Plot the edges start_idx, end_idx = np.where(non_zero) # a sequence of (*line0*, *line1*, *line2*), where:: # linen = (x0, y0), (x1, y1), ... (xm, ym) segments = [[embedding[:, start], embedding[:, stop]] for start, stop in zip(start_idx, end_idx)] values = np.abs(partial_correlations[non_zero]) lc = LineCollection(segments, zorder=0, cmap=plt.cm.hot_r, norm=plt.Normalize(0, .7 * values.max())) lc.set_array(values) lc.set_linewidths(15 * values) ax.add_collection(lc) # Add a label to each node. The challenge here is that we want to # position the labels to avoid overlap with other labels for index, (name, label, (x, y)) in enumerate(zip(names, labels, embedding.T)): dx = x - embedding[0] dx[index] = 1 dy = y - embedding[1] dy[index] = 1 this_dx = dx[np.argmin(np.abs(dy))] this_dy = dy[np.argmin(np.abs(dx))] if this_dx > 0: horizontalalignment = 'left' x = x + .002 else: horizontalalignment = 'right' x = x - .002 if this_dy > 0: verticalalignment = 'bottom' y = y + .002 else: verticalalignment = 'top' y = y - .002 plt.text(x, y, name, size=10, horizontalalignment=horizontalalignment, verticalalignment=verticalalignment, bbox=dict(facecolor='w', edgecolor=plt.cm.spectral(label / float(n_labels)), alpha=.6)) plt.xlim( embedding[0].min() - .15 * embedding[0].ptp(), embedding[0].max() + .10 * embedding[0].ptp(), ) plt.ylim(embedding[1].min() - .03 * embedding[1].ptp(), embedding[1].max() + .03 * embedding[1].ptp()) plt.show()
# try:
df = ts.get_hist_data(hs300_code[i], start='2017-07-23', end='2018-07-23')
# print(str(hs300_code[i]) + ':' + str(df.shape))
# print(df)
if df.shape == (245, 13):
    df_hs300[str(hs300_code[i])] = df['price_change']
    names.append(hs300_name[i])
# except:
#     print('unknown error occurred')

names = np.array(names)
variation = np.array(df_hs300)
# print(variation)
# print(df_hs300)
X = variation.copy()
X /= X.std(axis=0)
edge_model = covariance.GraphLassoCV()  # build the sparse inverse covariance estimator
edge_model.fit(X)
_, labels = cluster.affinity_propagation(edge_model.covariance_)  # run the clustering
n_labels = labels.max()
for i in range(n_labels + 1):
    # print('Cluster %i: %s' % ((i + 1), ', '.join(names[labels == i])))
    print('Cluster' + str(i + 1) + ','.join(names[labels == i]))
node_position_model = manifold.LocallyLinearEmbedding(n_components=2,
                                                      eigen_solver='dense',
                                                      n_neighbors=6)
embedding = node_position_model.fit_transform(X.T).T
standardize=True, memory=mem, memory_level=1, verbose=1) region_ts = masker.fit_transform( func_filename, confounds=[hv_confounds, confound_filename]) subjects.append(region_ts) # Computing group-sparse precision matrices ################################### print("-- Computing group-sparse precision matrices ...") from nilearn.group_sparse_covariance import GroupSparseCovarianceCV gsc = GroupSparseCovarianceCV(verbose=2, n_jobs=3) gsc.fit(subjects) print("-- Computing graph-lasso precision matrices ...") from sklearn import covariance gl = covariance.GraphLassoCV(n_jobs=3) gl.fit(subjects[plotted_subject]) # Displaying results ########################################################## print("-- Displaying results") title = "{0:d} GroupSparseCovariance $\\alpha={1:.2e}$".format( plotted_subject, gsc.alpha_) plot_matrices(gsc.covariances_[..., plotted_subject], gsc.precisions_[..., plotted_subject], title) title = "{0:d} GraphLasso $\\alpha={1:.2e}$".format(plotted_subject, gl.alpha_) plot_matrices(gl.covariance_, gl.precision_, title) plt.show()
def cluster_data(data): names = data.columns edge_model = covariance.GraphLassoCV() data = np.array(data) X = data.copy().T X /= X.std(axis=0) edge_model.fit(X) _, labels = cluster.affinity_propagation(edge_model.covariance_) n_labels = labels.max() for i in range(n_labels + 1): print('Cluster %i: %s' % ((i + 1), ', '.join(names[labels == i]))) #Visualization node_position_model = manifold.LocallyLinearEmbedding(n_components=2, eigen_solver='dense', n_neighbors=6) embedding = node_position_model.fit_transform(X.T).T plt.figure(1, facecolor='w', figsize=(10, 8)) plt.clf() ax = plt.axes([0., 0., 1., 1.]) plt.axis('off') # Display a graph of the partial correlations partial_correlations = edge_model.precision_.copy() d = 1 / np.sqrt(np.diag(partial_correlations)) partial_correlations *= d partial_correlations *= d[:, np.newaxis] non_zero = (np.abs(np.triu(partial_correlations, k=1)) > 0.02) # Plot the nodes using the coordinates of our embedding plt.scatter(embedding[0], embedding[1], s=100 * d**2, c=labels, cmap=plt.cm.spectral) # Plot the edges start_idx, end_idx = np.where(non_zero) #a sequence of (*line0*, *line1*, *line2*), where:: # linen = (x0, y0), (x1, y1), ... (xm, ym) segments = [[embedding[:, start], embedding[:, stop]] for start, stop in zip(start_idx, end_idx)] values = np.abs(partial_correlations[non_zero]) lc = LineCollection(segments, zorder=0, cmap=plt.cm.hot_r, norm=plt.Normalize(0, .7 * values.max())) lc.set_array(values) lc.set_linewidths(15 * values) ax.add_collection(lc) # Add a label to each node. The challenge here is that we want to # position the labels to avoid overlap with other labels for index, (name, label, (x, y)) in enumerate(zip(names, labels, embedding.T)): name = str(name).decode('utf-8').encode('utf-8') dx = x - embedding[0] dx[index] = 1 dy = y - embedding[1] dy[index] = 1 this_dx = dx[np.argmin(np.abs(dy))] this_dy = dy[np.argmin(np.abs(dx))] if this_dx > 0: horizontalalignment = 'left' x = x + .002 else: horizontalalignment = 'right' x = x - .002 if this_dy > 0: verticalalignment = 'bottom' y = y + .002 else: verticalalignment = 'top' y = y - .002 plt.text(x, y, name, size=10, horizontalalignment=horizontalalignment, verticalalignment=verticalalignment, bbox=dict(facecolor='w', edgecolor=plt.cm.spectral(label / float(n_labels)), alpha=.6)) plt.xlim( embedding[0].min() - .15 * embedding[0].ptp(), embedding[0].max() + .10 * embedding[0].ptp(), ) plt.ylim(embedding[1].min() - .03 * embedding[1].ptp(), embedding[1].max() + .03 * embedding[1].ptp()) plt.show()
np.matrix(np.eye(class_ix * 5))) X = np.random.multivariate_normal(mean=np.zeros(p), cov=C, size=ni) #ml_glassocv.fit(X) #Theta = ml_glassocv.get_precision() A_c.append(A) C_c.append(C) X_c.append(X) #Theta_t.append(Theta) A_list.append(A_c) C_list.append(C_c) X_list.append(X_c) #Theta_glassocv_list.append(Theta_t) #------------------------------------------------------------------------------------------------------------------------------------------------- #-------------------------------- Graphical Lasso --------------------------------- ml_glassocv = cov.GraphLassoCV(assume_centered=False) Theta_glassocv_list = [] for class_ix in range(len_class): Theta_c = [] for time_ix in range(len_t): ml_glassocv.fit(X_list[class_ix][time_ix]) Theta = ml_glassocv.get_precision() Theta_c.append(Theta) Theta_glassocv_list.append(Theta_c) # F1 score for graphical lasso for class_ix in range(len_class): for time_ix in range(len_t): getF1(A_list[class_ix][time_ix], Theta_glassocv_list[class_ix][time_ix]) # print(getF1(A_list[class_ix][time_ix], Theta_glassocv_list[class_ix][time_ix]))
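getF1 is not defined in this snippet; a plausible implementation scores how well the nonzero off-diagonal pattern of the estimated precision matrix recovers the true adjacency:

import numpy as np
from sklearn.metrics import f1_score

def getF1(A_true, Theta_est, tol=1e-8):
    # hypothetical helper: F1 of edge-support recovery, ignoring the diagonal
    p = np.asarray(A_true).shape[0]
    mask = ~np.eye(p, dtype=bool)
    y_true = (np.abs(np.asarray(A_true)[mask]) > tol).astype(int)
    y_pred = (np.abs(np.asarray(Theta_est)[mask]) > tol).astype(int)
    return f1_score(y_true, y_pred)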
# Computing some confounds hv_confounds = mem.cache(image.high_variance_confounds)(func_filename) region_ts = masker.transform(func_filename, confounds=[hv_confounds, confound_filename]) subject_time_series.append(region_ts) ############################################################################## # Computing group-sparse precision matrices from nilearn.connectome import GroupSparseCovarianceCV gsc = GroupSparseCovarianceCV(verbose=2) gsc.fit(subject_time_series) from sklearn import covariance gl = covariance.GraphLassoCV(verbose=2) gl.fit(np.concatenate(subject_time_series)) ############################################################################## # Displaying results atlas_imgs = image.iter_img(msdl_atlas_dataset.maps) atlas_region_coords = [plotting.find_xyz_cut_coords(img) for img in atlas_imgs] plotting.plot_connectome(gl.covariance_, atlas_region_coords, edge_threshold='90%', title="Covariance", display_mode="lzr") plotting.plot_connectome(-gl.precision_, atlas_region_coords, edge_threshold='90%',
def process(start_date, end_date, interesting_stock): quotes = [] # print("Downloading data...") for symbol in symbols: quotes.append(quotes_historical_yahoo_ochl(symbol, start_date, end_date, asobject=True)) if min([len(q.open) for q in quotes]) != max([len(q.open) for q in quotes]): for q, name in zip(quotes, names): print("%d: %s" % (len(q.open), name)) print("Different length of quotes for different stocks") open = np.array([q.open for q in quotes]).astype(np.float) close = np.array([q.close for q in quotes]).astype(np.float) # The daily variations of the quotes are what carry most information variation = close - open # print("Finding similarities...") edge_model = covariance.GraphLassoCV() # standardize the time series: using correlations rather than covariance # is more efficient for structure recovery X = variation.copy().T X /= X.std(axis=0) edge_model.fit(X) _, labels = cluster.affinity_propagation(edge_model.covariance_) n_labels = labels.max() for i in range(n_labels + 1): cluster_desc = 'Cluster %i: %s' % ((i + 1), ', '.join(names[labels == i])) if interesting_stock is None or interesting_stock in cluster_desc: print(cluster_desc) def plot(): # We use a dense eigen_solver to achieve reproducibility (arpack is # initiated with random vectors that we don't control). In addition, we # use a large number of neighbors to capture the large-scale structure. node_position_model = manifold.LocallyLinearEmbedding( n_components=2, eigen_solver='dense', n_neighbors=6) embedding = node_position_model.fit_transform(X.T).T plt.figure(1, facecolor='w', figsize=(10, 8)) plt.clf() ax = plt.axes([0., 0., 1., 1.]) plt.axis('off') # Display a graph of the partial correlations partial_correlations = edge_model.precision_.copy() d = 1 / np.sqrt(np.diag(partial_correlations)) partial_correlations *= d partial_correlations *= d[:, np.newaxis] non_zero = (np.abs(np.triu(partial_correlations, k=1)) > 0.02) # Plot the nodes using the coordinates of our embedding plt.scatter(embedding[0], embedding[1], s=100 * d ** 2, c=labels, cmap=plt.cm.spectral) # Plot the edges start_idx, end_idx = np.where(non_zero) #a sequence of (*line0*, *line1*, *line2*), where:: # linen = (x0, y0), (x1, y1), ... (xm, ym) segments = [[embedding[:, start], embedding[:, stop]] for start, stop in zip(start_idx, end_idx)] values = np.abs(partial_correlations[non_zero]) lc = LineCollection(segments, zorder=0, cmap=plt.cm.hot_r, norm=plt.Normalize(0, .7 * values.max())) lc.set_array(values) lc.set_linewidths(15 * values) ax.add_collection(lc) # Add a label to each node. The challenge here is that we want to # position the labels to avoid overlap with other labels for index, (name, label, (x, y)) in enumerate( zip(names, labels, embedding.T)): dx = x - embedding[0] dx[index] = 1 dy = y - embedding[1] dy[index] = 1 this_dx = dx[np.argmin(np.abs(dy))] this_dy = dy[np.argmin(np.abs(dx))] if this_dx > 0: horizontalalignment = 'left' x = x + .002 else: horizontalalignment = 'right' x = x - .002 if this_dy > 0: verticalalignment = 'bottom' y = y + .002 else: verticalalignment = 'top' y = y - .002 plt.text(x, y, name, size=10, horizontalalignment=horizontalalignment, verticalalignment=verticalalignment, bbox=dict(facecolor='w', edgecolor=plt.cm.spectral(label / float(n_labels)), alpha=.6)) plt.xlim(embedding[0].min() - .15 * embedding[0].ptp(), embedding[0].max() + .10 * embedding[0].ptp(),) plt.ylim(embedding[1].min() - .03 * embedding[1].ptp(), embedding[1].max() + .03 * embedding[1].ptp()) plt.show() plot()
    # (inside the per-station loop whose header is not part of this fragment)
    print('Fetching station: {}, with error variance: {}'.format(
        stn, metric.var()))
    values.append(metric.squeeze())

# =========================================================================
# standardize the time series: using correlations rather than covariance
# is more efficient for structure recovery
stn_id = np.array(stn_id)
X = np.stack(values, axis=-1)
X /= X.std(axis=0)
print(np.shape(X))

# Learn a graphical structure from the correlations
edge_model = covariance.GraphLassoCV(n_refinements=10, cv=5)
edge_model.fit(X)
# Avoid rebinding the name `covariance`, which would shadow the sklearn module
cov_matrix = edge_model.covariance_
partial_correlations = edge_model.precision_
print(27 * '=', 'covariance matrix', 27 * '=')
print(cov_matrix)
print(73 * '=')

# =========================================================================
# Cluster using affinity propagation / Kmeans methods
# unknown number of clusters:
estimator = AffinityPropagation(verbose=True)
# known number of clusters:
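A hedged sketch of the clustering step this fragment leads into: affinity propagation on the learned covariance when the number of clusters is unknown, or KMeans on the standardized series when it is known. cluster_stations is an illustrative helper, not part of the original script; cov_matrix, X and stn_id are the objects built above.

# Illustrative helper continuing the fragment above.
import numpy as np
from sklearn.cluster import AffinityPropagation, KMeans


def cluster_stations(cov_matrix, X, stn_id, n_clusters=None):
    if n_clusters is None:
        # Unknown number of clusters: affinity propagation, treating the
        # learned covariance as a precomputed affinity between stations.
        labels = AffinityPropagation(affinity='precomputed').fit(cov_matrix).labels_
    else:
        # Known number of clusters: KMeans on the stations (columns of X).
        labels = KMeans(n_clusters=n_clusters).fit(X.T).labels_
    for i in range(labels.max() + 1):
        print('Cluster %i: %s' % (i + 1, ', '.join(stn_id[labels == i])))
    return labels


labels = cluster_stations(cov_matrix, X, stn_id)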
def affinity_propagation_network(X, names=None):
    """Cluster the rows of X by affinity propagation on their correlation
    structure, print the contents of each cluster, and draw a labeled network
    of the result, with darker edges for more strongly correlated pairs.

    X can be an array or a pandas DataFrame. `names` are labels for the rows:
    they are taken from the DataFrame index, from the column named by `names`,
    or default to 0..n-1 otherwise.

    Very lightly adapted from
    http://scikit-learn.org/stable/auto_examples/applications/plot_stock_market.html#example-applications-plot-stock-market-py
    Author: Gael Varoquaux
    License: BSD 3 clause

    The output of the three models is combined in a 2D graph where nodes
    represent the clustered variables and edges the connections between them:

    * cluster labels are used to define the color of the nodes
    * the sparse covariance model is used to display the strength of the edges
    * the 2D embedding is used to position the nodes in the plane

    This example has a fair amount of visualization-related code, as
    visualization is crucial here to display the graph. One of the challenges
    is to position the labels while minimizing overlap. For this we use a
    heuristic based on the direction of the nearest neighbor along each axis.
    """
    X = X.copy()
    if isinstance(X, pd.DataFrame):
        if isinstance(names, str):
            names = X.pop(names)
        elif names is None:
            names = X.index.values
        X = X.values.T
    elif names is None:
        names = range(X.shape[0])
    # Boolean indexing and string joining below require an array of strings
    names = np.asarray(names).astype(str)

    ###############################################################################
    # Learn a graphical structure from the correlations
    edge_model = covariance.GraphLassoCV()

    # standardize the time series: using correlations rather than covariance
    # is more efficient for structure recovery
    # X = variation.copy().T
    X /= X.std(axis=0)
    edge_model.fit(X)

    ###############################################################################
    # Cluster using affinity propagation
    _, labels = cluster.affinity_propagation(edge_model.covariance_)
    n_labels = labels.max()

    for i in range(n_labels + 1):
        print('Cluster %i: %s' % ((i + 1), ', '.join(names[labels == i])))

    ###############################################################################
    # Find a low-dimension embedding for visualization: find the best position
    # of the nodes (the variables) on a 2D plane

    # We use a dense eigen_solver to achieve reproducibility (arpack is
    # initiated with random vectors that we don't control). In addition, we
    # use a large number of neighbors to capture the large-scale structure.
    node_position_model = manifold.LocallyLinearEmbedding(
        n_components=2, eigen_solver='dense', n_neighbors=6)

    embedding = node_position_model.fit_transform(X.T).T

    ###############################################################################
    # Visualization
    plt.figure(1, facecolor='w', figsize=(10, 8))
    plt.clf()
    ax = plt.axes([0., 0., 1., 1.])
    plt.axis('off')

    # Display a graph of the partial correlations
    partial_correlations = edge_model.precision_.copy()
    d = 1 / np.sqrt(np.diag(partial_correlations))
    partial_correlations *= d
    partial_correlations *= d[:, np.newaxis]
    non_zero = (np.abs(np.triu(partial_correlations, k=1)) > 0.02)

    # Plot the nodes using the coordinates of our embedding
    plt.scatter(embedding[0], embedding[1], s=100 * d ** 2, c=labels,
                cmap=plt.cm.spectral)

    # Plot the edges
    start_idx, end_idx = np.where(non_zero)
    # a sequence of (*line0*, *line1*, *line2*), where::
    #     linen = (x0, y0), (x1, y1), ... (xm, ym)
    segments = [[embedding[:, start], embedding[:, stop]]
                for start, stop in zip(start_idx, end_idx)]
    values = np.abs(partial_correlations[non_zero])
    lc = LineCollection(segments,
                        zorder=0, cmap=plt.cm.hot_r,
                        norm=plt.Normalize(0, .7 * values.max()))
    lc.set_array(values)
    lc.set_linewidths(15 * values)
    ax.add_collection(lc)

    # Add a label to each node. The challenge here is that we want to
    # position the labels to avoid overlap with other labels
    for index, (name, label, (x, y)) in enumerate(
            zip(names, labels, embedding.T)):

        dx = x - embedding[0]
        dx[index] = 1
        dy = y - embedding[1]
        dy[index] = 1
        this_dx = dx[np.argmin(np.abs(dy))]
        this_dy = dy[np.argmin(np.abs(dx))]
        if this_dx > 0:
            horizontalalignment = 'left'
            x = x + .002
        else:
            horizontalalignment = 'right'
            x = x - .002
        if this_dy > 0:
            verticalalignment = 'bottom'
            y = y + .002
        else:
            verticalalignment = 'top'
            y = y - .002
        plt.text(x, y, name, size=10,
                 horizontalalignment=horizontalalignment,
                 verticalalignment=verticalalignment,
                 bbox=dict(facecolor='w',
                           edgecolor=plt.cm.spectral(label / float(n_labels)),
                           alpha=.6))

    plt.xlim(embedding[0].min() - .15 * embedding[0].ptp(),
             embedding[0].max() + .10 * embedding[0].ptp(),)
    plt.ylim(embedding[1].min() - .03 * embedding[1].ptp(),
             embedding[1].max() + .03 * embedding[1].ptp())

    plt.show()
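Hypothetical usage of affinity_propagation_network() on a small synthetic DataFrame; the latent-factor construction and the v0..v11 row labels are made up purely so that the graphical lasso has some correlation structure to recover.

# Hypothetical usage on synthetic data: twelve series generated from three
# latent factors so that the graphical lasso has correlated groups to find.
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
latent = rng.randn(3, 300)
signals = np.repeat(latent, 4, axis=0) + .5 * rng.randn(12, 300)

# Rows are the variables to cluster, columns the observations, so the
# DataFrame index supplies the node labels.
data = pd.DataFrame(signals, index=['v%d' % i for i in range(12)])
affinity_propagation_network(data)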