Code Example #1
def learn_number_clusters(ds, method='ap'):
    if method=='ap':
        from sklearn import cluster, covariance
#         _, labels = cluster.affinity_propagation(ds.M)
#         return labels
#     elif method=='ap2':
        edge_model = covariance.GraphLassoCV(verbose=True)
        # standardize the time series: using correlations rather than covariance
        # is more efficient for structure recovery
        X = ds.M.values.copy().T
        X /= X.std(axis=0)
        print '--- B'
        edge_model.fit(X)
        _, labels = cluster.affinity_propagation(edge_model.covariance_)
        return labels

    elif method=='rbm':
        from sklearn import neural_network
        model = neural_network.BernoulliRBM(n_components=100,
                                            #random_state=0, 
                                            #n_iter=npasses,
                                            verbose=True
                                            )
        X = model.fit_transform(ds.M)
        return X
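
For reference before the remaining examples: a minimal, self-contained sketch (synthetic data, not taken from any of the projects listed here) of the call they all build on. sklearn.cluster.affinity_propagation expects a square similarity matrix and returns the exemplar (cluster-center) indices together with one label per sample.

import numpy as np
from sklearn import cluster
from sklearn.metrics.pairwise import euclidean_distances

rng = np.random.RandomState(0)
X = np.vstack([rng.normal(0.0, 0.3, (10, 2)), rng.normal(3.0, 0.3, (10, 2))])
S = -euclidean_distances(X, squared=True)            # similarities: negative squared distances
exemplars, labels = cluster.affinity_propagation(S)  # (exemplar indices, one label per row of X)
print(exemplars, labels)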
Code Example #2
def cluster_keywords(data,min_size=2, cluster_preference=True, verbose=True):
    start_cluster_idx = 0
    clusters = np.zeros(len(data['labels']), dtype=int)

    # create graph
    label_co_occ = data['co_occ']
    g = nx.from_numpy_matrix(label_co_occ)

    if(cluster_preference):
        # define preferences
        net_sizes = np.array([np.sqrt(1+m) for m in data['label_expenses']])
        M = np.percentile(label_co_occ,75.)
        m = np.min(label_co_occ)
        sizeM = np.percentile(net_sizes,75.)
        preference = m + np.asarray([min(s,sizeM) for s in net_sizes])*((M-m)/sizeM)

    for comp in sorted(nx.connected_components(g), key=len, reverse=True):
        l = len(comp)
        if(l >= min_size):
            temp_co_occ = (label_co_occ[:,comp])[comp,:]
            if(cluster_preference):
                [n_clusters, temp_clusters] = affinity_propagation(temp_co_occ,
                                                                    preference=preference[comp],
                                                                    max_iter=500,
                                                                    convergence_iter=40)
            else:
                [n_clusters, temp_clusters] = affinity_propagation(temp_co_occ,
                                                                    max_iter=500,
                                                                    convergence_iter=40)
            for i in xrange(l):
                clusters[comp[i]] = temp_clusters[i] + start_cluster_idx
            start_cluster_idx += len(n_clusters)
            if(verbose):
                print('Found component of size ' + str(l) + ' and added ' \
                        + str(len(n_clusters)) + ' clusters')
        else:
            # only one cluster for this component
            if(verbose):
                print('Found component of size ' + str(l) + ' so do not run affinity_propagation')
            for n in comp:
                clusters[n] = start_cluster_idx            
            start_cluster_idx += 1

    return g, clusters
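
To see what the preference construction above does, here is the same arithmetic on made-up numbers (the co-occurrence matrix and expenses below are invented for illustration): each keyword's size is capped at its 75th percentile and mapped linearly onto [minimum co-occurrence, 75th percentile of co-occurrence], so larger keywords get a higher preference and are more likely to be picked as exemplars.

import numpy as np

label_co_occ = np.array([[0., 1., 4.],
                         [1., 0., 2.],
                         [4., 2., 0.]])
label_expenses = [10., 200., 50.]        # hypothetical per-keyword sizes
net_sizes = np.array([np.sqrt(1 + m) for m in label_expenses])
M = np.percentile(label_co_occ, 75.)
m = np.min(label_co_occ)
sizeM = np.percentile(net_sizes, 75.)
preference = m + np.asarray([min(s, sizeM) for s in net_sizes]) * ((M - m) / sizeM)
print(preference)                        # one preference value per keyword, in [m, M]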
Code Example #3
File: b.py Project: DvHuang/Wifi_Clust
def clust(vectorfile,matrixfile,clusted):

    fid2fname = {}
    for line in open(vectorfile) :
        line = line.strip().split('\t')
        fid2fname.setdefault(int(line[0]), line[1:])

    N = len(fid2fname)
    rowlist = []
    collist = []
    datalist = []
    for line in open(matrixfile) :
        line = line.strip().split('\t')
        if len(line) < 3 : continue
        f1, f2, sim = line[:3]
        rowlist.append(int(f1))
        collist.append(int(f2))
        datalist.append(float(sim))

    for id in fid2fname :
        rowlist.append(int(id))
        collist.append(int(id))
        datalist.append(1.0)

    row = np.array(rowlist)
    col = np.array(collist)
    data = np.array(datalist)
    graph = coo_matrix((data, (row, col)), shape=(N, N))

    ###############################################################################

    # Force the solver to be arpack, since amg is numerically
    # unstable on this example
    # labels = spectral_clustering(graph, n_clusters=160, eigen_solver='arpack')

    _, labels = cluster.affinity_propagation(graph)
    n_labels = labels.max()
    print "nlabels:", n_labels

    # for i in range(n_labels + 1):
    #     print('Cluster %i: %s' % ((i + 1), ', '.join(names[labels == i])))

    cluster2fid = {}
    for index, lab in enumerate(labels) :
        cluster2fid.setdefault(lab, [])
        cluster2fid[lab].append(index)

    normal_data = open("normal-data.txt", 'w')
    easy_data=open("easy-data-500.txt", 'w')
    for index, lab in enumerate(cluster2fid) :
        for fid in cluster2fid[lab] :
            strx=""
            for i in range(0, len(fid2fname[fid])):
                strx+=str(fid2fname[fid][i])+"\t"
            print >> normal_data, strx+'\t'+str(index)
            print >> easy_data, strx+'\t'+str(fid)+'\t'+str(index)

    normal_data.close()
    easy_data.close()
Code Example #4
def affinityprop(correlations,names):
    n_clusters=0
    a,labels = cluster.affinity_propagation(correlations)
    #print labels
    print "Affinity Propagation Clusters"
    for i in range(labels.max()+1):
        print 'Cluster %i: %s' % ((i+1),
                              ', '.join(names[labels==i]))
        if len(names[labels==i]) > 1:
            n_clusters+=1
    print "Number of Clusters with more than 1 element: " + str(n_clusters)
    return n_clusters
Code Example #5
File: test_cluster.py Project: Sandy4321/pandas-ml
    def test_affinity_propagation(self):
        iris = datasets.load_iris()
        similality = np.cov(iris.data)
        df = pdml.ModelFrame(similality)

        result = df.cluster.affinity_propagation()
        expected = cluster.affinity_propagation(similality)

        self.assertEqual(len(result), 2)
        self.assert_numpy_array_almost_equal(result[0], expected[0])

        self.assertTrue(isinstance(result[1], pdml.ModelSeries))
        self.assert_index_equal(result[1].index, df.index)
        self.assert_numpy_array_equal(result[1].values, expected[1])
Code Example #6
def discover_clusters(var):
    from sklearn import cluster, covariance
    # Learn a graphical structure from the correlations
    edge_model = covariance.GraphLassoCV()
    edge_model.fit(var)
    
    # Cluster using affinity propagation
    _, labels = cluster.affinity_propagation(edge_model.covariance_)
    n_labels = labels.max()
    for i in xrange(n_labels + 1):
        print 'Cluster %i: %s' % (i, \
            ', '.join(var.columns[labels == i]))
    del cluster, covariance
    
    return labels, edge_model.precision_.copy()
Code Example #7
 def makeClusters(self, toks, cnts, rootName = 'ROOT',saveSimMat=None):
     '''
     '''
     S = self.w2v.getSimMat(toks)
     
     if(saveSimMat):
         file_iter=open(saveSimMat,"wb")
         logger=file_iter.write
         logger("word|%s\n"%("|".join(toks)))
         for ind,tok in enumerate(toks): 
             list_toks=[str(round(k,3)) for k in S[ind]]
             str_join="|".join((tok,"|".join((list_toks))))
             logger("%s\n"%str_join)
         file_iter.close()
     
     ctable = []
     n = len(toks)
     x = range(n)
     ntoks = np.array(toks)
     ncnts = np.array(cnts, dtype='float')
     ncnts = ncnts/ncnts.sum()    
     k = 0
     while (True):
         Sk = S[np.ix_(x,x)]
         ntoks = ntoks[x]
         ncnts = ncnts[x]    
         xk, labels = cluster.affinity_propagation(Sk) 
         n_labels = labels.max()
         for i in xrange(n_labels + 1):
             cidx = labels == i
             ctoks = ntoks[cidx]
             ccnts = ncnts[cidx]
             pidx = ccnts.argsort()[::-1][0]
             cname = ntoks[xk[i]] #cluster center
             clname = ctoks[pidx] #most frequent node in cluster
             #temp = {'LEVEL':k, 'CLUSTER': (i+1), 'CENTER': cname, 'NAME': ' '.join(clname[:-2].split('_NG_')), 'MEMBERS': ctoks}
             temp = {'LEVEL':k, 'CLUSTER': (i+1), 'CENTER': cname, 'NAME': ' '.join(cname[:-2].split('_NG_')), 'MEMBERS': ctoks}
             ctable.append(temp)
         k+=1
         #break
         x = xk
         if len(xk) <= 3:
             break
         
     self.ctable = ctable
     self.G = self.ctable2G_()
             
     return ctable
Code Example #8
File: pack_cluster.py Project: TinyOS-Camp/DDEA-DEV
def cluster_measurement_points(m_matrix, m_name, corr_bnd = [0.1,0.9],alg='aff'):
    exemplars_dict = dict()

    if m_matrix.shape[1] == 0:
        return [], exemplars_dict, [], []

    elif m_matrix.shape[1] == 1:
        exemplars_ = [0]
        labels_= [0]
        exemplars_name = m_name

    else:
        distmat_input = find_norm_dist_matrix(m_matrix)

        # Find representative set of sensor measurements 
        min_dist_ = np.sqrt(2*(1-(corr_bnd[1])))
        max_dist_ = np.sqrt(2*(1-(corr_bnd[0])))

        if alg == 'pack':
            log.info('use pack clustering algorithm')
            exemplars_, labels_ = max_pack_cluster(distmat_input, min_dist=min_dist_, max_dist=max_dist_)
        else:
            log.info('use affinity clustering algorithm')
            SIMM_MAT = 2 - distmat_input
            exemplars_, labels_ = cluster.affinity_propagation(SIMM_MAT, damping=0.5)

        num_clusters = int(labels_.max()+1)
        log.info('-' * 40)
        log.info(str(num_clusters) + ' clusters out of ' + str(len(labels_)) + ' measurements')
        log.info('-' * 40)

        validity, intra_dist, inter_dist = compute_cluster_err(distmat_input, labels_)

        log.info('validity: ' + str(round(validity,2)) + ', intra_dist: ' +
                 str(np.round(intra_dist,2)) + ', inter_dist: ' +
                 str(np.round(inter_dist,2)))
        log.info('-' * 40)
        exemplars_name = list(np.array(m_name)[exemplars_])
    
    for label_id, (m_idx,exemplar_label) in enumerate(zip(exemplars_, exemplars_name)):
        log.info(str(exemplar_label))
        children_set = list(set(np.nonzero(labels_ == label_id)[0]) - set([m_idx]))
        log.info('Label ' + str(label_id) + ' : ' + str(m_idx) + '<--' + str(children_set) )
        exemplars_dict.update({exemplar_label : list(np.array(m_name)[children_set])})

    return m_matrix[:, exemplars_], exemplars_dict, exemplars_, labels_
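
A quick check of the correlation-to-distance mapping used above, assuming find_norm_dist_matrix returns distances of unit-variance signals scaled so that dist = sqrt(2*(1 - corr)): with the default corr_bnd = [0.1, 0.9] the bounds work out to roughly 0.45 and 1.34.

import numpy as np

corr_bnd = [0.1, 0.9]
min_dist_ = np.sqrt(2 * (1 - corr_bnd[1]))   # ~0.447: pairs more correlated than 0.9
max_dist_ = np.sqrt(2 * (1 - corr_bnd[0]))   # ~1.342: pairs less correlated than 0.1
print(round(min_dist_, 3), round(max_dist_, 3))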
Code Example #9
def cluster_affinity_propagation(data, memlim=10):
    """Cluster data (rows = coordinates; columns = observations)
    with the affinity propagation algorithm from scikit learn.
    memlim:  memory limit in GB
    """ 
    ndim, nsamples = data.shape
    if nsamples**2*8.0/1024**3 > memlim:
        print("Distance matrix would be larger than {} GB. Aborting.".format(memlim))
        sys.exit(1)

    # compute distance matrix
    dist = np.zeros([nsamples, nsamples])
    for i in range(nsamples):
        dist[:,i] = compute_distances(data, i)
        dist[i,:] = dist[:,i]

    centers, featureTrj = skc.affinity_propagation(dist, copy=False, damping=0.5)
    nclusters = centers.shape[0]
    populations = Counter(featureTrj)
#    print(populations)
#    print(centers)
#    print(featureTrj)

    # sort clusters by size
    popsunsrt = np.array([int(i) for i in populations.values()]) / (1.0 * nsamples)
    indxunsrt = np.array(list(populations.keys()))   # list() so this also works on Python 3 dict views
    indxsrt   = np.argsort(popsunsrt)[::-1]
    popsrt    = popsunsrt[indxsrt]
    centersrt = centers[indxsrt]

    # renumber featureTrj
    featureTrjsrt = np.zeros_like(featureTrj, dtype=int)
    clusters = []
    for c in range(nclusters):
        where = featureTrj == c
        featureTrjsrt[where] = indxsrt[c]
        clusters.append(set(np.where(featureTrjsrt == c)[0]))

#    return popsrt, clusters, centersrt, featureTrjsrt
    return populations, clusters, centers, featureTrj
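
The memory guard at the top of the function is just the size of a dense float64 matrix, n*n*8 bytes. As a worked example with a made-up sample count, 40,000 samples already exceed a 10 GB limit:

nsamples = 40000
print(nsamples ** 2 * 8.0 / 1024 ** 3)   # ~11.9 GB for the full distance matrix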
Code Example #10
def corr_cluster(corr_file, matrix_file, cluster_dir):
    corr_frame = pa.DataFrame.from_csv(corr_file).abs()
    #use the abs value of corr to cluster
    freq_matrix = pa.DataFrame.from_csv(matrix_file)

    _, labels = cluster.affinity_propagation(corr_frame)
    n_label = labels.max()
    names = corr_frame.index
    #output cluster file
    if corr_file[-1] == "/":
        cluster_file = corr_file.split("/")[-2] + "_cluster"
    else:
        cluster_file = corr_file.split("/")[-1] + "_cluster"

    cluster_file = codecs.open(os.path.join(cluster_dir, cluster_file),
                               "w",
                               encoding="utf-8")

    for i in range(n_label + 1):
        #compute the average correlation between cluster and out-cluster
        clus = np.array(names[labels == i])
        in_cluster = corr_frame[clus].ix[clus]
        up_index = np.triu_indices(len(clus), 1)
        aver_corr = np.array(in_cluster)[up_index].mean()

        out_clus = np.array(names[labels != i])
        out_cluster = corr_frame[clus].ix[out_clus]
        out_aver_corr = np.array(out_cluster).mean()

        #compute the variance of the entity
        in_aver_var = freq_matrix[clus].var(axis=1).mean()

        obj = {"cluster": map(str, clus), "in_aver_corr": aver_corr,
               "out_aver_corr": out_aver_corr,
               "in_aver_var": in_aver_var}
        cluster_file.write(json.dumps(obj, ensure_ascii=False) + "\n")

    cluster_file.flush()
    cluster_file.close()
Code Example #11
File: pack_cluster.py Project: TinyOS-Camp/DDEA-DEV
def cluster_measurement_points(m_matrix,m_name,corr_bnd=[0.1,0.9],alg='aff'):
    exemplars_dict={}    
    if m_matrix.shape[1]==0:
        return [],exemplars_dict,[],[]
    elif m_matrix.shape[1]==1:
        exemplars_=[0]
        labels_=[0]
        exemplars_name=m_name
    else:
        distmat_input=find_norm_dist_matrix(m_matrix)
        # Find representative set of sensor measurements 
        min_dist_=np.sqrt(2*(1-(corr_bnd[1])))
        max_dist_=np.sqrt(2*(1-(corr_bnd[0])))
        if alg=='pack':
            print 'use pack clustering algorithm'
            exemplars_,labels_=max_pack_cluster(distmat_input,min_dist=min_dist_,max_dist=max_dist_)
        else:
            print 'use affinity clustering algorithm'
            SIMM_MAT=2-distmat_input
            exemplars_,labels_=cluster.affinity_propagation(SIMM_MAT,damping=0.5)

        
        num_clusters=int(labels_.max()+1)
        print '-------------------------------------------------------------------------'
        print num_clusters, 'clusters out of ', len(labels_), 'measurements'
        print '-------------------------------------------------------------------------'
        validity,intra_dist,inter_dist=compute_cluster_err(distmat_input,labels_)
        print 'validity:',round(validity,2),', intra_dist: ',np.round(intra_dist,2),', inter_dist: ',np.round(inter_dist,2)
        print '-------------------------------------------------------------------------'
        exemplars_name=list(np.array(m_name)[exemplars_])
    
    for label_id,(m_idx,exemplar_label) in enumerate(zip(exemplars_,exemplars_name)):
        print exemplar_label
        children_set=list(set(np.nonzero(labels_==label_id)[0])-set([m_idx]))
        print 'Label ', label_id, ': ',m_idx,'<--', children_set
        exemplars_dict.update({exemplar_label:list(np.array(m_name)[children_set])})
    return m_matrix[:,exemplars_], exemplars_dict,exemplars_,labels_
Code Example #12

# #############################################################################
# Learn a graphical structure from the correlations
edge_model = covariance.GraphicalLassoCV(cv=5)

# standardize the time series: using correlations rather than covariance
# is more efficient for structure recovery
X = variation.copy().T
X /= X.std(axis=0)
edge_model.fit(X)

# #############################################################################
# Cluster using affinity propagation

_, labels = cluster.affinity_propagation(edge_model.covariance_)
n_labels = labels.max()

for i in range(n_labels + 1):
    print('Cluster %i: %s' % ((i + 1), ', '.join(names[labels == i])))

# #############################################################################
# Find a low-dimension embedding for visualization: find the best position of
# the nodes (the stocks) on a 2D plane

# We use a dense eigen_solver to achieve reproducibility (arpack is
# initiated with random vectors that we don't control). In addition, we
# use a large number of neighbors to capture the large-scale structure.
node_position_model = manifold.LocallyLinearEmbedding(
    n_components=2, eigen_solver='dense', n_neighbors=6)
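
The snippet stops at constructing the embedding model; the complete versions of this pipeline further down this page (Code Examples #15, #18 and #22) apply it to the standardized series like this:

embedding = node_position_model.fit_transform(X.T).T   # 2 coordinates per stock for plotting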
Code Example #13
def affProp(instance_path, res_folder, strategy = 2):
	instances = instance_path.rsplit('/', 1)[0] + '/'
	file = instance_path.rsplit('/', 1)[1]
	input_type  = '.' + file.rsplit('.', 1)[1]
	file = file.rsplit('.', 1)[0]
	data, row_names = parse.read(instances + file + input_type)
	print 'Size of data matrix: ', data.shape
	if len(data) <> len(row_names):
		print 'Af prop error: data and row_names have diff. lens', len(data), len(row_names)	
	#save_matrix_fig(data, res_folder, file+'_in')
	sim_matrix = []
	# 	
	try:
		sim_matrix = np.load(res_folder+file+'_sim'+str(strategy)+'.npy')
		print 'Sim matrix %s found.' %(res_folder+file+'_sim'+str(strategy)+'.npy')
	except:
		print 'Sim matrix %s NOT found!!!!' %(res_folder+file+'_sim'+str(strategy)+'.npy')
		sim_matrix = pp.strategy(data, 'sim',strategy)	
		np.save(res_folder+file+'_sim'+str(strategy), sim_matrix)

	old_n_clusters = 0
	old_non_clustered = 0

	# list to save labels from all iterations, so we can later pick the best clustering
	res_from_diff_params = {}
	nr_clusters_from_diff_params = {}
	non_clustered_from_diff_params = {}
	distribution_from_diff_params = {}
	best_iteration = -1
	sec_best_iteration = -1
	n = sim_matrix.shape[0]
	min_non_clusterd = n
	s_min_non_clusterd = n
	max_std_dev = n
	sec_threshold = 0.0001
	n_iterations = 20  		# must be an odd number 

	sim_matrix[sim_matrix == 0] = -1e10
	#min_preferance = 0
	#min_preferance *= np.max(sim_matrix[sim_matrix > 0])
	min_preferance = np.min(sim_matrix[sim_matrix > 0]) -10
	max_preferance = np.median(sim_matrix[sim_matrix > 0])
	print 'min_preferance, ', min_preferance
	print 'max_preferance, ', max_preferance
	
	if min_preferance > max_preferance:
		raise Exception('Something is wrong with preferance setting: %d %d' % (min_preferance, max_preferance))
	elif min_preferance == max_preferance:
		n_iterations = 1
		pref_list = [min_preferance]
	
	pref_step = (max_preferance-min_preferance) / n_iterations

	# cluster the data with affinity propagation over a range of preferences ----------
	for iteration in range(n_iterations):
		
		if iteration == 0:
			preference = min_preferance
		else:
			preference += pref_step
		labels = []
		print '_______________________________________________________'
		print 'Aff. Prop. with preferance =', preference
		
		_, labels = affinity_propagation(sim_matrix, preference=preference)
		n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
		
		num_per_cluster = {}
		for i in range(n_clusters):
			num_per_cluster[i] = 0

		for label in labels:
			for i in range(n_clusters):
				if label == i:
					num_per_cluster[i] += 1; 


		# TODO: criteria for skiping or breaking the loop ---------------------------------------------
		# skip the iteration if the number of clusters is as before
		if iteration == 0:
			old_n_clusters = n_clusters
		#elif n_clusters >= old_n_clusters:
		#	break
		old_n_clusters = n_clusters
		# increase the preferance 
		if n_clusters == 1:
			print 'DEBUG: Aff prop. n_clusters == 1, going to next iteration'
			min_preferance = preference
			max_preferance += (max_preferance - min_preferance) / 2
			pref_step = (max_preferance-min_preferance) / (n_iterations-iteration)
			print 'min = %f, max = %f, step = %f' %(min_preferance, max_preferance, pref_step)
			continue
		# lower the preferance	
		if n_clusters >= 0.1*n:
			print 'DEBUG: Aff prop. n_clusters = %i, TOO HIGH!!!' %n_clusters
			max_preferance = preference
			min_preferance = preference - pref_step
			pref_step = (max_preferance-min_preferance) / (n_iterations-iteration)
			print 'min = %f, max = %f, step = %f' %(min_preferance, max_preferance, pref_step)
			continue	
		# ---------------------------------------------------------------------------------------
		# display some information
		print 'Estimated number of clusters: ',  n_clusters		
		print 'Number of points per cluster: ', num_per_cluster
		#draw(A=sim_matrix, colors=labels)
		# ---------------------------------------------------------------------------------------
		sorted_data, sotred_labels, sorted_names, column_labels = sort_matrix(data, labels, row_names)
		#print 'DEBUG:'
		#print 'column_labels = ', column_labels
		#print 'sotred_labels = ', sotred_labels
		#save_matrix_fig(sorted_data, res_folder, file + '_B_dec' +  str(iteration))

		# pull down the points which have non-zero value that colides with points from other clusters
		sorted_data2, sotred_labels2, sorted_names2 = postp.remove_colision_points2(sorted_data, 
																		sotred_labels, sorted_names, column_labels)

		num_per_cluster = {}
		n_clusters = len(set(sotred_labels2)) - (1 if -1 in sotred_labels2 else 0)
		if -1 in sotred_labels2:
			all_clusters_list = range(-1, n_clusters)
		else:
			all_clusters_list = range(n_clusters)

		for i in all_clusters_list:
			num_per_cluster[i] = 0

		for label in sotred_labels2:
			for i in all_clusters_list:
				if label == i:
					num_per_cluster[i] += 1; 
		non_clustered = 0;			
		for label in sotred_labels2:
			if label == -1:
				non_clustered += 1	
		print 'Estimated number of clusters after removal: ',  n_clusters
		print 'Number of points per cluster after removal: ', num_per_cluster
		print 'Number of non clustered points after removal:', non_clustered
		if 0 in num_per_cluster.values():
			print 'TIME TO DEBUG:'
			print 'sotred_labels2 = ', sotred_labels2

		# save picture of end matrix
		#save_matrix_fig(sorted_data2, res_folder, file + '_A_dec' +  str(iteration))
		#if res2_folder <> 'none':
			#save_matrix_fig(sorted_data2, res2_folder, file + '_A_dec' +  str(iteration))
		# find the best iteration, so we only save the best one --------------------------
		label_name_pairs = zip(sotred_labels2, sorted_names2)
		if non_clustered < min_non_clusterd:
			res_from_diff_params[iteration] = label_name_pairs
			nr_clusters_from_diff_params[iteration] = n_clusters
			non_clustered_from_diff_params[iteration] = non_clustered
			distribution_from_diff_params[iteration] = num_per_cluster
			min_non_clusterd = non_clustered
			if n_clusters > 1:
				second_best = iteration
			best_iteration = iteration
			print 'this is best iteration currently'
		
		# find the best iteration (according variance of cluster sizes), ----------------
		# so we only save the best one 
		temp_num_per_cluster = num_per_cluster.copy()
		if -1 in temp_num_per_cluster.keys():
			del temp_num_per_cluster[-1]
		if len(temp_num_per_cluster.values()) > 1:
			std_dev = np.std(temp_num_per_cluster.values())	
			mean = np.mean(temp_num_per_cluster.values())	
			rel_std_dev = std_dev / mean
			rel_std_dev *= pow(non_clustered/n, 2)
			print 'DEBUG: adjusted rel_std_dev = ', rel_std_dev
			std_dev = rel_std_dev
			# we accept the iteration if adjusted rel_std_dev is smaller, or 
			# if it is within the threshold and number of nonclustered points is smaller
			if (std_dev - max_std_dev) <= sec_threshold and non_clustered < s_min_non_clusterd:
				sec_criteria_fulfiled = True
			else:
				sec_criteria_fulfiled = False
			if std_dev < max_std_dev or sec_criteria_fulfiled:
				res_from_diff_params[iteration] = label_name_pairs
				nr_clusters_from_diff_params[iteration] = n_clusters
				non_clustered_from_diff_params[iteration] = non_clustered
				distribution_from_diff_params[iteration] = num_per_cluster
				max_std_dev = std_dev
				s_min_non_clusterd = non_clustered
				sec_best_iteration = iteration
				print 'this is second best iteration currently'		
		# ----------------------------------------------------------------------------------
		print '_______________________________________________________'
				

	best_found = False	
	best_n_clusters = 0
	best_non_clusterd = data.shape[0]
	best_distro = {-1:data.shape[0]}
	best_dec = ''	# name of dec file for best iteration

	s_best_found = False
	s_best_n_clusters = 0
	s_best_non_clusterd = data.shape[0]
	s_best_distro = {-1:data.shape[0]}
	s_dec = ''		# name of dec file for second best iteration

	# save .dec from best iteration
	print 'best_iteration= ', best_iteration
	print 'sec best iteration = ', sec_best_iteration
	if best_iteration >= 0:
		best_found = True
		best_n_clusters = nr_clusters_from_diff_params[best_iteration]
		best_non_clusterd = non_clustered_from_diff_params[best_iteration]
		best_distro = distribution_from_diff_params[best_iteration]
		best_dec = file + '_affProp_' + str(best_n_clusters) + '_' + str(best_non_clusterd)
		dec.write(path = res_folder, filename = best_dec, label_name_pairs = res_from_diff_params[best_iteration])
		print '.dec file %s for iteration %i saved.' %(res_folder+best_dec, best_iteration)
	if sec_best_iteration >= 0:
		if sec_best_iteration <> best_iteration:
			s_best_found = True
		s_best_n_clusters = nr_clusters_from_diff_params[sec_best_iteration]
		s_best_non_clusterd = non_clustered_from_diff_params[sec_best_iteration]
		s_best_distro = distribution_from_diff_params[sec_best_iteration]
		s_dec = file + '_affPropSTD_' + str(s_best_n_clusters) + '_' + str(s_best_non_clusterd)
		dec.write(path = res_folder, filename = s_dec, label_name_pairs = res_from_diff_params[sec_best_iteration])
		print '.dec file %s for iteration %i saved.' %(res_folder+s_dec, sec_best_iteration)	
	print '_______________________________________________________'
	print '_______________________________________________________'	
	gc.collect()
	return best_found, best_n_clusters, best_non_clusterd, best_distro, best_dec, data.shape[0], \
			s_best_found, s_best_n_clusters, s_best_non_clusterd, s_best_distro, s_dec
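
The preference search above relies on the fact that the preference input (the self-similarity s(k, k)) controls how many exemplars affinity propagation selects: lower preferences yield fewer clusters, higher preferences yield more. A small sketch on synthetic data (not the instance files this function reads) to see that effect:

import numpy as np
from sklearn.cluster import affinity_propagation
from sklearn.metrics.pairwise import euclidean_distances

rng = np.random.RandomState(0)
X = rng.randn(80, 3)
S = -euclidean_distances(X, squared=True)
for pref in (S.min(), np.median(S), S[S < 0].max()):
    _, labels = affinity_propagation(S, preference=pref, max_iter=500)
    print(float(pref), int(labels.max()) + 1)   # preference vs. number of clusters found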
Code Example #14
File: inv_cov.py Project: vsmolyakov/fin
    gl_prec = graph.precision_
    gl_alphas =graph.cv_alphas_
    gl_scores = np.mean(graph.grid_scores, axis=1)

    plt.figure()        
    sns.heatmap(gl_prec)
    
    plt.figure()    
    plt.plot(gl_alphas, gl_scores, marker='o', color='b', lw=2.0, label='GraphLassoCV')
    plt.title("Graph Lasso Alpha Selection")
    plt.xlabel("alpha")
    plt.ylabel("score")
    plt.legend()
    
    #cluster using affinity propagation
    _, labels = cluster.affinity_propagation(gl_cov)
    num_labels = np.max(labels)
    
    for i in range(num_labels+1):
        print("Cluster %i: %s" %((i+1), ', '.join(names[labels==i])))
    
    #find a low dim embedding for visualization
    node_model = manifold.LocallyLinearEmbedding(n_components=2, n_neighbors=6, eigen_solver='dense')
    embedding = node_model.fit_transform(X.T).T
    
    #generate plots
    plt.figure()
    plt.clf()
    ax = plt.axes([0.,0.,1.,1.])
    plt.axis('off')
    
Code Example #15
File: stock_cluster.py Project: DonaldPG/PyTAAA
def dailyStockClusters():
    import datetime
    import os
    import numpy as np
    import pandas.io.data as web
    from pandas import DataFrame
    from matplotlib import pylab as pl
    from matplotlib import finance
    from matplotlib.collections import LineCollection
    
    from sklearn import cluster, covariance, manifold
    ########################################################################
    ###
    ### This example employs several unsupervised learning techniques to 
    ### extract the stock market structure from variations in historical quotes.
    ### The quantity that we use is the daily variation in quote price: 
    ### quotes that are linked tend to co-fluctuate during a day.
    ###
    ### stocks used are all Nasdaq 100 stocks that have one year of history
    ### from the current date.
    ###
    ### adopted from example at:
    ### http://scikit-learn.org/0.14/auto_examples/applications/plot_stock_market.html
    ###
    ########################################################################
    # Retrieve the data from Internet
    
    # Use the most recent year of quotes: d1 is one year ago, d2 is today
    today = datetime.datetime.now()
    d1 = datetime.datetime(today.year-1, today.month, today.day)
    d2 = datetime.datetime(today.year, today.month, today.day)
    
    # input symbols and company names from text file
    companyName_file = os.path.join( os.getcwd(), "symbols",  "companyNames.txt" )
    with open( companyName_file, "r" ) as f:
        companyNames = f.read()
    
    print "\n\n\n"
    companyNames = companyNames.split("\n")
    ii = companyNames.index("")
    del companyNames[ii]
    companySymbolList  = []
    companyNameList = []
    symbol_dict = {}
    for iname,name in enumerate(companyNames):
        name = name.replace("amp;", "")
        testsymbol, testcompanyName = name.split(";")
        companySymbolList.append(format(testsymbol,'s'))
        companyNameList.append(format(testcompanyName,'s'))
        if testsymbol != "CASH":
            symbol_dict[ testsymbol ] = format(testcompanyName,'s')
    print " ... symbol_dict = ", symbol_dict
    
    
    symbols = companySymbolList[:]
    names = companyNameList[:]
    
                       
    all_data = {}
    for ticker in symbols:
        try:
            all_data[ticker] = web.get_data_yahoo(ticker, d1, d2)
            qclose = DataFrame({tic: data['Close']
                        for tic, data in all_data.iteritems()})
            qopen = DataFrame({tic: data['Open']
                        for tic, data in all_data.iteritems()})
        except:
            print "Cant find ", ticker
    
    symbols_edit = []
    names_edit = []
    for i, ticker in enumerate( symbols ):
        if True in np.isnan(np.array(qclose[ticker])).tolist():
            print ticker, " nans found, ticker removed"
            del qclose[ticker]
            del qopen[ticker]
        else:
            symbols_edit.append(ticker)
            names_edit.append( names[i] )
    
    # The daily variations of the quotes are what carry most information
    variation = qclose - qopen
    variation[ np.isnan(variation) ] = 0.
    
    
    ###############################################################################
    # Learn a graphical structure from the correlations
    edge_model = covariance.GraphLassoCV()
    
    # standardize the time series: using correlations rather than covariance
    # is more efficient for structure recovery
    X = variation.copy()
    #X = variation.copy().T
    X /= X.std(axis=0)
    edge_model.fit(X)
    
    ###############################################################################
    # Cluster using affinity propagation
    
    _, labels = cluster.affinity_propagation(edge_model.covariance_)
    n_labels = labels.max()
    
    for i in range(n_labels + 1):
        print "Cluster "+str(i)+":"
        for j in range(len(labels)):
            if labels[j] == i:
                print " ... "+names_edit[j]
        #print('Cluster %i: %s' % ((i + 1), ', '.join(names_edit[labels == i])))

    for i in range(n_labels + 1):
        print "Cluster "+str(i)+":"
        for j in range(len(labels)):
            if labels[j] == i:
                print " ... "+names_edit[j]
                
    figure7path = 'Clustered_companyNames.png'  # re-set to name without full path
    figure7_htmlText = "\n<br><h3>Daily stock clustering analysis. Based on one year performance correlations.</h3>\n"
    figure7_htmlText = figure7_htmlText + "\nClustering based on daily variation in Nasdaq 100 quotes.\n"
    figure7_htmlText = figure7_htmlText + '''<br><img src="'''+figure7path+'''" alt="PyTAAA by DonaldPG" width="850" height="500"><br>\n'''

        
    ###############################################################################
    # Find a low-dimension embedding for visualization: find the best position of
    # the nodes (the stocks) on a 2D plane
    
    # We use a dense eigen_solver to achieve reproducibility (arpack is
    # initiated with random vectors that we don't control). In addition, we
    # use a large number of neighbors to capture the large-scale structure.
    node_position_model = manifold.LocallyLinearEmbedding(
        n_components=2, eigen_solver='dense', n_neighbors=6)
    
    embedding = node_position_model.fit_transform(X.T).T
    
    ###############################################################################
    # Visualization
    pl.figure(1, facecolor='w', figsize=(10, 8))
    pl.clf()
    ax = pl.axes([0., 0., 1., 1.])
    pl.axis('off')
    
    # Display a graph of the partial correlations
    partial_correlations = edge_model.precision_.copy()
    d = 1 / np.sqrt(np.diag(partial_correlations))
    partial_correlations *= d
    partial_correlations *= d[:, np.newaxis]
    non_zero = (np.abs(np.triu(partial_correlations, k=1)) > 0.02)
    
    # Plot the nodes using the coordinates of our embedding
    pl.scatter(embedding[0], embedding[1], s=100 * d ** 2, c=labels,
               cmap=pl.cm.spectral)
    
    # Plot the edges
    start_idx, end_idx = np.where(non_zero)
    #a sequence of (*line0*, *line1*, *line2*), where::
    #            linen = (x0, y0), (x1, y1), ... (xm, ym)
    segments = [[embedding[:, start], embedding[:, stop]]
                for start, stop in zip(start_idx, end_idx)]
    values = np.abs(partial_correlations[non_zero])
    lc = LineCollection(segments,
                        zorder=0, cmap=pl.cm.hot_r,
                        norm=pl.Normalize(0, .7 * values.max()))
    lc.set_array(values)
    lc.set_linewidths(15 * values)
    ax.add_collection(lc)
    
    # Add a label to each node. The challenge here is that we want to
    # position the labels to avoid overlap with other labels
    for index, (name, label, (x, y)) in enumerate(
            zip(names_edit, labels, embedding.T)):  # names_edit matches the tickers kept in labels/embedding
    
        dx = x - embedding[0]
        dx[index] = 1
        dy = y - embedding[1]
        dy[index] = 1
        this_dx = dx[np.argmin(np.abs(dy))]
        this_dy = dy[np.argmin(np.abs(dx))]
        if this_dx > 0:
            horizontalalignment = 'left'
            x = x + .002
        else:
            horizontalalignment = 'right'
            x = x - .002
        if this_dy > 0:
            verticalalignment = 'bottom'
            y = y + .002
        else:
            verticalalignment = 'top'
            y = y - .002
        pl.text(x, y, name, size=10,
                horizontalalignment=horizontalalignment,
                verticalalignment=verticalalignment,
                bbox=dict(facecolor='w',
                          edgecolor=pl.cm.spectral(label / float(n_labels)),
                          alpha=.6))
    
    pl.xlim(embedding[0].min() - .15 * embedding[0].ptp(),
            embedding[0].max() + .10 * embedding[0].ptp(),)
    pl.ylim(embedding[1].min() - .03 * embedding[1].ptp(),
            embedding[1].max() + .03 * embedding[1].ptp())
    
    pl.savefig( os.path.join( os.getcwd(), "pyTAAA_web",  "Clustered_companyNames.png" ), format='png' )
    
    return figure7_htmlText
Code Example #16
File: pack_cluster.py Project: TinyOS-Camp/DDEA-DEV
def CLUSTERING_TEST(distmat_input,min_corr=0.1,max_corr=0.9):
    ################################################################################
    # Unsupervised clustering for sensors given the normalized euclidean distance
    # of sensor data
    # Find only a few representative sensors out of many sensors
    ################################################################################
    # exemplars are a set of representative signals for each cluster
    # Smaller damping input will generate more clusters, default is 0.5
    # 0.5 <= damping <= 0.99
    ################################################################################
    print '==========================================================='
    print 'Clustering Test'
    print '==========================================================='
    print 'Pack Clustering'
    print '---------------------------'
    min_dist_=np.sqrt(2*(1-(max_corr)))
    max_dist_=np.sqrt(2*(1-(min_corr)))
    pack_exemplars,pack_labels=max_pack_cluster(distmat_input,min_dist=min_dist_,max_dist=max_dist_)
    pack_num_clusters=int(pack_labels.max()+1)
    print '-------------------------------------------------------------------------'
    print pack_num_clusters, 'clusters out of ', len(pack_labels), 'measurements'
    print '-------------------------------------------------------------------------'
    validity,intra_dist,inter_dist=compute_cluster_err(distmat_input,pack_labels)
    print 'validity:',round(validity,2),', intra_dist: ',np.round(intra_dist,2),', inter_dist: ',np.round(inter_dist,2)
    print '-------------------------------------------------------------------------'
    
    
    max_num_clusters=pack_num_clusters   
    print 'Hierarchical Clustering'
    print '---------------------------'
    ward_validity_log=[];
    ward_intra_dist_log=[];
    ward_inter_dist_log=[];
    ward_num_clusters_log=[]
    for k in range(2,max_num_clusters+1):
        start_time = time.time()
        ward = Ward(n_clusters=k).fit(distmat_input.T)
        exec_time=time.time() - start_time
        print exec_time, ' secs'
        ward_labels=ward.labels_
        ward_validity,ward_intra_dist,ward_inter_dist=compute_cluster_err(distmat_input,ward_labels)
        ward_num_clusters=int(ward_labels.max()+1)
        ward_validity_log.append(ward_validity);
        ward_intra_dist_log.append(list(ward_intra_dist));
        ward_inter_dist_log.append(list(ward_inter_dist));
        ward_num_clusters_log.append(ward_num_clusters)
    ward_intra_dist_log=np.array(ward_intra_dist_log);
    ward_inter_dist_log=np.array(ward_inter_dist_log)
    
    

    print 'K-Mean Clustering'
    print '---------------------------'
    kmean_validity_log=[];
    kmean_intra_dist_log=[];
    kmean_inter_dist_log=[];
    kmean_num_clusters_log=[]
    for k in range(2,max_num_clusters+1):
        start_time = time.time()
        kmean=KMeans(n_clusters=k).fit(distmat_input.T)
        exec_time=time.time() - start_time
        print exec_time, ' secs'
        kmean_labels=kmean.labels_
        kmean_validity,kmean_intra_dist,kmean_inter_dist=compute_cluster_err(distmat_input,kmean_labels)
        kmean_num_clusters=int(kmean_labels.max()+1)
        kmean_validity_log.append(kmean_validity);
        kmean_intra_dist_log.append(list(kmean_intra_dist));
        kmean_inter_dist_log.append(list(kmean_inter_dist));
        kmean_num_clusters_log.append(kmean_num_clusters)

    kmean_intra_dist_log=np.array(kmean_intra_dist_log);
    kmean_inter_dist_log=np.array(kmean_inter_dist_log)
    
    
    
    print 'Affinity Clustering'
    print '---------------------------'
    SIMM_MAT=2-distmat_input
    start_time = time.time()
    aff_exemplars, aff_labels = cluster.affinity_propagation(SIMM_MAT,damping=0.5)
    exec_time=time.time() - start_time
    print exec_time, ' secs'
    aff_num_clusters=int(aff_labels.max()+1)
    aff_validity,aff_intra_dist,aff_inter_dist=compute_cluster_err(distmat_input,aff_labels)
    
    
    fig = plt.figure('Intra_dist')
    fig.suptitle('Intra_dist')
    plot(pack_num_clusters,intra_dist[0],'s',label='pack')
    plot(pack_num_clusters,intra_dist[1],'s',label='pack')
    plot(pack_num_clusters,intra_dist[2],'s',label='pack')
    plot(ward_num_clusters_log,ward_intra_dist_log[:,0],'-+',label='ward')
    plot(ward_num_clusters_log,ward_intra_dist_log[:,1],'-+',label='ward')
    plot(ward_num_clusters_log,ward_intra_dist_log[:,2],'-+',label='ward')
    plot(kmean_num_clusters_log,kmean_intra_dist_log[:,0],'-v',label='kmean')
    plot(kmean_num_clusters_log,kmean_intra_dist_log[:,1],'-v',label='kmean')
    plot(kmean_num_clusters_log,kmean_intra_dist_log[:,2],'-v',label='kmean')
    plot(aff_num_clusters,aff_intra_dist[0],'*',label='aff')
    plot(aff_num_clusters,aff_intra_dist[1],'*',label='aff')
    plot(aff_num_clusters,aff_intra_dist[2],'*',label='aff')
    plt.legend()
    
    fig = plt.figure('Inter_dist')
    fig.suptitle('Inter_dist')
    plot(pack_num_clusters,inter_dist[0],'s',label='pack')
    plot(pack_num_clusters,inter_dist[1],'s',label='pack')
    plot(pack_num_clusters,inter_dist[2],'s',label='pack')
    plot(ward_num_clusters_log,ward_inter_dist_log[:,0],'-+',label='ward')
    plot(ward_num_clusters_log,ward_inter_dist_log[:,1],'-+',label='ward')
    plot(ward_num_clusters_log,ward_inter_dist_log[:,2],'-+',label='ward')
    plot(kmean_num_clusters_log,kmean_inter_dist_log[:,0],'-v',label='kmean')
    plot(kmean_num_clusters_log,kmean_inter_dist_log[:,1],'-v',label='kmean')
    plot(kmean_num_clusters_log,kmean_inter_dist_log[:,2],'-v',label='kmean')
    plot(aff_num_clusters,aff_inter_dist[0],'*',label='aff')
    plot(aff_num_clusters,aff_inter_dist[1],'*',label='aff')
    plot(aff_num_clusters,aff_inter_dist[2],'*',label='aff')
    plt.legend()
    
    fig = plt.figure('Validity')
    fig.suptitle('Validity')
    plot(pack_num_clusters,validity,'s',label='pack')
    plot(ward_num_clusters_log,ward_validity_log,'-+',label='ward')
    plot(kmean_num_clusters_log,kmean_validity_log,'-v',label='kmean')
    plot(aff_num_clusters,aff_validity,'*',label='aff')
    plt.legend()

    aff_intra_err_cnt, aff_inter_err_cnt=check_bounded_distance_constraint_condition(distmat_input,aff_labels,min_dist_,max_dist_)        
    ward_intra_err_cnt, ward_inter_err_cnt=check_bounded_distance_constraint_condition(distmat_input,ward_labels,min_dist_,max_dist_)        
    kmean_intra_err_cnt, kmean_inter_err_cnt=check_bounded_distance_constraint_condition(distmat_input,kmean_labels,min_dist_,max_dist_)        
    pack_intra_err_cnt, pack_inter_err_cnt=check_bounded_distance_constraint_condition(distmat_input,pack_labels,min_dist_,max_dist_)        

    print 'error count'
    print '-----------------------------'
    print 'pack_intra_err_cnt:', pack_intra_err_cnt,   'pack_inter_err_cnt:', pack_inter_err_cnt
    print 'aff_intra_err_cnt:', aff_intra_err_cnt,     'aff_inter_err_cnt:', aff_inter_err_cnt
    print 'ward_intra_err_cnt:', ward_intra_err_cnt,   'ward_inter_err_cnt:', ward_inter_err_cnt
    print 'kmean_intra_err_cnt:', kmean_intra_err_cnt, 'kmean_inter_err_cnt:', kmean_inter_err_cnt
    
    print '==========================================================='
    print 'End of Clustering Test'
    print '==========================================================='   
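
The header comments in CLUSTERING_TEST say that a smaller damping value tends to generate more clusters. A minimal sketch on a synthetic similarity matrix (an illustration of scikit-learn's interface, not of the sensor data used above) to try that claim:

import numpy as np
from sklearn import cluster
from sklearn.metrics.pairwise import euclidean_distances

rng = np.random.RandomState(0)
X = rng.randn(60, 4)
S = -euclidean_distances(X, squared=True)
for damping in (0.5, 0.7, 0.9):
    _, labels = cluster.affinity_propagation(S, damping=damping)
    print(damping, int(labels.max()) + 1)   # damping vs. resulting number of clusters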
Code Example #17
def kluster(form):

    try:

        tickerA = web.DataReader(form.tickerA + '.sa',
                                 data_source='yahoo')[-252:]
        tickerB = web.DataReader(form.tickerB + '.sa',
                                 data_source='yahoo')[-252:]
        tickerC = web.DataReader(form.tickerC + '.sa',
                                 data_source='yahoo')[-252:]
        tickerD = web.DataReader(form.tickerD + '.sa',
                                 data_source='yahoo')[-252:]
        tickerE = web.DataReader(form.tickerE + '.sa',
                                 data_source='yahoo')[-252:]

        barchart = [tickerA, tickerB, tickerC, tickerD, tickerE]

        names = [
            form.tickerA, form.tickerB, form.tickerC, form.tickerD,
            form.tickerE
        ]

        quotes = []

        for item in barchart:
            portfolio = pd.DataFrame(item)
            quotes.append(portfolio)

        names = pd.DataFrame(names).T
        opening_quotes = np.array([quote.Open
                                   for quote in quotes]).astype(np.float)
        closing_quotes = np.array([quote.Close
                                   for quote in quotes]).astype(np.float)

        delta_quotes = closing_quotes - opening_quotes

        edge_model = covariance.GraphLassoCV()

        X = delta_quotes.copy().T
        X /= X.std(axis=0)

        with np.errstate(invalid='ignore'):
            edge_model.fit(X)

        from sklearn import cluster

        _, labels = cluster.affinity_propagation(edge_model.covariance_)
        num_labels = labels.max()

        k = []

        for i in range(num_labels + 1):
            try:
                # use a name that does not shadow the imported sklearn `cluster` module
                cluster_row = (i + 1, ', '.join(names.T[0][labels == i]))
                k.append(cluster_row)
            except Exception:
                pass  # or you could use 'continue'

        kluster = pd.DataFrame(list(k))
        kluster.columns = ['Cluster', 'Ticker']
        kluster = kluster.to_html(index=False, columns=['Cluster', 'Ticker'])

    except Exception:
        return render_to_response('project/apologies.html')

    return render_to_response('cluster.html', context={'kluster': kluster})
Code Example #18
def stock_structure_demo():
    start_date = datetime(2005, 1, 1).date()
    end_date = datetime(2008, 1, 1).date()
    symbol_dict = {
        'NYSE:TOT': 'Total',
        'NYSE:XOM': 'Exxon',
        'NYSE:CVX': 'Chevron',
        'NYSE:COP': 'ConocoPhillips',
        'NYSE:VLO': 'Valero Energy',
        'NASDAQ:MSFT': 'Microsoft',
        'NYSE:IBM': 'IBM',
        'NYSE:TWX': 'Time Warner',
        'NASDAQ:CMCSA': 'Comcast',
        'NYSE:CVC': 'Cablevision',
        'NASDAQ:YHOO': 'Yahoo',
        'NASDAQ:DELL': 'Dell',
        'NYSE:HPQ': 'HP',
        'NASDAQ:AMZN': 'Amazon',
        'NYSE:TM': 'Toyota',
        'NYSE:CAJ': 'Canon',
        'NYSE:SNE': 'Sony',
        'NYSE:F': 'Ford',
        'NYSE:HMC': 'Honda',
        'NYSE:NAV': 'Navistar',
        'NYSE:NOC': 'Northrop Grumman',
        'NYSE:BA': 'Boeing',
        'NYSE:KO': 'Coca Cola',
        'NYSE:MMM': '3M',
        'NYSE:MCD': 'McDonald\'s',
        'NYSE:PEP': 'Pepsi',
        'NYSE:K': 'Kellogg',
        'NYSE:UN': 'Unilever',
        'NASDAQ:MAR': 'Marriott',
        'NYSE:PG': 'Procter Gamble',
        'NYSE:CL': 'Colgate-Palmolive',
        'NYSE:GE': 'General Electrics',
        'NYSE:WFC': 'Wells Fargo',
        'NYSE:JPM': 'JPMorgan Chase',
        'NYSE:AIG': 'AIG',
        'NYSE:AXP': 'American express',
        'NYSE:BAC': 'Bank of America',
        'NYSE:GS': 'Goldman Sachs',
        'NASDAQ:AAPL': 'Apple',
        'NYSE:SAP': 'SAP',
        'NASDAQ:CSCO': 'Cisco',
        'NASDAQ:TXN': 'Texas Instruments',
        'NYSE:XRX': 'Xerox',
        'NYSE:WMT': 'Wal-Mart',
        'NYSE:HD': 'Home Depot',
        'NYSE:GSK': 'GlaxoSmithKline',
        'NYSE:PFE': 'Pfizer',
        'NYSE:SNY': 'Sanofi-Aventis',
        'NYSE:NVS': 'Novartis',
        'NYSE:KMB': 'Kimberly-Clark',
        'NYSE:R': 'Ryder',
        'NYSE:GD': 'General Dynamics',
        'NYSE:RTN': 'Raytheon',
        'NYSE:CVS': 'CVS',
        'NYSE:CAT': 'Caterpillar',
        'NYSE:DD': 'DuPont de Nemours',
        'NYSE:ABB': 'ABB'
    }
    symbols, names = np.array(sorted(symbol_dict.items())).T
    # retry is used because quotes_historical_google can temporarily fail
    # for various reasons (e.g. empty result from Google API).
    quotes = []
    for symbol in symbols:
        print('Fetching quote history for %r' % symbol, file=sys.stderr)
        quotes.append(
            retry(quotes_historical_google)(symbol, start_date, end_date))
    close_prices = np.vstack([q['close'] for q in quotes])
    open_prices = np.vstack([q['open'] for q in quotes])
    # The daily variations of the quotes are what carry most information
    variation = close_prices - open_prices
    # #############################################################################
    # Learn a graphical structure from the correlations
    edge_model = covariance.GraphLassoCV()
    # standardize the time series: using correlations rather than covariance
    # is more efficient for structure recovery
    X = variation.copy().T
    X /= X.std(axis=0)
    edge_model.fit(X)
    # #############################################################################
    # Cluster using affinity propagation
    _, labels = cluster.affinity_propagation(edge_model.covariance_)
    n_labels = labels.max()
    for i in range(n_labels + 1):
        print('Cluster %i: %s' % ((i + 1), ', '.join(names[labels == i])))

    # #############################################################################
    # Find a low-dimension embedding for visualization: find the best position of
    # the nodes (the stocks) on a 2D plane
    # We use a dense eigen_solver to achieve reproducibility (arpack is
    # initiated with random vectors that we don't control). In addition, we
    # use a large number of neighbors to capture the large-scale structure.
    node_position_model = manifold.LocallyLinearEmbedding(n_components=2,
                                                          eigen_solver='dense',
                                                          n_neighbors=6)
    embedding = node_position_model.fit_transform(X.T).T
    # #############################################################################
    # Visualization
    plt.figure(1, facecolor='w', figsize=(10, 8))
    plt.clf()
    ax = plt.axes([0., 0., 1., 1.])
    plt.axis('off')
    # Display a graph of the partial correlations
    partial_correlations = edge_model.precision_.copy()
    d = 1 / np.sqrt(np.diag(partial_correlations))
    partial_correlations *= d
    partial_correlations *= d[:, np.newaxis]
    non_zero = (np.abs(np.triu(partial_correlations, k=1)) > 0.02)
    # Plot the nodes using the coordinates of our embedding
    plt.scatter(embedding[0],
                embedding[1],
                s=100 * d**2,
                c=labels,
                cmap=plt.cm.spectral)
    # Plot the edges
    start_idx, end_idx = np.where(non_zero)
    # a sequence of (*line0*, *line1*, *line2*), where::
    #            linen = (x0, y0), (x1, y1), ... (xm, ym)
    segments = [[embedding[:, start], embedding[:, stop]]
                for start, stop in zip(start_idx, end_idx)]
    values = np.abs(partial_correlations[non_zero])
    lc = LineCollection(segments,
                        zorder=0,
                        cmap=plt.cm.hot_r,
                        norm=plt.Normalize(0, .7 * values.max()))
    lc.set_array(values)
    lc.set_linewidths(15 * values)
    ax.add_collection(lc)
    # Add a label to each node. The challenge here is that we want to
    # position the labels to avoid overlap with other labels
    for index, (name, label,
                (x, y)) in enumerate(zip(names, labels, embedding.T)):

        dx = x - embedding[0]
        dx[index] = 1
        dy = y - embedding[1]
        dy[index] = 1
        this_dx = dx[np.argmin(np.abs(dy))]
        this_dy = dy[np.argmin(np.abs(dx))]
        if this_dx > 0:
            horizontalalignment = 'left'
            x = x + .002
        else:
            horizontalalignment = 'right'
            x = x - .002
        if this_dy > 0:
            verticalalignment = 'bottom'
            y = y + .002
        else:
            verticalalignment = 'top'
            y = y - .002
        plt.text(x,
                 y,
                 name,
                 size=10,
                 horizontalalignment=horizontalalignment,
                 verticalalignment=verticalalignment,
                 bbox=dict(facecolor='w',
                           edgecolor=plt.cm.spectral(label / float(n_labels)),
                           alpha=.6))
    plt.xlim(
        embedding[0].min() - .15 * embedding[0].ptp(),
        embedding[0].max() + .10 * embedding[0].ptp(),
    )
    plt.ylim(embedding[1].min() - .03 * embedding[1].ptp(),
             embedding[1].max() + .03 * embedding[1].ptp())
    plt.show()
Code Example #19
    "WMT": "Wal-Mart",
    "WAG": "Walgreen",
    "HD": "Home Depot",
    "GSK": "GlaxoSmithKline",
    "PFE": "Pfizer",
    "SNY": "Sanofi-Aventis",
    "NVS": "Novartis",
    "KMB": "Kimberly-Clark",
    "R": "Ryder",
    "GD": "General Dynamics",
    "RTN": "Raytheon",
    "CVS": "CVS",
    "CAT": "Caterpillar",
    "DD": "DuPont de Nemours",
}

symbols, names = np.array(symbol_dict.items()).T

quotes = [finance.quotes_historical_yahoo(symbol, d1, d2, asobject=True) for symbol in symbols]

# volumes = np.array([q.volume for q in quotes]).astype(np.float)
open = np.array([q.open for q in quotes]).astype(np.float)
close = np.array([q.close for q in quotes]).astype(np.float)
variation = close - open
correlations = np.corrcoef(variation)

_, labels = cluster.affinity_propagation(correlations)

for i in range(labels.max() + 1):
    print "Cluster %i: %s" % ((i + 1), ", ".join(names[labels == i]))
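
The same idea on made-up numbers (synthetic series, not the Yahoo quotes above): np.corrcoef treats each row as one variable, so the stocks must be the rows of variation to obtain a stock-by-stock correlation matrix that affinity propagation can cluster.

import numpy as np
from sklearn import cluster

rng = np.random.RandomState(0)
common_a = rng.randn(250)
common_b = rng.randn(250)
variation = np.vstack([common_a + 0.1 * rng.randn(3, 250),   # 3 stocks that co-move
                       common_b + 0.1 * rng.randn(2, 250)])  # 2 stocks that co-move
correlations = np.corrcoef(variation)     # (5, 5) stock-by-stock correlation matrix
_, labels = cluster.affinity_propagation(correlations)
print(labels)                             # one label per hypothetical stock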
Code Example #20
def test_affinity_propagation_affinity_shape():
    """Check the shape of the affinity matrix when using `affinity_propagation."""
    S = -euclidean_distances(X, squared=True)
    err_msg = "S must be a square array"
    with pytest.raises(ValueError, match=err_msg):
        affinity_propagation(S[:, :-1])
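
To run this test outside scikit-learn's own test module, X just needs to be some 2-D data array plus the imports below; a sketch with a made-up X (the original module defines its own test data):

import numpy as np
import pytest
from sklearn.cluster import affinity_propagation
from sklearn.metrics.pairwise import euclidean_distances

X = np.random.RandomState(0).randn(20, 3)   # stand-in for the module-level test data
S = -euclidean_distances(X, squared=True)
with pytest.raises(ValueError, match="S must be a square array"):
    affinity_propagation(S[:, :-1])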
Code Example #21
            DIST_MAT[i,j]=sqrt(norm(sample1-sample2))
    cov_mat=COV_MAT
    
# normalize covariance to correlation: corr_ij = cov_ij / sqrt(cov_ii * cov_jj)
d = np.diag(cov_mat)**(-0.5)
corr_mat = d[:, np.newaxis]*cov_mat*d


################################################################################
# Unsupervised clustering for sensors given the measurement correlation 
# Find only a few representative sensors out of many sensors
################################################################################
# exemplars are a set of representative signals for each cluster
# Smaller damping input will generate more clusters, default is 0.5
# 0.5 <= damping <= 0.99
################################################################################
#exemplars, labels = cluster.affinity_propagation(cov_mat,damping=0.5)
exemplars, labels = cluster.affinity_propagation(cov_mat)
n_labels = labels.max()

for i in range(n_labels + 1):
    print('Cluster %i: %s' % ((i + 1), ', '.join(input_names[labels == i])))


###############################################################################
# Find a low-dimension embedding for visualization: find the best position of
# the nodes (the stocks) on a 2D plane

# We use a dense eigen_solver to achieve reproducibility (arpack is
# initiated with random vectors that we don't control). In addition, we
# use a large number of neighbors to capture the large-scale structure.
node_position_model = manifold.LocallyLinearEmbedding(
    n_components=2, eigen_solver='dense', n_neighbors=3)
Code Example #22
def showCovariances(names,variation):

    
    ###############################################################################
    # Learn a graphical structure from the correlations
    edge_model = covariance.GraphLassoCV()
    
    # standardize the time series: using correlations rather than covariance
    # is more efficient for structure recovery
    X = variation.copy().T
    X /= X.std(axis=0)
    edge_model.fit(X)
    
    ###############################################################################
    # Cluster using affinity propagation
    
    _, labels = cluster.affinity_propagation(edge_model.covariance_)
    n_labels = labels.max()
    
    for i in range(n_labels + 1):
        print('Cluster %i: %s' % ((i + 1), ', '.join(names[labels == i])))
    
    ###############################################################################
    # Find a low-dimension embedding for visualization: find the best position of
    # the nodes (the stocks) on a 2D plane
    
    # We use a dense eigen_solver to achieve reproducibility (arpack is
    # initiated with random vectors that we don't control). In addition, we
    # use a large number of neighbors to capture the large-scale structure.
    node_position_model = manifold.LocallyLinearEmbedding(
        n_components=2, eigen_solver='dense', n_neighbors=6)
    
    embedding = node_position_model.fit_transform(X.T).T
    
    ###############################################################################
    # Visualization
    plt.figure(1, facecolor='w', figsize=(10, 8))
    plt.clf()
    ax = plt.axes([0., 0., 1., 1.])
    plt.axis('off')
    
    # Display a graph of the partial correlations
    partial_correlations = edge_model.precision_.copy()
    d = 1 / np.sqrt(np.diag(partial_correlations))
    partial_correlations *= d
    partial_correlations *= d[:, np.newaxis]
    non_zero = (np.abs(np.triu(partial_correlations, k=1)) > 0.02)
    
    # Plot the nodes using the coordinates of our embedding
    plt.scatter(embedding[0], embedding[1], s=100 * d ** 2, c=labels,
                cmap=plt.cm.spectral)
    
    # Plot the edges
    start_idx, end_idx = np.where(non_zero)
    #a sequence of (*line0*, *line1*, *line2*), where::
    #            linen = (x0, y0), (x1, y1), ... (xm, ym)
    segments = [[embedding[:, start], embedding[:, stop]]
                for start, stop in zip(start_idx, end_idx)]
    values = np.abs(partial_correlations[non_zero])
    lc = LineCollection(segments,
                        zorder=0, cmap=plt.cm.hot_r,
                        norm=plt.Normalize(0, .7 * values.max()))
    lc.set_array(values)
    lc.set_linewidths(15 * values)
    ax.add_collection(lc)
    
    # Add a label to each node. The challenge here is that we want to
    # position the labels to avoid overlap with other labels
    for index, (name, label, (x, y)) in enumerate(
            zip(names, labels, embedding.T)):
    
        dx = x - embedding[0]
        dx[index] = 1
        dy = y - embedding[1]
        dy[index] = 1
        this_dx = dx[np.argmin(np.abs(dy))]
        this_dy = dy[np.argmin(np.abs(dx))]
        if this_dx > 0:
            horizontalalignment = 'left'
            x = x + .002
        else:
            horizontalalignment = 'right'
            x = x - .002
        if this_dy > 0:
            verticalalignment = 'bottom'
            y = y + .002
        else:
            verticalalignment = 'top'
            y = y - .002
        plt.text(x, y, name, size=10,
                 horizontalalignment=horizontalalignment,
                 verticalalignment=verticalalignment,
                 bbox=dict(facecolor='w',
                           edgecolor=plt.cm.spectral(label / float(n_labels)),
                           alpha=.6))
    
    plt.xlim(embedding[0].min() - .15 * embedding[0].ptp(),
             embedding[0].max() + .10 * embedding[0].ptp(),)
    plt.ylim(embedding[1].min() - .03 * embedding[1].ptp(),
             embedding[1].max() + .03 * embedding[1].ptp())
    
    plt.show()
Code example #23
0
def StockMarketOLD():
    ###############################################################################
    # Retrieve the data from Internet

    # Choose a time period reasonably calm (not too long ago so that we get
    # high-tech firms, and before the 2008 crash)
    d1 = datetime.datetime(2005, 1, 1)
    d2 = datetime.datetime(2009, 12, 31)

    # kraft symbol has now changed from KFT to MDLZ in yahoo
    symbol_dict = {
        'TOT': 'Total',
        'XOM': 'Exxon',
        'CVX': 'Chevron',
        'COP': 'ConocoPhillips',
        'VLO': 'Valero Energy',
        'MSFT': 'Microsoft',
        'IBM': 'IBM',
        'TWX': 'Time Warner',
        'CMCSA': 'Comcast',
        #'CVC': 'Cablevision',
        #'YHOO': 'Yahoo',
        #'DELL': 'Dell',
        'HPQ': 'HP',
        'AMZN': 'Amazon',
        'TM': 'Toyota',
        'CAJ': 'Canon',
        'MTU': 'Mitsubishi',
        'SNE': 'Sony',
        #'F': 'Ford',
        'HMC': 'Honda',
        #'NAV': 'Navistar',
        'NOC': 'Northrop Grumman',
        'BA': 'Boeing',
        'KO': 'Coca Cola',
        'MMM': '3M',
        'MCD': 'Mc Donalds',
        #'PEP': 'Pepsi',
        'MDLZ': 'Kraft Foods',
        'K': 'Kellogg',
        'UN': 'Unilever',
        'MAR': 'Marriott',
        'PG': 'Procter Gamble',
        'CL': 'Colgate-Palmolive',
        'GE': 'General Electrics',
        'WFC': 'Wells Fargo',
        'JPM': 'JPMorgan Chase',
        #'AIG': 'AIG',
        'AXP': 'American Express',
        'BAC': 'Bank of America',
        'GS': 'Goldman Sachs',
        'AAPL': 'Apple',
        'SAP': 'SAP',
        'CSCO': 'Cisco',
        'TXN': 'Texas Instruments',
        'XRX': 'Xerox',
        #'LMT': 'Lookheed Martin',
        'WMT': 'Wal-Mart',
        'WBA': 'Walgreen',
        'HD': 'Home Depot',
        'GSK': 'GlaxoSmithKline',
        'PFE': 'Pfizer',
        'SNY': 'Sanofi-Aventis',
        'NVS': 'Novartis',
        'KMB': 'Kimberly-Clark',
        'R': 'Ryder',
        'GD': 'General Dynamics',
        'RTN': 'Raytheon',
        'CVS': 'CVS',
        'CAT': 'Caterpillar',
        'DD': 'DuPont de Nemours',

        #'GM': 'General Motors',
        #'GOOG' : 'Google',
        'ORCL' : 'Oracle',
        'NVO':'Novo Nordisk',
        'LLY':'Eli Lilly and Company',
        #'FB':'Facebook',
        'MRK':'Merck Co',
        }
    '''
    symbol_dict = {'Danske.CO':'Danske Bank',
                   'Maersk-B.CO':'Maersk',
                   'DSV.CO':'DSV',
                   'FLS.CO':'FLS',
                   'Gen.CO':'Genmab',
                   'TDC.CO':'TDC',
                   'CARL-B.CO':'Carlsberg',
                   'CHR.CO':'Chr Hansen',
                   'COLO-B.CO':'Coloplast',
                   'GN.CO':'GN Store Nord',
                   'NDA-DKK.co':'Nordea',
                   'Novo-B.co':'Novo Nordisk',
                   'NZYM-B.CO':'Novozymes',
                   'PNDORA.CO':'Pandora',
                   'Tryg.co':'Tryg',
                   'VWS.CO':'Vestas',
                   'WDH.CO':'William Demant',
                   'G4s.co':'G4S',
                   'JYSK.CO':'Jyske Bank',
                   'KBHL.CO':'Kobenhavns Lufthavne',
                   'RBREW.CO':'Royal Unibrew',
                   'ROCK-B.CO':'Rockwool',
                   'SYDB.CO':'Sydbank',
                   'TOP.CO':'Topdanmark',
                   #'ALMB.CO':'Alm Brand',
                   'AURI-B.CO':'Auriga',
                   'Bava.CO':'Bavarian Nordic',
                   'BO.CO':'Bang Olufsen',
                   'DFDS.CO':'DFDS',
                   'DNORD.CO':'DS Norden',
                   'GES.CO':'Greentech',
                   'IC.CO':'IC Group',
                   'JDAN.CO':'Jeudan',
                   #'JUTBK.CO':'Jutlander Bank',
                   #'MATAS.CO':'Matas',
                   'NKT.CO':'NKT',
                   #'NNIT.CO':'NNIT',
                   'NORDJB.CO':'Nordjyske Bank',
                   #'ONXEO.CO':'Onxeo',
                   #'OSSR.CO':'Ossur',
                   'PAAL-B.CO':'Per Aarslef',
                   'RILBA.CO':'Ringkobing Landbobank',
                   'SAS-DKK.CO':'SAS',
                   'SCHO.CO':'Schouw Co.',
                   'SIM.CO':'SimCorp',
                   'Solar-B.co':'Solar B',
                   'SPNO.CO':'Spar Nord',
                   'TIV.CO':'Tivoli',
                   'UIE.CO':'UIE',
                   'VELO.CO':'Veloxis',
                   'ZEAL.CO':'Zealand Pharma'
                   }
    '''
    symbols, names = np.array(list(symbol_dict.items())).T

    for symbol in symbols:
        print symbol
        if len(pd.DataFrame(np.array([[q[5] for q in quotes_historical_yahoo(symbol,d1,d2,True,False)]]).T)) != 1259:
            print symbol, len(pd.DataFrame(np.array([[q[5] for q in quotes_historical_yahoo(symbol,d1,d2,True,False)]]).T))


    open = pd.DataFrame(np.array([[q[5] for q in quotes_historical_yahoo(symbol,d1,d2,True,False)] for symbol in symbols]).T)
    close = pd.DataFrame(np.array([[q[6] for q in quotes_historical_yahoo(symbol,d1,d2,True,False)] for symbol in symbols]).T)

    # The daily variations of the quotes are what carry most information
    variation = np.array(close - open)

    ###############################################################################
    # Learn a graphical structure from the correlations
    #edge_model = covariance.GraphLassoCV()


    # standardize the time series: using correlations rather than covariance
    # is more efficient for structure recovery


    df = pd.read_csv('data/TData9313_final5.csv',index_col=0)
    X = variation.copy()

    pd.DataFrame(np.round(np.cov(X.T),3),columns=symbols,index=symbols).to_latex('covariancetable.tex')

    print np.max(np.round(np.cov(X.T),3))

    X /= X.std(axis=0)

    # graphical_lasso expects an empirical covariance matrix (assuming this is
    # sklearn.covariance.graphical_lasso), so pass the covariance of the standardized series
    covariance_, precision_ = graphical_lasso(np.cov(X.T), 0.3)

    print pd.DataFrame(precision_)

    #edge_model.fit(X)

    ###############################################################################
    # Cluster using affinity propagation

    _, labels = cluster.affinity_propagation(covariance_)

    n_labels = labels.max()

    for i in range(n_labels + 1):
        print('Cluster %i: %s' % ((i + 1), ', '.join(symbols[labels == i])))

    ###############################################################################
    # Find a low-dimension embedding for visualization: find the best position of
    # the nodes (the stocks) on a 2D plane

    # We use a dense eigen_solver to achieve reproducibility (arpack is
    # initiated with random vectors that we don't control). In addition, we
    # use a large number of neighbors to capture the large-scale structure.
    node_position_model = manifold.LocallyLinearEmbedding(
        n_components=2, eigen_solver='dense', n_neighbors=6)

    embedding = node_position_model.fit_transform(X.T).T

    ###############################################################################
    # Visualization
    plt.figure(1, facecolor='w', figsize=(20, 16))
    plt.clf()
    ax = plt.axes([0., 0., 1., 1.])
    plt.axis('off')

    plt.annotate('From %s to %s' % (d1.strftime('%Y-%m-%d'),d2.strftime('%Y-%m-%d')),xy=(0.11,-0.37),size=25)

    print X.shape

    for i in range(n_labels + 1):
        plt.annotate('Cluster %i: %s' % ((i + 1), ', '.join(symbols[labels == i])),xy=(-0.43,0.02-i*0.02),size=18)
        pass



    # Display a graph of the partial correlations
    #partial_correlations = edge_model.precision_.copy()
    partial_correlations = precision_.copy()
    d = 1 / np.sqrt(np.diag(partial_correlations))
    partial_correlations *= d
    partial_correlations *= d[:, np.newaxis]
    non_zero = (np.abs(np.triu(partial_correlations, k=1)) > 0.02)

    # Plot the nodes using the coordinates of our embedding
    plt.scatter(embedding[0], embedding[1], s=200 * d ** 2, c=labels,
                cmap=plt.cm.spectral)

    # Plot the edges
    start_idx, end_idx = np.where(non_zero)
    #a sequence of (*line0*, *line1*, *line2*), where::
    #            linen = (x0, y0), (x1, y1), ... (xm, ym)
    segments = [[embedding[:, start], embedding[:, stop]]
                for start, stop in zip(start_idx, end_idx)]
    values = np.abs(partial_correlations[non_zero])
    lc = LineCollection(segments,
                        zorder=0, cmap=plt.get_cmap('Greys'),
                        norm=plt.Normalize(0, .7 * values.max()))
    lc.set_array(values)
    lc.set_linewidths(15 * values)
    ax.add_collection(lc)

    # Add a label to each node. The challenge here is that we want to
    # position the labels to avoid overlap with other labels
    for index, (name, label, (x, y)) in enumerate(
            zip(names, labels, embedding.T)):

        dx = x - embedding[0]
        dx[index] = 1
        dy = y - embedding[1]
        dy[index] = 1
        this_dx = dx[np.argmin(np.abs(dy))]
        this_dy = dy[np.argmin(np.abs(dx))]
        if this_dx > 0:
            horizontalalignment = 'left'
            x = x + .002
        else:
            horizontalalignment = 'right'
            x = x - .002
        if this_dy > 0:
            verticalalignment = 'bottom'
            y = y + .002
        else:
            verticalalignment = 'top'
            y = y - .002
        plt.text(x, y, name, size=22,
                 horizontalalignment=horizontalalignment,
                 verticalalignment=verticalalignment,
                 bbox=dict(facecolor='w',
                           edgecolor=plt.cm.spectral(label / float(n_labels)),
                           alpha=.6))

    plt.xlim(embedding[0].min() - .25 * embedding[0].ptp(),
             embedding[0].max() + .20 * embedding[0].ptp(),)
    plt.ylim(embedding[1].min() - .20 * embedding[1].ptp(),
             embedding[1].max() + .20 * embedding[1].ptp())

    plt.savefig('Graphs/StockCluster.pdf',bbox_inches='tight')
    plt.savefig('Graphs/StockCluster.svg',bbox_inches='tight')
    plt.show()
Code example #24
0
#%% cluster customers by weekly sales

# get weekly sales matrix and remove customers with missing data
by_cust_week = db.groupby(["CUST_NUM", "W"], as_index=False).sum()
sales_series_cust = by_cust_week.pivot(index="W", columns="CUST_NUM", values="SLSAMT")
sales_series_cust.fillna(0, inplace=True)
sales_series_cust2 = sales_series_cust.loc[:, (sales_series_cust != 0).any(axis=0)]

# keep track of which customers were removed
all_customers = db["CUST_NUM"].unique()
cust_to_remove = (sales_series_cust.sum() == 0).nonzero()[0]
customers_used = np.delete(all_customers, cust_to_remove)

# cluster using covariance matrix as similarity matrix
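# (affinity propagation reads larger matrix entries as "more similar", so customers
# whose weekly sales co-vary strongly are grouped together)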
sales_cov = sales_series_cust2.cov()
_, cust_labels = cluster.affinity_propagation(sales_cov)

# keep cluster information
cust_to_group = dict(zip(customers_used, cust_labels))

cluster_dictionary = {}  # maps clusters to all customers (index in customers_used) in them
for index, label in enumerate(cust_labels):
    if label not in cluster_dictionary:
        cluster_dictionary[label] = [index]
    else:
        cluster_dictionary[label].append(index)

main_clusters = []  # clusters with more than one customer
for key in cluster_dictionary.keys():
    if len(cluster_dictionary[key]) > 1:
        main_clusters.append((key, len(cluster_dictionary[key]), customers_used[cluster_dictionary[key][0]]))
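
# A small follow-up sketch (assumes the variables built above are in scope): print
# each multi-customer cluster with its size and one representative customer number.
for group_label, group_size, example_customer in main_clusters:
    print("cluster %d: %d customers (e.g. %s)" % (group_label, group_size, example_customer))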
Code example #25
0
def cluster_affinity(graph):
    affinity = cluster.affinity_propagation(S=nx.to_numpy_matrix(graph),
                                            max_iter=200, damping=0.6)
    return list(affinity[1])
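
# A hypothetical usage sketch (the example graph is illustrative, not from the source);
# cluster_affinity treats the graph's adjacency matrix as the similarity matrix.
import networkx as nx

demo_graph = nx.karate_club_graph()
demo_labels = cluster_affinity(demo_graph)
print('found %d clusters over %d nodes' % (max(demo_labels) + 1, len(demo_labels)))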
Code example #26
0
def graphicalAnalysis(dataset,
                      start_date='2000-01-01',
                      end_date='2020-05-31',
                      Sectors_chosen=[],
                      drop_firm=[],
                      display_SumStat=True,
                      display_IndRet=True,
                      data_rf=df_rf):

    # Check if the input dates are valid
    if (datetime.strptime(start_date, "%Y-%m-%d") > datetime.strptime(
            end_date, "%Y-%m-%d")):
        print(
            'ERROR: Revision needed! The entered \"start_date\" should be before \"end_date\".'
        )
        return 0, 0
    if (dataset.index[0] - timedelta(days=dataset.index[0].weekday()) >
            datetime.strptime(start_date, "%Y-%m-%d")):
        print(
            'WARNING: the entered \"start_date\" is outside of the range for the given dataset.'
        )
        print(
            'The \"start_date\" is adjusted to the earliest start_date, i.e. ',
            (dataset.index[0] -
             timedelta(days=dataset.index[0].weekday())).strftime("%Y-%m-%d"))
        print()
    if (dataset.index[-1] < datetime.strptime(end_date, "%Y-%m-%d")):
        print(
            'WARNING: the entered \"end_date\" is outside of the range for the given dataset.'
        )
        print('The \"end_date\" is adjusted to the lastest end_date, i.e. ',
              dataset.index[-1].strftime("%Y-%m-%d"))
        print()

    # Extract the data for the given time period
    temp = dataset[dataset.index >= start_date].copy()
    X = temp[temp.index <= end_date].copy()
    temp = data_rf[data_rf.index >= start_date].copy()
    data_rf2 = temp[temp.index <= end_date].copy()

    # Check whether specific sectors were chosen (an empty list means use all sectors)
    if Sectors_chosen:
        if (all([(s in firms_info.Sector.unique()) for s in Sectors_chosen])):
            f_in_sector_chosen = []
            for s in Sectors_chosen:
                f_in_sector_chosen += list(
                    firms_info[firms_info.Sector == s].index)
            X = X[f_in_sector_chosen]
            print('Sectors chosen in the Graphical Analysis are:')
            print(Sectors_chosen)
            print()
        else:
            print(
                'ERROR: Revision needed! At least 1 sector entered in the \"Sectors_chosen\" option is NOT in the dataset!'
            )
            print('Check your format!')
            return 0, 0

    # Check whether specific firms should be dropped (an empty list means keep all firms)
    if drop_firm:
        if (all([(f in X.columns) for f in drop_firm])):
            print('The following Firms are dropped:')
            print(drop_firm)
            print()
            X.drop(columns=drop_firm, inplace=True)
        else:
            print(
                'ERROR: Revision needed! At least 1 firm entered in the \"drop_firm\" option is NOT in the dataset!'
            )
            print('Check your format!')
            return 0, 0

    # Check if there is NA in the dataset within the given time period
    # If yes, then drop those firms before doing graphical analysis
    if (X.isnull().values.any()):
        print('WARNING: Some firms have missing data during this time period!')
        print('Dropping firms: ')
        for Xcol_dropped in list(X.columns[X.isna().any()]):
            print(Xcol_dropped)
        X = X.dropna(axis='columns')
        print()

    # Get the Start and End date of the dataset
    date_obj = X.index[0]
    start_of_week = date_obj - timedelta(days=date_obj.weekday())
    start = start_of_week.strftime("%m/%d/%Y")
    end = X.index[-1].strftime("%m/%d/%Y")

    # Get the firm names of the dataset
    names = np.array(list(X.columns))

    # Show the number of firms examined
    print('Number of firms examined:', X.shape[1])

    # #############################################################################
    # Learn a graphical structure from the correlations

    # Graphical Lasso is used here to estimate the precision matrix
    edge_model = covariance.GraphicalLassoCV(max_iter=1000)

    # standardize the time series:
    # using correlations rather than covariance is more efficient for structure recovery
    X_std = X / X.std(axis=0)
    edge_model.fit(X_std)

    # #############################################################################
    # Cluster using affinity propagation

    _, labels = cluster.affinity_propagation(edge_model.covariance_)
    n_labels = labels.max()

    for i in range(n_labels + 1):
        print('Cluster %i: %s' % ((i + 1), ', '.join(names[labels == i])))

    # #############################################################################
    # Find a low-dimension embedding for visualization: find the best position of
    # the nodes (the stocks) on a 2D plane

    node_position_model = manifold.MDS(n_components=2, random_state=0)
    embedding = node_position_model.fit_transform(X_std.T).T

    # #############################################################################
    # Visualization I

    # Specify node colors by cluster labels
    color_list = pl.cm.jet(np.linspace(0, 1, n_labels + 1))
    my_colors = [color_list[i] for i in labels]

    # Compute the partial correlations
    partial_correlations = edge_model.precision_.copy()
    d = 1 / np.sqrt(np.diag(partial_correlations))
    partial_correlations *= d
    partial_correlations *= d[:, np.newaxis]
    non_zero = (np.abs(np.triu(partial_correlations, k=1)) > 0.02)

    # Compute the edge values based on the partial correlations
    values = np.abs(partial_correlations[non_zero])
    val_max = values.max()

    # Display the partial correlation graph
    graphicalAnalysis_plot(d, partial_correlations, my_colors, names, labels,
                           embedding, val_max, title)

    # The configuration of the plot
    plot_config = [
        d, partial_correlations, my_colors, names, labels, embedding, val_max,
        title
    ]

    # #############################################################################
    # Visualization II

    # For individual firm performance over the given period
    if (display_IndRet):
        print('Individual Stock Performance over the Period ' + start +
              ' to ' + end + ' (Weekly):')
        l_r = int(np.ceil(len(names) / 4))
        l_c = 4
        f_hei = l_r * 2.5
        f_wid = l_c * 4
        ax = (X + 1).cumprod().plot(subplots=True,
                                    layout=(l_r, l_c),
                                    figsize=(f_wid, f_hei),
                                    logy=True,
                                    sharex=True,
                                    sharey=True,
                                    x_compat=True,
                                    color=my_colors)
        for i in range(l_c):
            ax[0, i].xaxis.set_tick_params(which='both',
                                           top=True,
                                           labeltop=True,
                                           labelrotation=40)
        plt.show()

    # #############################################################################
    # Show summary statistics for each firm over the given period
    if (display_SumStat):
        display(getSumStat(X, rf=data_rf2['T-Bill']))

    return [edge_model.covariance_, edge_model.precision_], plot_config
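
# A hypothetical call (the DataFrame name and sector labels below are illustrative,
# not from the source; the function expects a date-indexed frame of weekly returns):
# cov_prec, plot_config = graphicalAnalysis(weekly_returns,
#                                           start_date='2010-01-01',
#                                           end_date='2019-12-31',
#                                           Sectors_chosen=['Energy', 'Financials'])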
Code example #27
0
File: flasky.py Project: vincenzodentamaro/jbash
def getStockMarketStructure(symbol_dict):
 	
# Choose a time period reasonably calm (not too long ago so that we get
# high-tech firms, and before the 2008 crash)
        d1 = datetime.datetime(2009, 1, 1)
        d2 = datetime.datetime(2011, 1, 1)
#d1 = datetime.datetime.now() - timedelta(days=365*2)
#d2 = datetime.datetime.now()- timedelta(days=1)
# kraft symbol has now changed from KFT to MDLZ in yahoo
        symbols, names = np.array(list(symbol_dict.items())).T

        quotes = [finance.quotes_historical_yahoo(symbol, d1, d2, asobject=True)
          for symbol in symbols]

        open = np.array([q.open for q in quotes]).astype(np.float)
        close = np.array([q.close for q in quotes]).astype(np.float)

# The daily variations of the quotes are what carry most information
        variation = close - open

###############################################################################
# Learn a graphical structure from the correlations
        edge_model = covariance.GraphLassoCV()

# standardize the time series: using correlations rather than covariance
# is more efficient for structure recovery
        X = variation.copy().T
        X /= X.std(axis=0)
        edge_model.fit(X)

###############################################################################
# Cluster using affinity propagation

        _, labels = cluster.affinity_propagation(edge_model.covariance_)
        n_labels = labels.max()

        for i in range(n_labels + 1):
            print('Cluster %i: %s' % ((i + 1), ', '.join(names[labels == i])))

###############################################################################
# Find a low-dimension embedding for visualization: find the best position of
# the nodes (the stocks) on a 2D plane

# We use a dense eigen_solver to achieve reproducibility (arpack is
# initiated with random vectors that we don't control). In addition, we
# use a large number of neighbors to capture the large-scale structure.
        node_position_model = manifold.LocallyLinearEmbedding(
            n_components=2, eigen_solver='dense', n_neighbors=6)

        embedding = node_position_model.fit_transform(X.T).T

###############################################################################
# Visualization
        plt.figure(1, facecolor='w', figsize=(10, 8))
        plt.clf()
        ax = plt.axes([0., 0., 1., 1.])
        plt.axis('off')
# Display a graph of the partial correlations
        partial_correlations = edge_model.precision_.copy()
        d = 1 / np.sqrt(np.diag(partial_correlations))
        partial_correlations *= d
        partial_correlations *= d[:, np.newaxis]
        non_zero = (np.abs(np.triu(partial_correlations, k=1)) > 0.02)
# Plot the nodes using the coordinates of our embedding
        plt.scatter(embedding[0], embedding[1], s=100 * d ** 2, c=labels,
                    cmap=plt.cm.spectral)
# Plot the edges
        start_idx, end_idx = np.where(non_zero)
#a sequence of (*line0*, *line1*, *line2*), where::
#            linen = (x0, y0), (x1, y1), ... (xm, ym)
        segments = [[embedding[:, start], embedding[:, stop]]
                    for start, stop in zip(start_idx, end_idx)]
        values = np.abs(partial_correlations[non_zero])
        lc = LineCollection(segments,
                            zorder=0, cmap=plt.cm.hot_r,
                            norm=plt.Normalize(0, .7 * values.max()))
        lc.set_array(values)
        lc.set_linewidths(15 * values)
        ax.add_collection(lc)
# Add a label to each node. The challenge here is that we want to
# position the labels to avoid overlap with other labels
        for index, (name, label, (x, y)) in enumerate(
                zip(names, labels, embedding.T)):

            dx = x - embedding[0]
            dx[index] = 1
            dy = y - embedding[1]
            dy[index] = 1
            this_dx = dx[np.argmin(np.abs(dy))]
            this_dy = dy[np.argmin(np.abs(dx))]
            if this_dx > 0:
                horizontalalignment = 'left'
                x = x + .002
            else:
                horizontalalignment = 'right'
                x = x - .002
            if this_dy > 0:
                verticalalignment = 'bottom'
                y = y + .002
            else:
                verticalalignment = 'top'
                y = y - .002
	
            plt.text(x, y, name, size=10,
                    horizontalalignment=horizontalalignment,
                    verticalalignment=verticalalignment,
                    bbox=dict(facecolor='w',
                            edgecolor=plt.cm.spectral(label / float(n_labels)),
                            alpha=.6))
        plt.xlim(embedding[0].min() - .15 * embedding[0].ptp(),
                embedding[0].max() + .10 * embedding[0].ptp(),)
        plt.ylim(embedding[1].min() - .03 * embedding[1].ptp(),
                embedding[1].max() + .03 * embedding[1].ptp())
#plt.show()
        filename_1 = id_generator()+'.svg'
        plt.savefig(filename_1) 
        return filename_1
Code example #28
0

# #############################################################################
# Learn a graphical structure from the correlations
edge_model = covariance.GraphLassoCV()

# standardize the time series: using correlations rather than covariance
# is more efficient for structure recovery
X = variation.copy().T
X /= X.std(axis=0)
edge_model.fit(X)

# #############################################################################
# Cluster using affinity propagation

_, labels = cluster.affinity_propagation(edge_model.covariance_)
n_labels = labels.max()

for i in range(n_labels + 1):
    print('Cluster %i: %s' % ((i + 1), ', '.join(names[labels == i])))

# #############################################################################
# Find a low-dimension embedding for visualization: find the best position of
# the nodes (the stocks) on a 2D plane

# We use a dense eigen_solver to achieve reproducibility (arpack is
# initiated with random vectors that we don't control). In addition, we
# use a large number of neighbors to capture the large-scale structure.
node_position_model = manifold.LocallyLinearEmbedding(
    n_components=2, eigen_solver='dense', n_neighbors=6)
Code example #29
0
def clusterSymbol(dbdf):
    global dflength
    saveType = False
    try:

        book_kosdaq = xlrd.open_workbook("../../Kosdaq_symbols.xls")
        sheet_kosdaq = book_kosdaq.sheet_by_name('kosdaq')

        book_kospi = xlrd.open_workbook('../../Kospi_Symbols.xls')
        sheet_kospi = book_kospi.sheet_by_name('kospi')

        quotes2 = []
        nametitles = []
        codearrs = []
        titlefound = False
        for title in dbdf['title']:
            if ' ' in title:
                title  = title.replace(' ','')
            if '&' in title:
                title  = title.replace('&','and')
            if '-' in title:
                title  = title.replace('-','')    
            print 'title',title
            for cnt in range(sheet_kospi.nrows):
            
                if sheet_kospi.row_values(cnt)[1] == title:
                    
                    code = '{0:06d}'.format(int(sheet_kospi.row_values(cnt)[0]))
                    name = sheet_kospi.row_values(cnt)[1]
                    print code,name
                    markettype = 1
                    titlefound = True
                    break

            for cnt in range(sheet_kosdaq.nrows):
                
                if sheet_kosdaq.row_values(cnt)[1] == title:
                    
                    code = '{0:06d}'.format(int(sheet_kosdaq.row_values(cnt)[0]))
                    name = sheet_kosdaq.row_values(cnt)[1]
                    print code,name
                    markettype = 2
                    titlefound = True
                    break  

            if titlefound == False:
                continue   
            titlefound = False         
            try:        
                startdatemode = 2
                dbtradinghist = 'none'
                histmode = 'none'
                plotly = 'plotly'
                stdmode = 'stddb'
                tangentmode = 'tangentdb'        
                daych  =0
                runcount = 0
                srcsite = 1#google
                # srcsite = 2#yahoo
                writedblog = 'none'
                updbpattern = 'none'
                appenddb = 'none'

                print 'found code',code, name
                bars = cluster_fetchData(str(code),markettype,name,'realtime','dbpattern',histmode,runcount,srcsite,writedblog,updbpattern\
                                        ,appenddb,startdatemode,\
                                         dbtradinghist,plotly,stdmode,'none',daych,tangentmode)
                
                # bars = bars[1:]

                if dflength == 0:
                    dflength = len(bars)
                else:
                    if dflength > len(bars):
                        dflength = len(bars)
                
                quotes2.append(bars)
                nametitles.append(name)
                codearrs.append(code)
                clear_output()
            except Exception,e:
                # print 'error title',name
                pass

        npquotesOpen = []  
        npquotesClose = []   
        count = 0
        for q in quotes2:
            # print q.tail()
            # print pd.isnull(q).any().any()
            # if pd.isnull(q).any().any() == True:
            #     print 'NaN'
            #     continue
            q = q.fillna(0)

            if dflength < len(q):
                q = q[:dflength]
                npquotesOpen.append(q['Open'].values)
                npquotesClose.append(q['Close'].values)
                # print q['Close'].values,'count',count,len(q)    
            else:
                npquotesOpen.append(q['Open'].values)
                npquotesClose.append(q['Close'].values)
                # print q['Close'].values,'count',count,len(q)
            count += 1
            # print len(q.values),'dflength',dflength
        open2 = np.array(npquotesOpen).astype(np.float)         
        close2 = np.array(npquotesClose).astype(np.float)         
        # npquotesClose = []        
        # for q in quotes2:
        #     npquotesClose.append(q['Close'].values)
        # npquotesOpen = np.array([q['Open'].values for q in quotes2])
        # open2 =  npquotesOpen
        # npquotesClose = np.array([q['Close'].values for q in quotes2])
        # close2 =  npquotesClose
        # print npquotesOpen
        # print npquotesClose
        
        variation = (close2 - open2)
        
        symbol_dict = dict(zip(codearrs,nametitles))

        symbols, names = np.array(symbol_dict.items()).T

        edge_model = covariance.GraphLassoCV()

        # standardize the time series: using correlations rather than covariance
        # is more efficient for structure recovery
        tempX = variation.T
        # print tempX,'tempX len',len(tempX)
        X = variation.copy().T
        # print 'open len',len(open2),'close len',len(close2),'variation len',len(variation),'X len',len(X)
        print 'type open',type(open2),'type close',type(close2),'type variation',type(variation),'type X',type(X)
        print 'shape open',open2.shape,'shape close',close2.shape,'shape variation',variation.shape,'shape X',X.shape

        
        X /= X.std(axis=0)
        edge_model.fit(X)

        # ###############################################################################
        # # Cluster using affinity propagation

        _, labels = cluster.affinity_propagation(edge_model.covariance_)
        n_labels = labels.max()

        # print names
        # print 'type symbols',type(symbols),'type names',type(names)
        # for name in names:
        #     print 'name',name
        # print names[0],names[1],names[2],names[3]
        # print 'lables',labels,'n_labels',n_labels,'type labels',type(labels)

        randomtitles = pd.DataFrame()
        for i in range(n_labels+1):
            # print labels == i
            print('Cluster %i: %s' % ((i + 1), ', '.join(names[labels == i])))
            if 1 < len(names[labels==i]) <= 3:
                # print 'random cluster ',np.random.choice(names[labels==i],3)
                tmpdf = pd.DataFrame({'title':np.random.choice(names[labels==i],1)})
                randomtitles = pd.concat([tmpdf, randomtitles])
            elif 3 < len(names[labels==i]) <= 5:
                tmpdf = pd.DataFrame({'title':np.random.choice(names[labels==i],2)})
                randomtitles = pd.concat([tmpdf, randomtitles])
            elif 5 < len(names[labels==i]) <= 7:
                tmpdf = pd.DataFrame({'title':np.random.choice(names[labels==i],4)})
                randomtitles = pd.concat([tmpdf, randomtitles])    
            elif 7 < len(names[labels==i]) :
                tmpdf = pd.DataFrame({'title':np.random.choice(names[labels==i],5)})
                randomtitles = pd.concat([tmpdf, randomtitles])        
                # print randomtitles

        # for i in range(n_labels + 1):
        #     print 'Cluster '+str(i + 1)+', '+ names[labels == i]
        
        # ###############################################################################
        # Find a low-dimension embedding for visualization: find the best position of
        # the nodes (the stocks) on a 2D plane

        # We use a dense eigen_solver to achieve reproducibility (arpack is
        # initiated with random vectors that we don't control). In addition, we
        # use a large number of neighbors to capture the large-scale structure.
        node_position_model = manifold.LocallyLinearEmbedding(
            n_components=2, eigen_solver='dense', n_neighbors=6)

        embedding = node_position_model.fit_transform(X.T).T

        # ###############################################################################
        # Visualization
        pl.figure(1, facecolor='w', figsize=(15, 15))
        pl.clf()
        ax = pl.axes([0., 0., 1., 1.])
        pl.axis('off')

        # Display a graph of the partial correlations
        partial_correlations = edge_model.precision_.copy()
        d = 1 / np.sqrt(np.diag(partial_correlations))
        partial_correlations *= d
        partial_correlations *= d[:, np.newaxis]
        non_zero = (np.abs(np.triu(partial_correlations, k=1)) > 0.02)

        # Plot the nodes using the coordinates of our embedding
        pl.scatter(embedding[0], embedding[1], s=100 * d ** 2, c=labels,
                   cmap=pl.cm.spectral)

        # Plot the edges
        start_idx, end_idx = np.where(non_zero)
        #a sequence of (*line0*, *line1*, *line2*), where::
        #            linen = (x0, y0), (x1, y1), ... (xm, ym)
        segments = [[embedding[:, start], embedding[:, stop]]
                    for start, stop in zip(start_idx, end_idx)]
        values = np.abs(partial_correlations[non_zero])
        lc = LineCollection(segments,
                            zorder=0, cmap=pl.cm.hot_r,
                            norm=pl.Normalize(0, .7 * values.max()))
        lc.set_array(values)
        lc.set_linewidths(15 * values)
        ax.add_collection(lc)

        # Add a label to each node. The challenge here is that we want to
        # position the labels to avoid overlap with other labels
        for index, (name, label, (x, y)) in enumerate(
                zip(names, labels, embedding.T)):

            dx = x - embedding[0]
            dx[index] = 1
            dy = y - embedding[1]
            dy[index] = 1
            this_dx = dx[np.argmin(np.abs(dy))]
            this_dy = dy[np.argmin(np.abs(dx))]
            if this_dx > 0:
                horizontalalignment = 'left'
                x = x + .002
            else:
                horizontalalignment = 'right'
                x = x - .002
            if this_dy > 0:
                verticalalignment = 'bottom'
                y = y + .002
            else:
                verticalalignment = 'top'
                y = y - .002
            pl.text(x, y, name, size=10,
                    horizontalalignment=horizontalalignment,
                    verticalalignment=verticalalignment,
                    bbox=dict(facecolor='w',
                              edgecolor=pl.cm.spectral(label / float(n_labels)),
                              alpha=.6))

        pl.xlim(embedding[0].min() - .15 * embedding[0].ptp(),
                embedding[0].max() + .10 * embedding[0].ptp(),)
        pl.ylim(embedding[1].min() - .03 * embedding[1].ptp(),
                embedding[1].max() + .03 * embedding[1].ptp())

        pl.show()
        
        return randomtitles