def testLostFunctionReference(self):
    """test for bug #1727558

    Three points are clustered into three groups while a custom
    two-argument callable is supplied as the second constructor argument;
    the outcome is checked against the input points with ``compare_list``.
    """
    points = [(1, 1), (20, 40), (20, 41)]
    combine = lambda x, y: x + y
    found = KMeansClustering(points, combine).getclusters(3)

    # Kept as a separate list object so the check does not alias the input.
    expected = [(1, 1), (20, 40), (20, 41)]
    same = compare_list(found, expected)
    failure_text = "Elements differ!\n%s\n%s" % (found, expected)
    self.assertTrue(same, failure_text)
Ejemplo n.º 2
0
    def mk_clusters(self, points, k, clusters):
        """Recursively split ``points`` with k-means and collect the pieces.

        ``points`` is clustered into ``k`` groups and every group is handled
        according to its size:

        * up to 2 members: kept as a final cluster;
        * 3 to 9 members: split in two on a trial basis; the two halves
          replace the group only when their centroids are at least 0.025
          apart (as measured by ``Point(...).centroid.distance``);
        * 10 or more members: split in two recursively.

        Args:
            points: collection handed to ``KMeansClustering``.
            k: number of clusters requested at this level.
            clusters: list receiving the final clusters; extended in place.

        Returns:
            The ``clusters`` list after all groups have been added.

        NOTE(review): a group of 10+ points that keeps coming back from
        ``getclusters(2)`` as one cluster would recurse without shrinking -
        confirm that KMeansClustering always splits such input.
        """
        res = KMeansClustering(points, None).getclusters(k)

        for group in res:
            count = len(group)

            if count > 9:
                # Too large to keep as is: split further into the same list.
                clusters = self.mk_clusters(group, 2, clusters)
                continue

            # isinstance (rather than an exact type comparison) also accepts
            # list subclasses and does not depend on the ``types`` module.
            if count > 2 and isinstance(group, list):
                halves = self.mk_clusters(group, 2, [])

                if len(halves) > 1:
                    first = Point(halves[0])
                    second = Point(halves[1])

                    if first.centroid.distance(second.centroid) >= 0.025:
                        # Far enough apart: keep the pieces, not the group.
                        clusters.extend(halves)
                        continue

            clusters.append(group)

        return clusters
Ejemplo n.º 3
0
    def getFlickrClusters(self, place = 'san francisco'):
        """Cluster the stored photo locations of ``place`` and write the centres.

        Every document returned by ``self.flickr.find({"place": place})``
        contributes the pair ``(loc[0], loc[1])`` (latitude, longitude).  The
        pairs are grouped into ``self.clusterNum`` clusters and the mean point
        of each cluster is written to ``self.response`` as a single string of
        the form ``[(lat,lon),(lat,lon)]``.

        Args:
            place: value matched against the ``place`` field of the photos.
        """
        start = time.clock()

        clusterData = [(photo['loc'][0], photo['loc'][1])
                       for photo in self.flickr.find({"place": place})]

        clusters = KMeansClustering(clusterData).getclusters(self.clusterNum)
        elapsed = time.clock() - start
        # A single parenthesised argument prints the same text whether
        # ``print`` is the Python 2 statement or the Python 3 function.
        print("getFlickrCluster finished in %s seconds" % elapsed)

        # Only the centre (arithmetic mean) of each cluster is reported.
        centers = []
        for cluster in clusters:
            # Float divisor: integer coordinates must not be truncated.
            size = float(len(cluster))
            centerLat = sum(point[0] for point in cluster) / size
            centerLon = sum(point[1] for point in cluster) / size
            centers.append("(%s,%s)" % (centerLat, centerLon))

        # join() places the commas, so no trailing comma has to be removed.
        self.response.write("[%s]" % ",".join(centers))
Ejemplo n.º 4
0
        def mk_clusters(self, points, k, clusters):
                """Recursively split ``points`` with k-means and collect the pieces.

                ``points`` is clustered into ``k`` groups and every group is
                handled according to its size:

                * up to 2 members: kept as a final cluster;
                * 3 to 9 members: split in two on a trial basis; the two
                  halves replace the group only when their centroids are at
                  least 0.025 apart (as measured by
                  ``Point(...).centroid.distance``);
                * 10 or more members: split in two recursively.

                Args:
                    points: collection handed to ``KMeansClustering``.
                    k: number of clusters requested at this level.
                    clusters: list receiving the final clusters; extended in
                        place.

                Returns:
                    The ``clusters`` list after all groups have been added.

                NOTE(review): a group of 10+ points that keeps coming back
                from ``getclusters(2)`` as one cluster would recurse without
                shrinking - confirm that KMeansClustering always splits it.
                """
                res = KMeansClustering(points, None).getclusters(k)

                for group in res:
                        count = len(group)

                        if count > 9:
                                # Too large to keep: split into the same list.
                                clusters = self.mk_clusters(group, 2, clusters)
                                continue

                        # isinstance also accepts list subclasses and does not
                        # depend on the ``types`` module.
                        if count > 2 and isinstance(group, list):
                                halves = self.mk_clusters(group, 2, [])

                                if len(halves) > 1:
                                        first = Point(halves[0])
                                        second = Point(halves[1])

                                        if first.centroid.distance(second.centroid) >= 0.025:
                                                # Far enough apart: keep the pieces.
                                                clusters.extend(halves)
                                                continue

                        clusters.append(group)

                return clusters
Ejemplo n.º 5
0
def cluster_stations(stations, empty='empty'):
    """Cluster a filtered subset of stations geographically with k-means.

    Uses the cluster library to run k-means (4 clusters) on the coordinate
    pair ``(row[4], row[5])`` of the selected rows.  Expects a list in the
    format returned by prep_stations and returns a similar list with the
    cluster number appended to every element.

    Args:
        stations: rows indexable at positions 0..5.  ``row[2]`` and
            ``row[3]`` feed the ratio filter below (presumably a count and a
            capacity - verify against prep_stations).
        empty: with the default ``'empty'`` rows are kept when
            ``(row[3] - row[2]) / row[3] < 0.2``; with any other value rows
            are kept when ``row[2] / row[3] < 0.2``.

    Returns:
        A list of ``[row[0], ..., row[5], cluster_number]`` lists, grouped by
        cluster, with cluster numbers starting at 1.  Each selected row
        appears exactly once.

    Raises:
        ZeroDivisionError: if a row has ``row[3] == 0``.
    """
    if empty == 'empty':
        tocluster = [i for i in stations if (i[3] - i[2])/float(i[3]) < .2]
    else:
        tocluster = [i for i in stations if (i[2])/float(i[3]) < .2]
    clusters = KMeansClustering(
        [(i[4], i[5]) for i in tocluster]).getclusters(4)

    # getclusters() hands back bare coordinate tuples, so the rows have to be
    # re-associated.  Index them by coordinates once instead of scanning
    # every row for every clustered point.
    rows_at = {}
    for row in tocluster:
        rows_at.setdefault((row[4], row[5]), []).append(row)

    clustered = []
    for number, members in enumerate(clusters, 1):
        for point in members:
            # pop(): rows sharing the same coordinates are emitted the first
            # time the coordinates are seen and never again, so duplicated
            # coordinates no longer yield duplicated output rows.
            for row in rows_at.pop((point[0], point[1]), ()):
                clustered.append([row[0], row[1], row[2],
                                  row[3], row[4], row[5], number])

    return clustered
Ejemplo n.º 6
0
def buildKcluster(data,k):
	"""
	Description: build a k-means clustering of ``data``, name the clusters,
	store them through writeCluster and write a log file for the run.

	Input:
			data: list of records, e.g. data = [[12,12],[34,34],
								[23,23],[32,32],
								[46,46],[96,96],
								[13,13],[1,1],
								[4,4],[9,9]]
				Original note: "The first variable is key, not counted
				for clustering" (presumably honoured by
				distance_function - verify there).
			k: number of clusters
	Output: cluster record file /searchc/save/K.cluster (as stated by the
			original author; the actual writing is done by writeCluster)
			and the log file <path>/log/K_<k>.log
	"""
	# Single parenthesised argument: same output with the Python 2 print
	# statement and the Python 3 print function.
	print("Clustering...")
	started = datetime.datetime.now()
	clusterK = KMeansClustering(data,distance_function).getclusters(k)	# get k clusters
	finished = datetime.datetime.now()
	print("Naming...")
	featureAll = readFeature('all')
	named = nameCluster(clusterK,featureAll)
	name = named[0]
	centroid = named[1]
	writeCluster('K',clusterK,name,centroid,k)
	print("Writting log...")
	header = ("KMean Clustering Log\nDate:\t"+str(started.date())
		+"\nStart:\t"+str(started.time())
		+"\nEnd:\t"+str(finished.time())
		+"\nDuration:\t"+str(finished-started)
		+"\nK:\t"+str(k)
		+"\nNo. cluster:\t"+str(len(clusterK)))
	with open(path+'/log/K_'+str(k)+'.log','w') as outfile:
		# The header needs its own line ending; without it the first
		# cluster size was glued onto the "No. cluster" value.
		outfile.write(header+"\n")
		for cluster in clusterK:
			# NOTE(review): the reason for subtracting 2 from each cluster
			# size is not visible in this function - confirm.
			outfile.write(str(len(cluster)-2)+"\n")
	return
Ejemplo n.º 7
0
 def testNumpyRandom(self):
     # Smoke test: a 500 x 2 array of random numbers is clustered into 10
     # groups; the test passes as long as nothing raises.
     from cluster import KMeansClustering
     from numpy import random as rnd

     def squared_distance(a, b):
         # Squared Euclidean distance over the first two components.
         dx = a[0] - b[0]
         dy = a[1] - b[1]
         return dx ** 2 + dy ** 2

     points = rnd.rand(500, 2)
     # numpy.array_equal is handed over as the third constructor argument
     # (presumably the equality predicate for array items - confirm).
     clusterer = KMeansClustering(points, squared_distance, numpy.array_equal)
     clusterer.getclusters(10)
Ejemplo n.º 8
0
 def cluster(self, clusters):
     """
     Cluster the tag indices and remember the result.

     The items being clustered are the integer positions
     0 .. len(self.__tags) - 1; self.__tagDistance is supplied as the
     distance function.

     clusters is the final number of clusters.

     Returns the result of getclusters(clusters), which is also stored in
     self.__clusters.
     """
     # The previous map(lambda x: (x), ...) was an identity mapping ("(x)" is
     # not a tuple); an explicit list is the same value on Python 2 and is
     # still a real list on Python 3, where map() is a one-shot iterator.
     indices = list(range(len(self.__tags)))
     cl = KMeansClustering(indices, self.__tagDistance)
     self.__clusters = cl.getclusters(clusters)
     return self.__clusters
Ejemplo n.º 9
0
	def tod_clusters(self):
		"""Group the time-of-day data into 10 k-means clusters.

		One summary dict is built per cluster with the keys ``len``,
		``tod_avg``, ``tod_min``, ``start``, ``end``, ``width`` and ``left``;
		the dicts are then sorted by ``tod_min``.  ``width`` and ``left`` are
		expressed in units of 60*60*24/100 (1/100 of a day, assuming the
		first element of every item is a number of seconds - TODO confirm).

		NOTE(review): the original docstring read "Returns Timespans", but
		the sorted list is never returned or stored, so callers receive
		None.  Left unchanged because callers may depend on it - confirm
		what is supposed to happen with ``groups``.
		"""
		
		try: # get tod clustering data
			tod_data = self.time_of_day_data()
		except ApiException:
			deauthenticate()
			# No data was obtained; continuing would only fail on the
			# unbound ``tod_data`` below.
			return
		
		# determine tod clusters
		kmcl = KMeansClustering(tod_data)
		clusters = kmcl.getclusters(10)
		#todo: kmeans is paralelizable, so I could use MapReduce
		#todo: move this to an ajax call
		
		# format tod groups
		slot = 60*60*24/100
		groups=[]
		for cl in clusters:
			# Extract the time-of-day component once per cluster.
			tods=[i[0] for i in cl]
			tod_max=max(tods)
			tod_min=min(tods)
			groups.append(dict(
				len=len(cl),
				tod_avg=sum(tods)/len(cl),
				tod_min=tod_min,
				start=pretty_tod(tod_min),
				end=pretty_tod(tod_max),
				width=int(floor((tod_max-tod_min)/slot)),
				left=int(floor(tod_min/slot))
			))
		
		groups=sorted(groups, key=lambda k: k['tod_min'])
Ejemplo n.º 10
0
 def testMultidimArray(self):
     # Smoke test: 200 random two-element lists are clustered into 10 groups
     # with a squared-distance callable; passes as long as nothing raises.
     from random import random

     def sq_dist(a, b):
         return (a[0] - b[0])**2 + (a[1] - b[1])**2

     samples = [[random(), random()] for _ in range(200)]
     KMeansClustering(samples, sq_dist).getclusters(10)
Ejemplo n.º 11
0
 def testMultidimArray(self):
     # Exercises clustering of list-typed (rather than tuple-typed) points:
     # 200 random pairs, 10 clusters, no assertion beyond "does not raise".
     from random import random

     points = []
     while len(points) < 200:
         points.append([random(), random()])

     def squared(u, v):
         delta_first = u[0] - v[0]
         delta_second = u[1] - v[1]
         return delta_first ** 2 + delta_second ** 2

     clusterer = KMeansClustering(points, squared)
     clusterer.getclusters(10)
Ejemplo n.º 12
0
 def testNumpyRandom(self):
     # Clusters a random 500 x 2 numpy array into 10 groups; there is no
     # assertion, the call merely has to complete without raising.
     from cluster import KMeansClustering
     from numpy import random as rnd

     sample = rnd.rand(500, 2)

     def metric(a, b):
         # Squared Euclidean distance over the first two components.
         return (a[0] - b[0])**2 + (a[1] - b[1])**2

     # Third argument: numpy.array_equal (presumably used to compare array
     # items for equality - confirm against the library).
     KMeansClustering(sample, metric, numpy.array_equal).getclusters(10)
Ejemplo n.º 13
0
def make_cluster():
	"""Cluster INPUT_SPACE into NUMBER_OF_CLUSTERS groups and report timing.

	Prints progress messages, pretty-prints the clusters with ``pp`` and
	finally prints the wall-clock time spent in the clustering step together
	with the number of input elements.
	"""
	# Each print() gets a single argument, so the output is the same under
	# the Python 2 print statement and the Python 3 print function.
	print("Starting Clustering..")
	began = time.time()
	clusters = KMeansClustering(INPUT_SPACE).getclusters(NUMBER_OF_CLUSTERS)
	elapsed = time.time() - began
	print("Clustering Done..")
	pp.pprint(clusters)
	pieces = ["total time ", str(elapsed), " secs for ", str(len(INPUT_SPACE)), " element"]
	print("".join(pieces))
Ejemplo n.º 14
0
 def testClustering(self):
     "Basic clustering test"
     # Twelve points in two visibly separate groups; the expected result
     # fixes both the grouping and the order inside each group.
     points = [(8, 2), (7, 3), (2, 6), (3, 5), (3, 6), (1, 5),
               (8, 1), (3, 4), (8, 3), (9, 2), (2, 5), (9, 3)]
     found = KMeansClustering(points).getclusters(2)

     first_group = [(8, 2), (8, 1), (8, 3), (7, 3), (9, 2), (9, 3)]
     second_group = [(3, 5), (1, 5), (3, 4), (2, 6), (2, 5), (3, 6)]
     self.assertEqual(found, [first_group, second_group])
Ejemplo n.º 15
0
	def geo_clusters(self, days=None, start_time=None, end_time=None):
		"""Group this user's check-in coordinates into 20 k-means clusters.

		For every cluster a summary dict is built (size, average position,
		bounding box, mid points and a ``radius`` value); the overall
		northern/southern/eastern/western extremes are tracked in ``most``.
		Items of ``self.geo_data()`` are read as ``(lat, lng)`` pairs.

		Args:
			days, start_time, end_time: accepted but not used by this body.

		NOTE(review): the original docstring read "Returns Clusters of
		Checkins for this User", but neither ``areas`` nor ``most`` is
		returned or stored, so callers receive None.  Left unchanged because
		callers may depend on it - confirm the intended result.
		"""
		
		try: # get geo clustering data
			geo_data = self.geo_data()
		except ApiException:
			deauthenticate()
			# No data was obtained; continuing would only fail on the
			# unbound ``geo_data`` below.
			return
		
		# determine geo clusters
		kmcl = KMeansClustering(geo_data)
		clusters = kmcl.getclusters(20)
		#todo: kmeans is paralelizable, so I could use MapReduce
		#todo: move this to an ajax call
		
		# format geo areas
		areas=[]
		most=dict(
			north=None,
			south=None,
			east=None,
			west=None
		)
		for cl in clusters:
			lats=[i[0] for i in cl]
			lngs=[i[1] for i in cl]
			
			# "is None" rather than truthiness: a bound of exactly 0 is a
			# valid coordinate and must not be treated as "not set yet".
			lat_max=max(lats)
			if most['north'] is None or lat_max > most['north']:
				most['north'] = lat_max
				
			lat_min=min(lats)
			if most['south'] is None or lat_min < most['south']:
				most['south'] = lat_min
			
			lng_max=max(lngs)
			if most['east'] is None or lng_max > most['east']:
				most['east'] = lng_max
				
			lng_min=min(lngs)
			if most['west'] is None or lng_min < most['west']:
				most['west'] = lng_min
				
			areas.append(dict(
				len=len(cl),
				# NOTE(review): total/cluster size/100 is not a share of the
				# total and is integer division on Python 2 - confirm the
				# intended formula for ``percent`` and ``opacity``.
				percent=len(geo_data)/len(cl)/100,
				opacity=float(len(geo_data))/100.0/float(len(cl)),
				avg_lat=sum(lats)/len(cl),
				avg_lng=sum(lngs)/len(cl),
				lat_max=lat_max,
				lat_min=lat_min,
				lat_mid=lat_max-(lat_max-lat_min) / 2,
				lng_max=lng_max,
				lng_min=lng_min,
				lng_mid=lng_max-(lng_max-lng_min) / 2,
				radius=(lat_max - lat_min + lng_max - lng_min) / 2
			))
		
		areas=sorted(areas, key=lambda k: k['percent'])
Ejemplo n.º 16
0
    def testUnmodifiedData(self):
        "Clustering must hand back exactly the input points"
        data = [(8, 2), (7, 3), (2, 6), (3, 5), (3, 6), (1, 5), (8, 1), (3, 4),
                (8, 3), (9, 2), (2, 5), (9, 3)]
        cl = KMeansClustering(data)

        # Flatten the clusters with a plain loop; a list comprehension used
        # only for its side effect builds a throw-away list of None values.
        new_data = []
        for members in cl.getclusters(2):
            new_data.extend(members)

        # Order inside and across clusters is irrelevant here, only the
        # multiset of points has to match.
        self.assertEqual(sorted(new_data), sorted(data))
Ejemplo n.º 17
0
 def testClustering(self):
     "Basic clustering test"
     # Input: twelve 2-D points.  The expectation pins the two clusters and
     # the order of the points inside each of them.
     data = [
         (8, 2), (7, 3), (2, 6), (3, 5), (3, 6), (1, 5),
         (8, 1), (3, 4), (8, 3), (9, 2), (2, 5), (9, 3),
     ]
     clusterer = KMeansClustering(data)
     actual = clusterer.getclusters(2)
     expected = [
         [(8, 2), (8, 1), (8, 3), (7, 3), (9, 2), (9, 3)],
         [(3, 5), (1, 5), (3, 4), (2, 6), (2, 5), (3, 6)],
     ]
     self.assertEqual(actual, expected)
Ejemplo n.º 18
0
    def testUnmodifiedData(self):
        "Clustering must hand back exactly the input points"
        data = [(8, 2), (7, 3), (2, 6), (3, 5), (3, 6), (1, 5), (8, 1), (3, 4),
                (8, 3), (9, 2), (2, 5), (9, 3)]
        cl = KMeansClustering(data)

        # Flatten the clusters with a plain loop; a list comprehension used
        # only for its side effect builds a throw-away list of None values.
        new_data = []
        for members in cl.getclusters(2):
            new_data.extend(members)

        # Order inside and across clusters is irrelevant here, only the
        # multiset of points has to match.
        self.assertEqual(sorted(new_data), sorted(data))
Ejemplo n.º 19
0
 def testLostFunctionReference(self):
     """test for bug #1727558

     Clusters three points into three groups with a caller-supplied
     two-argument callable and compares the outcome with the input points.
     """
     def combine(x, y):
         return x + y

     source_points = [(1, 1), (20, 40), (20, 41)]
     clusters = KMeansClustering(source_points, combine).getclusters(3)

     # Separate list object, so the expectation never aliases the input.
     expected = [(1, 1), (20, 40), (20, 41)]
     outcome = compare_list(clusters, expected)
     details = "Elements differ!\n%s\n%s" % (clusters, expected)
     self.assertTrue(outcome, details)
Ejemplo n.º 20
0
def clusterSet(traingingStart,traningEndDate,clu_num): 
    """Cluster the daily change percentages of each stock index and label rows.

    For every index in the hard-coded stock list the rows of
    t_enriched_bloomberg_prices with post_date between ``traingingStart`` and
    ``traningEndDate`` (both inclusive) are loaded.  The distinct
    change_percent values are grouped into ``clu_num`` k-means clusters and
    every row is passed to UpdateEnrichedData with its cluster number
    appended.  Finally two JSON files are written: the [min, max]
    change_percent range of every cluster per stock (configuration key
    "model"/"TREND_RANGE_FILE") and all labelled rows (configuration key
    "training"/"TRAINING_TREND_RECORDS").

    Args:
        traingingStart: lower bound for post_date.
        traningEndDate: upper bound for post_date.
        clu_num: number of clusters built per stock index.
    """
    con = common.getDBConnection()
    try:
        cur = con.cursor()

        finalClusterRecord = []
        stockList = ["MERVAL","MEXBOL","CHILE65","BVPSBVPS","COLCAP","CRSMBCT","IBOV","IGBVL","IBVC"]
        finalOrderCluster = {}
        # The statement does not depend on the stock, so build it once.
        sql = "select embers_id,post_date,current_value,previous_close_value,one_day_change,change_percent,name from t_enriched_bloomberg_prices where name=? and post_date<=? and post_date>=? order by post_date asc"
        for stock in stockList:
            cur.execute(sql,(stock,traningEndDate,traingingStart))
            rows = cur.fetchall()
            changes = [row[5] for row in rows]
            fdist = nltk.FreqDist(changes)
            # One point per distinct change value; the constant first
            # coordinate makes the data effectively one-dimensional.
            clusterS = [(0,x) for x in fdist.keys()]

            c1 = KMeansClustering(clusterS)
            cluster = c1.getclusters(clu_num)
            # Sample of the data returned by the KMeans algorithm (shortened):
            #   cluster = [[(0, 0.0862), (0, 0.088), (0, 0.0914), ...],
            #              [(0, -0.0001), (0, 0.0), (0, 0.0001), ...],
            #              ...]

            orderCluster = {}
            # point -> cluster numbers containing it; replaces scanning
            # every cluster list for every row.
            clusterNumbers = {}
            for i, clu in enumerate(cluster, 1):
                orderCluster[i] = [min(clu)[1],max(clu)[1]]
                for point in clu:
                    clusterNumbers.setdefault(tuple(point), []).append(i)

            # The number of rows to be committed for each interval.
            # NOTE(review): always passed as 0 - confirm what
            # UpdateEnrichedData expects here.
            committedInterval=0
            for row in rows:
                for nc in clusterNumbers.get((0,row[5]), ()):
                    newRow = list(row)
                    newRow.append(nc)
                    # update the trend type into Database
                    UpdateEnrichedData(con, committedInterval, newRow)
                    finalClusterRecord.append(newRow)
            con.commit()
            finalOrderCluster[stock] = orderCluster
            # Same text as the former ``print stock, " Done"`` statement.
            print("%s %s" % (stock, " Done"))

        # Write the type range into a file
        trendRangeFile = common.get_configuration("model", "TREND_RANGE_FILE")
        dataStr = json.dumps(finalOrderCluster)
        with open(trendRangeFile,"w") as output:
            output.write(dataStr)

        # Write the training data into file
        trendSetRecordFile = common.get_configuration("training", "TRAINING_TREND_RECORDS")
        dataStr = json.dumps(finalClusterRecord)
        with open(trendSetRecordFile,"w") as output:
            output.write(dataStr)
    finally:
        # Release the connection even when a query or a file write fails.
        if con:
            con.close()
Ejemplo n.º 21
0
def get_data(inf='ders1.txt', nclusters=3):
    """Load ``(root, vals)`` records from a file and cluster the ``vals``.

    Every line of ``inf`` must be a Python expression that evaluates to a
    two-element sequence ``(root, vals)``.  ``vals`` has to be hashable
    because it is used as a dictionary key.

    Args:
        inf: path of the input file.
        nclusters: number of clusters requested from KMeansClustering.

    Returns:
        A tuple ``(clusters, dct)``: the result of ``getclusters(nclusters)``
        over all ``vals`` in file order, and a dict mapping each ``vals`` to
        its ``root`` (for repeated ``vals`` the last line wins).
    """
    dct = {}
    data = []
    # ``with`` closes the file even when a line fails to evaluate.
    with open(inf) as infile:
        for line in infile:
            # SECURITY: eval() executes arbitrary code found in the file.
            # Only use this with trusted input; if the lines hold plain
            # literals, ast.literal_eval would be the safe replacement.
            root, vals = eval(line)
            dct[vals] = root
            data.append(vals)
    cl = KMeansClustering(data)
    return cl.getclusters(nclusters), dct
Ejemplo n.º 22
0
 def testPointDoubling(self):
     """test for bug #1604868

     Sixteen points in two groups are split into two clusters; the result is
     compared with the expected grouping through ``compare_list``.
     """
     upper_group = [(18, 13), (15, 12), (17, 12), (18, 12), (19, 12),
                    (16, 11), (18, 11), (19, 10)]
     lower_group = [(0, 0), (1, 4), (1, 2), (2, 3), (4, 1), (4, 3), (5, 2),
                    (6, 1)]
     data = upper_group + lower_group
     clusters = KMeansClustering(data).getclusters(2)

     # The second expected cluster lists the points in a different order
     # than the input does.
     expected = [list(upper_group),
                 [(0, 0), (1, 4), (1, 2), (2, 3), (4, 1), (5, 2), (6, 1),
                  (4, 3)]]
     agrees = compare_list(clusters, expected)
     self.assertTrue(agrees,
                     "Elements differ!\n%s\n%s" % (clusters, expected))
Ejemplo n.º 23
0
def doClusterByWords(k, question):
    """Cluster the ideas of a question by their bag-of-words vectors.

    The vectors produced by computeBagsOfWords are clustered into ``k``
    groups. All existing clusters of the question are deleted, one Cluster
    object is created per group, every idea is assigned to its cluster, and
    finally the question's tags and cluster assignments are deleted.

    Returns:
        A list with one ``{"name": ..., "ideas": [...]}`` dict per cluster,
        where ``ideas`` holds the ``toDict()`` form of each assigned idea.

    Any exception raised along the way propagates to the caller.
    """
    clusteredIdeas = []
    try:
        vectors, texts, phrases, ids = computeBagsOfWords(question)
        clusterData = KMeansClustering(vectors).getclusters(k)
        clusters = clusterData["clusters"]
        ideaIndices = clusterData["indices"]

        def assignMember(clusterNum, position, clusterObj):
            # Resolve the idea sitting at `position` of cluster `clusterNum`,
            # attach it to the cluster and hand back its text pair and dict.
            index = ideaIndices[clusterNum][position]
            text = texts[index]
            phrase = phrases[index]
            idea_id = ids[index]
            idea = Idea.assignCluster(idea_id, clusterObj)
            return [text, phrase], idea.toDict()

        # Delete existing clusters from database (including those created by other algorithms)
        Cluster.deleteAllClusters(question)

        for clusterNum, cluster in enumerate(clusters):
            clusterName = "Cluster #" + str(clusterNum + 1)
            clusterObj = Cluster.createCluster(
                clusterName, clusterNum, question, CLUSTER_BY_WORDS
            )
            entry = []
            ideas = []
            if type(cluster) is tuple:
                # Cluster may only have a single tuple instead of a collection of them
                _pair, ideaDict = assignMember(clusterNum, 0, clusterObj)
                ideas.append(ideaDict)
            else:
                for position, _vector in enumerate(cluster):
                    pair, ideaDict = assignMember(clusterNum, position, clusterObj)
                    entry.append(pair)
                    ideas.append(ideaDict)
            clusteredIdeas.append({"name": clusterObj.text, "ideas": ideas})

        # Clean up any existing tags and cluster assignments since clusters have been reformed
        ClusterTag.deleteAllTags(question)
        ClusterAssignment.deleteAllClusterAssignments(question)

    except:
        # The error is re-raised untouched; the partial result is discarded.
        clusteredIdeas = []
        raise

    return clusteredIdeas
Ejemplo n.º 24
0
 def testPointDoubling(self):
     """test for bug #1604868

     Splits sixteen 2-D points into two clusters and compares the outcome
     with the expected grouping by means of compare_list.
     """
     points = [
         (18, 13), (15, 12), (17, 12), (18, 12), (19, 12), (16, 11),
         (18, 11), (19, 10), (0, 0), (1, 4), (1, 2), (2, 3), (4, 1),
         (4, 3), (5, 2), (6, 1),
     ]
     first_cluster = [(18, 13), (15, 12), (17, 12), (18, 12), (19, 12),
                      (16, 11), (18, 11), (19, 10)]
     second_cluster = [(0, 0), (1, 4), (1, 2), (2, 3), (4, 1), (5, 2),
                       (6, 1), (4, 3)]
     wanted = [first_cluster, second_cluster]
     found = KMeansClustering(points).getclusters(2)
     message = "Elements differ!\n%s\n%s" % (found, wanted)
     self.assertTrue(compare_list(found, wanted), message)
Ejemplo n.º 25
0
 def Kmeanscluster(self, frontiers, no_robots):
     """Cluster the frontiers into ``no_robots`` groups, one Cluster each.

     Resets ``self.freeclusters`` and fills it with one unoccupied
     Cluster.Cluster per group, built from the first two components of the
     value centroidnp returns for that group.

     Returns:
         ``(self.freeclusters, clusters)`` where ``clusters`` is the raw
         result of ``getclusters(no_robots)``.
     """
     from cluster import KMeansClustering
     self.freeclusters = []
     clusters = KMeansClustering(frontiers).getclusters(no_robots)
     for robot in range(no_robots):
         members = clusters[robot]
         center = centroidnp(len(members), members)
         free_slot = Cluster.Cluster(center[0], center[1])
         free_slot.occupied = False
         self.freeclusters.append(free_slot)
     return self.freeclusters, clusters
Ejemplo n.º 26
0
 def testNonsenseCluster(self):
     """
     Test that asking for more clusters than data-items available raises an
     error
     """
     def absolute_gap(x, y):
         # One-dimensional distance between two numbers.
         return abs(x - y)

     clusterer = KMeansClustering([876, 123], distance=absolute_gap)
     # Two items cannot be split into five clusters.
     with self.assertRaises(ClusteringError):
         clusterer.getclusters(5)
Ejemplo n.º 27
0
    def get_clusters(self, input_list, number_of_clusters=3):
        """Group scalar values with k-means and return them sorted.

        Each value is wrapped as a ``(0, value)`` point before clustering and
        unwrapped again afterwards.

        Returns:
            A list of clusters. Every cluster is sorted from highest to
            lowest value, and the clusters themselves are ordered by their
            highest value, descending.
        """
        # Wrap every scalar as a 2-tuple for the clusterer.
        points = [(0, value) for value in input_list]
        grouped = KMeansClustering(points).getclusters(number_of_clusters)

        # Unwrap and order each cluster from highest to lowest.
        unwrapped = [
            sorted((point[1] for point in group), reverse=True)
            for group in grouped
        ]

        # Order the clusters by their largest member (the first element).
        unwrapped.sort(key=lambda values: values[0], reverse=True)
        return unwrapped
Ejemplo n.º 28
0
    def Kmeanscluster(self, frontiers, no_robots):
        """Cluster the frontiers into ``no_robots`` groups, one Cluster each.

        Resets ``self.freeclusters`` and appends one unoccupied
        Cluster.Cluster per group, positioned at the first two components of
        the centroid that centroidnp computes for the group.

        Returns:
            ``(self.freeclusters, clusters)``; entries of ``clusters`` that
            came back as a bare tuple have been wrapped in a list in place.
        """
        from cluster import KMeansClustering
        self.freeclusters = []
        clusters = KMeansClustering(frontiers).getclusters(no_robots)

        for robot in range(no_robots):
            # centroidnp only accepts a list, so a lone tuple gets wrapped.
            if type(clusters[robot]) is tuple:
                clusters[robot] = [clusters[robot]]
            center = centroidnp(clusters[robot])
            free_slot = Cluster.Cluster(center[0], center[1])
            free_slot.occupied = False
            self.freeclusters.append(free_slot)

        return self.freeclusters, clusters
Ejemplo n.º 29
0
def cluster_trajectories():
    """Cluster DJs by their yearly chart positions and print the groups.

    Loads ``static/datasets/dj-mag-top-100.json``, builds for every DJ a
    tuple holding the DJ's list index for each year from 1997 through 2013
    (-999 for years in which the DJ is absent), clusters those tuples into
    10 groups and prints the groups of DJ names as JSON.
    """
    # Rank used for the years in which a DJ does not appear in the list.
    missing_rank = -999

    # Load the data set and release the file handle right away; it is not
    # needed during clustering and is closed even if parsing fails.
    with open('static/datasets/dj-mag-top-100.json') as json_data:
        data = json.load(json_data)

    years = [str(year) for year in range(1997, 2014)]

    uniques_djs = set()
    for year in years:
        uniques_djs.update(data[year])

    dj_vectors = []
    dj_vector_map = {}

    for dj in uniques_djs:
        trajectory = tuple(
            data[year].index(dj) if dj in data[year] else missing_rank
            for year in years
        )
        dj_vectors.append(trajectory)
        # NOTE(review): two DJs with identical trajectories share one key, so
        # the later DJ replaces the earlier one here and is then reported
        # once per occurrence of the vector -- confirm whether that can happen.
        dj_vector_map[trajectory] = dj

    cl = KMeansClustering(dj_vectors)
    clusters = cl.getclusters(10)

    dj_clusters = [
        [dj_vector_map[vector] for vector in cluster]
        for cluster in clusters
    ]

    # Single-argument call form prints identically on Python 2 and 3.
    print(json.dumps(dj_clusters))
Ejemplo n.º 30
0
 def testClusterLen0(self):
     """Testing if clustering an empty set, returns an empty set

     The same empty clusterer is queried for 2 and for 7 clusters.
     """
     clusterer = KMeansClustering([])
     for requested in (2, 7):
         self.assertEqual([], clusterer.getclusters(requested))
Ejemplo n.º 31
0
 def testClusterLen1(self):
     """Testing that a search space of length 1 returns only one cluster

     The single-item clusterer is queried for 2 and for 5 clusters.
     """
     clusterer = KMeansClustering([876])
     for requested in (2, 5):
         self.assertEqual([876], clusterer.getclusters(requested))
Ejemplo n.º 32
0
 def testNumpyRandom(self):
     # Smoke test: clustering a random 500x2 numpy array into 10 clusters
     # must complete without raising; nothing is asserted about the result.
     def squared_distance(p0, p1):
         # Squared Euclidean distance between two 2-D points.
         dx = p0[0] - p1[0]
         dy = p0[1] - p1[1]
         return dx ** 2 + dy ** 2

     samples = numpy.random.rand(500, 2)
     clusterer = KMeansClustering(samples, squared_distance,
                                  numpy.array_equal)
     clusterer.getclusters(10)
Ejemplo n.º 33
0
 def testClusterLen0(self):
     """Testing if clustering an empty set, returns an empty set

     Checked for a request of 2 clusters and a request of 7 clusters.
     """
     empty_clusterer = KMeansClustering([])
     for wanted_count in (2, 7):
         outcome = empty_clusterer.getclusters(wanted_count)
         self.assertEqual([], outcome)
    ((lat, lon), f) = coords_freqs[label]
    expanded_coords.append((label, [(lon, lat)] * f))  # Flip lat/lon for Google Earth

# No need to clutter the map with unnecessary placemarks: each label gets a
# single placemark at the first of its (repeated) coordinates.

kml_items = [{'label': label, 'coords': '%s,%s' % coords[0]}
             for (label, coords) in expanded_coords]

# It could also be interesting to include names of your contacts on the map
# for display: first name plus the initial of the last name, one per line.

for item in kml_items:
    item['contacts'] = '\n'.join(['%s %s.' % (ec.first_name, ec.last_name[0])
                                 for ec in extended_connections
                                 if ec.location == item['label']])

# Cluster every individual coordinate (labels repeated by frequency) and add
# one CENTROID placemark per cluster.

cl = KMeansClustering([coords for (label, coords_list) in expanded_coords
                      for coords in coords_list])

centroids = [{'label': 'CENTROID', 'coords': '%s,%s' % centroid(c)}
             for c in cl.getclusters(K)]

kml_items.extend(centroids)
kml = createKML(kml_items)

if not os.path.isdir('out'):
    os.mkdir('out')

# The context manager closes the file even if the write fails.
with open("out/" + OUT, 'w') as f:
    f.write(kml)

# Written directly to the stream so the line behaves the same on Python 2
# and Python 3.
sys.stderr.write('Data pickled to out/' + OUT + '\n')
Ejemplo n.º 35
0
def doClusterBySimilarity(k, question, includeUnclustered=False):
    """Cluster the ideas of a question by user-reported similarity.

    One binary vector is built per idea in the similarity dict (1 where the
    recorded pair count is positive, or where no count is recorded and the
    two keys are the same idea; 0 otherwise). The vectors are clustered into
    ``k`` groups, the question's existing clusters are deleted, one Cluster
    object is created per group with its ideas assigned, and the question's
    tags and cluster assignments are deleted.

    Args:
        k: Number of clusters to request.
        question: Question whose ideas are clustered.
        includeUnclustered: When true, ideas of the question that are absent
            from the similarity dict are appended as an "Unclustered" group.

    Returns:
        A list of ``{"name": ..., "ideas": [...]}`` dicts.

    Raises:
        ClusteringError: re-raised unchanged when it occurs during clustering.
    """
    clusteredIdeas = []
    similarityDict = createSimilarityDict(question)
    if similarityDict:
        # create array of tuples containing similarity counts for each item pair
        # (e.g., # of users who said item1 and item2 were the same)
        sortedKeys = sorted(similarityDict.iterkeys())
        rowIds = []
        countVectors = []
        for rowKey in sortedKeys:
            rowEntry = similarityDict[rowKey]
            knownCounts = rowEntry["counts"]
            flags = []
            for colKey in sortedKeys:
                # recorded count if present; otherwise 1 for the idea itself
                # and 0 for a pair never marked as similar
                # TODO: for k-means clustering, what value should be used when rowKey == colKey
                if colKey in knownCounts:
                    raw = knownCounts[colKey]
                elif rowKey == colKey:
                    raw = 1
                else:
                    raw = 0
                # only "similar at least once" matters, not how often
                flags.append(1 if raw > 0 else 0)

            rowIds.append(rowEntry["idea"]["id"])
            countVectors.append(tuple(flags))

        try:
            clusterData = KMeansClustering(countVectors).getclusters(k)
            clusters = clusterData["clusters"]
            ideaIndices = clusterData["indices"]

            # Delete existing clusters from database (including those created by other algorithms)
            Cluster.deleteAllClusters(question)

            for clusterNum, cluster in enumerate(clusters):
                clusterName = "Cluster #" + str(clusterNum + 1)
                clusterObj = Cluster.createCluster(
                    clusterName, clusterNum, question, CLUSTER_BY_SIMILARITY
                )
                ideas = []
                for position, _vector in enumerate(cluster):
                    # map the member's position back to its row, then to its idea id
                    rowIndex = ideaIndices[clusterNum][position]
                    assigned = Idea.assignCluster(rowIds[rowIndex], clusterObj)
                    ideas.append(assigned.toDict())

                clusteredIdeas.append({"name": clusterObj.text, "ideas": ideas})

            # Clean up any existing tags and cluster assignments since clusters have been reformed
            ClusterTag.deleteAllTags(question)
            ClusterAssignment.deleteAllClusterAssignments(question)

        except ClusteringError:
            # The error is re-raised untouched; the partial result is discarded.
            clusteredIdeas = []
            raise

    # TODO: need more efficient way to get unclustered ideas
    if includeUnclustered:
        compared = dict(
            (key, similarityDict[key]["idea"]) for key in similarityDict
        )
        unclusteredIdeas = [
            idea.toDict()
            for idea in Idea.all().filter("question =", question)
            if str(idea.key().id()) not in compared
        ]
        if unclusteredIdeas:
            clusteredIdeas.append({"name": "Unclustered", "ideas": unclusteredIdeas})

    return clusteredIdeas
Ejemplo n.º 36
0
def clusterSet(traningEndDate):
    con = common.getDBConnection()
    cur = con.cursor()

    finalClusterRecord = []
    stockList = [
        "MERVAL", "MEXBOL", "CHILE65", "BVPSBVPS", "COLCAP", "CRSMBCT", "IBOV",
        "IGBVL"
    ]
    finalOrderCluster = {}
    for stock in stockList:
        sql = "select embers_id,sub_sequence,date,last_price,one_day_change,round(one_day_change/(last_price-one_day_change),4),stock_index from t_daily_stockindex where stock_index=? and date<=?"
        cur.execute(sql, (stock, traningEndDate))
        rows = cur.fetchall()
        changes = [row[5] for row in rows]
        fdist = nltk.FreqDist(changes)
        clusterS = [(0, x) for x in fdist.keys()]

        print "StartTime: ", datetime.strftime(datetime.now(),
                                               "%Y-%m-%d %H:%M:%S")
        c1 = KMeansClustering(clusterS)
        print "MiddleTime: ", datetime.strftime(datetime.now(),
                                                "%Y-%m-%d %H:%M:%S")
        cluster = c1.getclusters(20)
        #        cluster = [[(0, 0.0862), (0, 0.088), (0, 0.0914), (0, 0.094), (0, 0.0957), (0, 0.097), (0, 0.1017), (0, 0.1024), (0, 0.0774), (0, 0.0882), (0, 0.0783), (0, 0.11), (0, 0.0807), (0, 0.0813), (0, 0.1367), (0, 0.0831), (0, 0.0836), (0, 0.0855), (0, 0.0879), (0, 0.0912), (0, 0.0763), (0, 0.1046), (0, 0.0784), (0, 0.0815), (0, 0.1464), (0, 0.1987), (0, 0.1053), (0, 0.1101), (0, 0.1176), (0, 0.0868), (0, 0.1342), (0, 0.1466), (0, 0.0761), (0, 0.0772)], [(0, -0.0001), (0, 0.0), (0, 0.0001), (0, -0.0002), (0, -0.0003), (0, -0.0004), (0, -0.0005), (0, -0.0006), (0, 0.0002), (0, 0.0003), (0, 0.0004), (0, 0.0005), (0, 0.0006), (0, 0.0007), (0, 0.0008), (0, 0.0009), (0, 0.001), (0, 0.0011), (0, 0.0012), (0, 0.0013), (0, 0.0014), (0, 0.0015), (0, 0.0016), (0, 0.0017), (0, 0.0018), (0, 0.0019), (0, 0.002), (0, 0.0021), (0, 0.0022), (0, 0.0023), (0, 0.0024), (0, 0.0025), (0, 0.0026), (0, 0.0027), (0, 0.0028), (0, 0.0029), (0, 0.003), (0, 0.0031), (0, 0.0032), (0, 0.0033), (0, 0.0034), (0, 0.0035), (0, 0.0036), (0, 0.0037), (0, 0.0038), (0, 0.0039), (0, 0.004), (0, 0.0041), (0, 0.0042), (0, 0.0043), (0, 0.0044), (0, 0.0045), (0, 0.0046), (0, 0.0047), (0, 0.0048), (0, 0.0049), (0, 0.005), (0, -0.0007), (0, -0.0008)], [(0, 0.0297), (0, 0.0296), (0, 0.0298), (0, 0.0299), (0, 0.0301), (0, 0.03), (0, 0.0303), (0, 0.0302), (0, 0.0304), (0, 0.0305), (0, 0.0306), (0, 0.0308), (0, 0.0307), (0, 0.0309), (0, 0.031), (0, 0.0311), (0, 0.0313), (0, 0.0314), (0, 0.0312), (0, 0.0316), (0, 0.0315), (0, 0.0317), (0, 0.0318), (0, 0.032), (0, 0.0319), (0, 0.0322), (0, 0.0321), (0, 0.0324), (0, 0.0323), (0, 0.0326), (0, 0.0325), (0, 0.0328), (0, 0.033), (0, 0.0327), (0, 0.0332), (0, 0.0331), (0, 0.0333), (0, 0.0329), (0, 0.0335), (0, 0.0336), (0, 0.0334), (0, 0.0337), (0, 0.0338), (0, 0.0339), (0, 0.034), (0, 0.0341), (0, 0.0342), (0, 0.0343), (0, 0.0344), (0, 0.0345), (0, 0.0346), (0, 0.0348), (0, 0.0349), (0, 0.035), (0, 0.0351), (0, 0.0352), (0, 0.0355), (0, 0.0356), (0, 0.0358), 
(0, 0.0357), (0, 0.0359), (0, 0.036), (0, 0.0361), (0, 0.0362), (0, 0.0363), (0, 0.0365)], [(0, 0.0559), (0, 0.0564), (0, 0.0568), (0, 0.0571), (0, 0.0573), (0, 0.0579), (0, 0.0578), (0, 0.0581), (0, 0.0587), (0, 0.0589), (0, 0.0595), (0, 0.0591), (0, 0.0594), (0, 0.0604), (0, 0.0598), (0, 0.06), (0, 0.0602), (0, 0.0609), (0, 0.0612), (0, 0.059), (0, 0.0606), (0, 0.0614), (0, 0.0619), (0, 0.0625), (0, 0.0628), (0, 0.0615), (0, 0.0637), (0, 0.0633), (0, 0.0634), (0, 0.0636), (0, 0.0654), (0, 0.0658), (0, 0.0659), (0, 0.0669), (0, 0.0667), (0, 0.0664), (0, 0.067), (0, 0.0675), (0, 0.0673), (0, 0.0676), (0, 0.0686), (0, 0.07), (0, 0.0697), (0, 0.0709), (0, 0.0716), (0, 0.0717), (0, 0.0738), (0, 0.0747)], [(0, -0.0133), (0, -0.0132), (0, -0.0135), (0, -0.0134), (0, -0.0137), (0, -0.0138), (0, -0.0136), (0, -0.014), (0, -0.0139), (0, -0.0142), (0, -0.0143), (0, -0.0144), (0, -0.0141), (0, -0.0145), (0, -0.0146), (0, -0.0147), (0, -0.0148), (0, -0.0149), (0, -0.015), (0, -0.0151), (0, -0.0152), (0, -0.0153), (0, -0.0154), (0, -0.0155), (0, -0.0156), (0, -0.0157), (0, -0.0158), (0, -0.0159), (0, -0.016), (0, -0.0161), (0, -0.0162), (0, -0.0163), (0, -0.0164), (0, -0.0165), (0, -0.0166), (0, -0.0167), (0, -0.0168), (0, -0.0169), (0, -0.017), (0, -0.0171), (0, -0.0172), (0, -0.0173), (0, -0.0174), (0, -0.0175), (0, -0.0176), (0, -0.0177), (0, -0.0178), (0, -0.0179), (0, -0.018), (0, -0.0181), (0, -0.0182), (0, -0.0183), (0, -0.0184), (0, -0.0185), (0, -0.0186), (0, -0.0187), (0, -0.0188), (0, -0.0189), (0, -0.019), (0, -0.0191), (0, -0.0192), (0, -0.0193), (0, -0.0194), (0, -0.0195)], [(0, 0.0448), (0, 0.0451), (0, 0.0452), (0, 0.0446), (0, 0.0447), (0, 0.0456), (0, 0.045), (0, 0.0455), (0, 0.0462), (0, 0.0459), (0, 0.0461), (0, 0.0466), (0, 0.046), (0, 0.0467), (0, 0.0445), (0, 0.0458), (0, 0.0464), (0, 0.0477), (0, 0.0463), (0, 0.0472), (0, 0.0478), (0, 0.0457), (0, 0.0476), (0, 0.0481), (0, 0.0484), (0, 0.0488), (0, 0.0483), (0, 0.0487), (0, 0.0471), (0, 0.0482), (0, 
0.0496), (0, 0.0474), (0, 0.0495), (0, 0.0485), (0, 0.0504), (0, 0.0505), (0, 0.0506), (0, 0.0501), (0, 0.0509), (0, 0.0508), (0, 0.051), (0, 0.0515), (0, 0.0516), (0, 0.052), (0, 0.0522), (0, 0.0524), (0, 0.053), (0, 0.0531), (0, 0.0534), (0, 0.0535), (0, 0.0536), (0, 0.0537), (0, 0.0538), (0, 0.0541), (0, 0.0542), (0, 0.0545), (0, 0.0546), (0, 0.0548), (0, 0.055)], [(0, 0.0172), (0, 0.017), (0, 0.0173), (0, 0.0174), (0, 0.0171), (0, 0.0177), (0, 0.0175), (0, 0.0178), (0, 0.0179), (0, 0.0176), (0, 0.0181), (0, 0.018), (0, 0.0183), (0, 0.0182), (0, 0.0186), (0, 0.0185), (0, 0.0187), (0, 0.0184), (0, 0.0189), (0, 0.0188), (0, 0.019), (0, 0.0191), (0, 0.0192), (0, 0.0194), (0, 0.0193), (0, 0.0196), (0, 0.0195), (0, 0.0197), (0, 0.0199), (0, 0.0198), (0, 0.02), (0, 0.0201), (0, 0.0202), (0, 0.0204), (0, 0.0205), (0, 0.0206), (0, 0.0203), (0, 0.0208), (0, 0.0207), (0, 0.021), (0, 0.0209), (0, 0.0211), (0, 0.0213), (0, 0.0212), (0, 0.0214), (0, 0.0215), (0, 0.0216), (0, 0.0217), (0, 0.0218), (0, 0.0219), (0, 0.022), (0, 0.0221), (0, 0.0222), (0, 0.0223), (0, 0.0224), (0, 0.0225), (0, 0.0226), (0, 0.0227), (0, 0.0228), (0, 0.0229), (0, 0.023), (0, 0.0231)], [(0, -0.0408), (0, -0.041), (0, -0.0411), (0, -0.0412), (0, -0.0413), (0, -0.0415), (0, -0.0416), (0, -0.0417), (0, -0.0419), (0, -0.042), (0, -0.0423), (0, -0.0424), (0, -0.0418), (0, -0.0425), (0, -0.0428), (0, -0.043), (0, -0.0431), (0, -0.0432), (0, -0.0433), (0, -0.0434), (0, -0.0436), (0, -0.0438), (0, -0.0439), (0, -0.044), (0, -0.0442), (0, -0.0441), (0, -0.0446), (0, -0.0443), (0, -0.0448), (0, -0.0447), (0, -0.045), (0, -0.0449), (0, -0.0453), (0, -0.0451), (0, -0.0454), (0, -0.0455), (0, -0.0458), (0, -0.0456), (0, -0.0459), (0, -0.0463), (0, -0.0461), (0, -0.046), (0, -0.0464), (0, -0.0465), (0, -0.0467), (0, -0.0462), (0, -0.0466), (0, -0.0472), (0, -0.0469), (0, -0.0475), (0, -0.0473), (0, -0.0478), (0, -0.0477), (0, -0.0476), (0, -0.0482), (0, -0.0481), (0, -0.0483), (0, -0.0487), (0, -0.0488), (0, 
-0.049), (0, -0.0492), (0, -0.0494)], [(0, -0.0261), (0, -0.0262), (0, -0.0263), (0, -0.0264), (0, -0.0266), (0, -0.0265), (0, -0.0267), (0, -0.0268), (0, -0.0269), (0, -0.0271), (0, -0.027), (0, -0.0273), (0, -0.0272), (0, -0.0275), (0, -0.0274), (0, -0.0277), (0, -0.0278), (0, -0.0276), (0, -0.0279), (0, -0.0281), (0, -0.028), (0, -0.0283), (0, -0.0282), (0, -0.0284), (0, -0.0285), (0, -0.0286), (0, -0.0287), (0, -0.0288), (0, -0.0289), (0, -0.0291), (0, -0.0292), (0, -0.0293), (0, -0.029), (0, -0.0294), (0, -0.0295), (0, -0.0297), (0, -0.0296), (0, -0.0299), (0, -0.03), (0, -0.0301), (0, -0.0302), (0, -0.0298), (0, -0.0303), (0, -0.0304), (0, -0.0307), (0, -0.0305), (0, -0.0308), (0, -0.031), (0, -0.0309), (0, -0.0312), (0, -0.0311), (0, -0.0313), (0, -0.0315), (0, -0.0314), (0, -0.0316), (0, -0.0317), (0, -0.0319), (0, -0.0318), (0, -0.032), (0, -0.0321), (0, -0.0322), (0, -0.0323), (0, -0.0325), (0, -0.0326), (0, -0.0327), (0, -0.0328), (0, -0.0329)], [(0, -0.0619), (0, -0.0622), (0, -0.0627), (0, -0.064), (0, -0.0645), (0, -0.065), (0, -0.0653), (0, -0.0651), (0, -0.0659), (0, -0.0663), (0, -0.0665), (0, -0.066), (0, -0.0666), (0, -0.0674), (0, -0.0671), (0, -0.0684), (0, -0.0672), (0, -0.0691), (0, -0.0689), (0, -0.0692), (0, -0.0701), (0, -0.0698), (0, -0.0709), (0, -0.0715), (0, -0.0717), (0, -0.0722), (0, -0.0734), (0, -0.0741), (0, -0.0749), (0, -0.0763), (0, -0.0772), (0, -0.0758), (0, -0.0762), (0, -0.0787), (0, -0.0788), (0, -0.0759), (0, -0.0775), (0, -0.0808)], [(0, -0.0905), (0, -0.1081), (0, -0.1018), (0, -0.094), (0, -0.0937), (0, -0.0936), (0, -0.0927), (0, -0.0919), (0, -0.0863), (0, -0.1593), (0, -0.1245), (0, -0.0847), (0, -0.1215), (0, -0.1139), (0, -0.1099), (0, -0.1068), (0, -0.0868), (0, -0.0856), (0, -0.0854), (0, -0.0837), (0, -0.0822), (0, -0.0877), (0, -0.1241), (0, -0.1073), (0, -0.1065), (0, -0.1011), (0, -0.0835)], [(0, -0.0196), (0, -0.0198), (0, -0.0197), (0, -0.0199), (0, -0.02), (0, -0.0201), (0, -0.0202), (0, -0.0204), (0, 
-0.0203), (0, -0.0205), (0, -0.0206), (0, -0.0208), (0, -0.0207), (0, -0.021), (0, -0.0209), (0, -0.0212), (0, -0.0211), (0, -0.0214), (0, -0.0215), (0, -0.0213), (0, -0.0217), (0, -0.0216), (0, -0.0219), (0, -0.0218), (0, -0.0221), (0, -0.022), (0, -0.0223), (0, -0.0222), (0, -0.0225), (0, -0.0224), (0, -0.0227), (0, -0.0226), (0, -0.0229), (0, -0.0228), (0, -0.023), (0, -0.0231), (0, -0.0232), (0, -0.0234), (0, -0.0233), (0, -0.0236), (0, -0.0235), (0, -0.0238), (0, -0.0237), (0, -0.024), (0, -0.0239), (0, -0.0242), (0, -0.0241), (0, -0.0244), (0, -0.0243), (0, -0.0245), (0, -0.0246), (0, -0.0247), (0, -0.0248), (0, -0.0249), (0, -0.025), (0, -0.0251), (0, -0.0252), (0, -0.0253), (0, -0.0254), (0, -0.0255), (0, -0.0256), (0, -0.0257), (0, -0.0258), (0, -0.0259), (0, -0.026)], [(0, -0.05), (0, -0.0504), (0, -0.0499), (0, -0.0507), (0, -0.0501), (0, -0.0509), (0, -0.0513), (0, -0.0505), (0, -0.051), (0, -0.0508), (0, -0.0517), (0, -0.0519), (0, -0.0516), (0, -0.052), (0, -0.0524), (0, -0.0525), (0, -0.0526), (0, -0.0528), (0, -0.0529), (0, -0.0533), (0, -0.0538), (0, -0.0535), (0, -0.0532), (0, -0.0542), (0, -0.0543), (0, -0.0546), (0, -0.054), (0, -0.055), (0, -0.0556), (0, -0.0545), (0, -0.056), (0, -0.0554), (0, -0.0567), (0, -0.0563), (0, -0.0571), (0, -0.0572), (0, -0.0576), (0, -0.0579), (0, -0.058), (0, -0.0584), (0, -0.0581), (0, -0.0588), (0, -0.0589), (0, -0.0591), (0, -0.0593), (0, -0.0596), (0, -0.0595), (0, -0.0601), (0, -0.0613), (0, -0.0614)], [(0, -0.001), (0, -0.0012), (0, -0.0017), (0, -0.0016), (0, -0.0013), (0, -0.0011), (0, -0.002), (0, -0.0018), (0, -0.0015), (0, -0.0014), (0, -0.0019), (0, -0.0021), (0, -0.0022), (0, -0.0023), (0, -0.0009), (0, -0.0024), (0, -0.0025), (0, -0.0026), (0, -0.0027), (0, -0.0028), (0, -0.0029), (0, -0.003), (0, -0.0031), (0, -0.0032), (0, -0.0033), (0, -0.0034), (0, -0.0035), (0, -0.0036), (0, -0.0037), (0, -0.0038), (0, -0.0039), (0, -0.004), (0, -0.0041), (0, -0.0042), (0, -0.0043), (0, -0.0044), (0, -0.0045), 
(0, -0.0046), (0, -0.0047), (0, -0.0048), (0, -0.0049), (0, -0.005), (0, -0.0051), (0, -0.0052), (0, -0.0053), (0, -0.0054), (0, -0.0055), (0, -0.0056), (0, -0.0057), (0, -0.0058), (0, -0.0059), (0, -0.006), (0, -0.0061), (0, -0.0062), (0, -0.0063), (0, -0.0064), (0, -0.0065), (0, -0.0066), (0, -0.0067), (0, -0.0068), (0, -0.0069)], [(0, -0.033), (0, -0.0332), (0, -0.0331), (0, -0.0334), (0, -0.0333), (0, -0.0336), (0, -0.0337), (0, -0.0335), (0, -0.0338), (0, -0.034), (0, -0.0339), (0, -0.0342), (0, -0.0343), (0, -0.0341), (0, -0.0344), (0, -0.0345), (0, -0.0346), (0, -0.0347), (0, -0.0348), (0, -0.035), (0, -0.0349), (0, -0.0351), (0, -0.0352), (0, -0.0353), (0, -0.0354), (0, -0.0355), (0, -0.0357), (0, -0.0356), (0, -0.0358), (0, -0.0359), (0, -0.0361), (0, -0.036), (0, -0.0363), (0, -0.0362), (0, -0.0365), (0, -0.0366), (0, -0.0364), (0, -0.0368), (0, -0.0369), (0, -0.0372), (0, -0.0371), (0, -0.0367), (0, -0.0375), (0, -0.0373), (0, -0.0376), (0, -0.0374), (0, -0.0378), (0, -0.038), (0, -0.0379), (0, -0.0377), (0, -0.0382), (0, -0.0384), (0, -0.0383), (0, -0.0386), (0, -0.0381), (0, -0.0387), (0, -0.0389), (0, -0.0385), (0, -0.039), (0, -0.0391), (0, -0.0388), (0, -0.0392), (0, -0.0395), (0, -0.0393), (0, -0.0397), (0, -0.0398), (0, -0.0396), (0, -0.0399), (0, -0.0402), (0, -0.0401), (0, -0.0403), (0, -0.0406), (0, -0.0407)], [(0, 0.0232), (0, 0.0233), (0, 0.0234), (0, 0.0235), (0, 0.0237), (0, 0.0236), (0, 0.0238), (0, 0.0239), (0, 0.024), (0, 0.0241), (0, 0.0242), (0, 0.0243), (0, 0.0244), (0, 0.0245), (0, 0.0247), (0, 0.0248), (0, 0.0246), (0, 0.0249), (0, 0.025), (0, 0.0251), (0, 0.0253), (0, 0.0252), (0, 0.0255), (0, 0.0254), (0, 0.0257), (0, 0.0256), (0, 0.0259), (0, 0.026), (0, 0.0258), (0, 0.0261), (0, 0.0262), (0, 0.0264), (0, 0.0265), (0, 0.0263), (0, 0.0267), (0, 0.0268), (0, 0.0266), (0, 0.027), (0, 0.0269), (0, 0.0271), (0, 0.0272), (0, 0.0274), (0, 0.0273), (0, 0.0276), (0, 0.0275), (0, 0.0277), (0, 0.0278), (0, 0.0279), (0, 0.0281), (0, 0.0282), 
(0, 0.0283), (0, 0.0284), (0, 0.0285), (0, 0.0286), (0, 0.0287), (0, 0.0288), (0, 0.0289), (0, 0.029), (0, 0.0291), (0, 0.0292), (0, 0.0293), (0, 0.0294)], [(0, 0.011), (0, 0.0112), (0, 0.0113), (0, 0.0111), (0, 0.0115), (0, 0.0114), (0, 0.0117), (0, 0.0116), (0, 0.0118), (0, 0.0119), (0, 0.0121), (0, 0.0122), (0, 0.0123), (0, 0.0124), (0, 0.012), (0, 0.0126), (0, 0.0125), (0, 0.0128), (0, 0.0127), (0, 0.013), (0, 0.0129), (0, 0.0131), (0, 0.0133), (0, 0.0132), (0, 0.0135), (0, 0.0134), (0, 0.0136), (0, 0.0137), (0, 0.0138), (0, 0.014), (0, 0.0139), (0, 0.0142), (0, 0.0141), (0, 0.0143), (0, 0.0144), (0, 0.0145), (0, 0.0146), (0, 0.0147), (0, 0.0148), (0, 0.0149), (0, 0.015), (0, 0.0151), (0, 0.0153), (0, 0.0152), (0, 0.0154), (0, 0.0155), (0, 0.0156), (0, 0.0157), (0, 0.0158), (0, 0.0159), (0, 0.016), (0, 0.0161), (0, 0.0162), (0, 0.0163), (0, 0.0164), (0, 0.0165), (0, 0.0166), (0, 0.0167), (0, 0.0168), (0, 0.0169)], [(0, -0.007), (0, -0.0071), (0, -0.0072), (0, -0.0073), (0, -0.0074), (0, -0.0075), (0, -0.0076), (0, -0.0077), (0, -0.0078), (0, -0.0079), (0, -0.0081), (0, -0.008), (0, -0.0082), (0, -0.0083), (0, -0.0084), (0, -0.0085), (0, -0.0086), (0, -0.0087), (0, -0.0088), (0, -0.0089), (0, -0.009), (0, -0.0091), (0, -0.0092), (0, -0.0093), (0, -0.0094), (0, -0.0095), (0, -0.0096), (0, -0.0097), (0, -0.0098), (0, -0.0099), (0, -0.01), (0, -0.0101), (0, -0.0102), (0, -0.0103), (0, -0.0104), (0, -0.0105), (0, -0.0106), (0, -0.0107), (0, -0.0108), (0, -0.0109), (0, -0.011), (0, -0.0111), (0, -0.0112), (0, -0.0113), (0, -0.0114), (0, -0.0115), (0, -0.0116), (0, -0.0117), (0, -0.0118), (0, -0.0119), (0, -0.012), (0, -0.0121), (0, -0.0122), (0, -0.0123), (0, -0.0124), (0, -0.0125), (0, -0.0126), (0, -0.0127), (0, -0.0128), (0, -0.0129), (0, -0.013), (0, -0.0131)], [(0, 0.0051), (0, 0.0052), (0, 0.0053), (0, 0.0055), (0, 0.0054), (0, 0.0057), (0, 0.0056), (0, 0.0059), (0, 0.0058), (0, 0.0061), (0, 0.006), (0, 0.0062), (0, 0.0063), (0, 0.0064), (0, 0.0065), (0, 
0.0066), (0, 0.0068), (0, 0.0069), (0, 0.0067), (0, 0.007), (0, 0.0072), (0, 0.0071), (0, 0.0073), (0, 0.0074), (0, 0.0075), (0, 0.0076), (0, 0.0077), (0, 0.0078), (0, 0.0079), (0, 0.008), (0, 0.0081), (0, 0.0082), (0, 0.0083), (0, 0.0084), (0, 0.0085), (0, 0.0086), (0, 0.0087), (0, 0.0088), (0, 0.0089), (0, 0.009), (0, 0.0091), (0, 0.0092), (0, 0.0093), (0, 0.0094), (0, 0.0095), (0, 0.0096), (0, 0.0097), (0, 0.0098), (0, 0.0099), (0, 0.01), (0, 0.0101), (0, 0.0102), (0, 0.0103), (0, 0.0104), (0, 0.0105), (0, 0.0106), (0, 0.0107), (0, 0.0108), (0, 0.0109)], [(0, 0.0369), (0, 0.0371), (0, 0.0367), (0, 0.037), (0, 0.0375), (0, 0.0373), (0, 0.0376), (0, 0.0372), (0, 0.0377), (0, 0.038), (0, 0.0379), (0, 0.0374), (0, 0.0381), (0, 0.0382), (0, 0.0378), (0, 0.0384), (0, 0.0386), (0, 0.0387), (0, 0.0385), (0, 0.0389), (0, 0.0391), (0, 0.039), (0, 0.0392), (0, 0.0394), (0, 0.0395), (0, 0.0396), (0, 0.0398), (0, 0.0399), (0, 0.04), (0, 0.0401), (0, 0.0404), (0, 0.0405), (0, 0.0406), (0, 0.0407), (0, 0.0408), (0, 0.0409), (0, 0.041), (0, 0.0411), (0, 0.0412), (0, 0.0414), (0, 0.0415), (0, 0.0416), (0, 0.0417), (0, 0.0419), (0, 0.042), (0, 0.0421), (0, 0.0422), (0, 0.0426), (0, 0.0428), (0, 0.0427), (0, 0.043), (0, 0.0429), (0, 0.0431), (0, 0.0433), (0, 0.0434), (0, 0.0435), (0, 0.0436), (0, 0.0438), (0, 0.0437), (0, 0.044), (0, 0.0442), (0, 0.0444)]]
        print "EndTime: ", datetime.strftime(datetime.now(),
                                             "%Y-%m-%d %H:%M:%S")
        namedCluster = {}
        i = 0
        orderCluster = {}
        for clu in cluster:
            i = i + 1
            namedCluster[i] = clu
            orderCluster[i] = [min(clu)[1], max(clu)[1]]

        for m in orderCluster:
            min1 = orderCluster[m][0]
            max1 = orderCluster[m][1]
            for n in orderCluster:
                min2 = orderCluster[n][0]
                max2 = orderCluster[n][1]
                if (min1 > min2 and min1 < max2) or (max1 > min2
                                                     and max1 < max2):
                    print m, " intersect with ", n, " values: ", min1, max1, min2, max2

        clusterR = []
        for row in rows:
            for nc in namedCluster:
                if (0, row[5]) in namedCluster[nc]:
                    newRow = list(row)
                    newRow.append(nc)
                    clusterR.append(newRow)
                    finalClusterRecord.append(newRow)

        #insert the clusterR into Database
        insertSql = "insert into t_daily_enrichedIndex (embers_id,derived_from,sub_sequence,stock_index,date,last_price,one_day_change,change_percent,trend_type)values (?,?,?,?,?,?,?,?,?)"
        m = 0
        for j in clusterR:
            contentStr = json.dumps(j)
            embersId = hashlib.sha1(contentStr).hexdigest()
            derivedFrom = "[" + str(j[0]) + "]"
            subsequenceId = j[1]
            postDate = j[2]
            lastPrice = j[3]
            oneDayChange = j[4]
            changePercent = j[5]
            stockIndex = j[6]
            trendType = j[7]
            cur.execute(
                insertSql,
                (embersId, derivedFrom, subsequenceId, stockIndex, postDate,
                 lastPrice, oneDayChange, changePercent, trendType))
            m = m + 1
            if m % 1000 == 0:
                con.commit()
        con.commit()
        finalOrderCluster[stock] = orderCluster

    "Write the type range into a file"
    trendRangeFile = common.get_configuration("model", "TREND_RANGE_FILE")
    dataStr = json.dumps(finalOrderCluster)
    with open(trendRangeFile, "w") as output:
        output.write(dataStr)

    "Write the training data into file"
    trendSetRecordFile = common.get_configuration("model",
                                                  "TRAINING_TREND_RECORDS")
    dataStr = json.dumps(finalClusterRecord)
    with open(trendSetRecordFile, "w") as output:
        output.write(dataStr)

    if con:
        con.close()
Ejemplo n.º 37
0
# Number of clusters requested from KMeansClustering below.
CLUSTER_COUNT = 3
# Running count of sampled tweets whose user has a location set.
i = 0
for trend in mainTrend:
    # NOTE(review): the list is reset for every trend, but it is only read
    # after this loop ends, so only the last trend's locations are dumped and
    # clustered below -- confirm whether the code after the loop was meant to
    # be indented into it.
    location_list = list()
    for tweet in tweepy.Cursor(api.search,
                               q=trend['query']).items(TWEET_SAMPLE_SIZE):
        if tweet.user.location:
            print(i)
            i += 1
            try:
                location = geocoder.geocode(tweet.user.location)
                location_list.append({
                    "lat": location.latitude,
                    "lon": location.longitude
                })
            except Exception as e:
                # Best-effort sampling: report the failure and keep going.
                print("An exception occurred: ")
                print(e)

# Dump the collected locations for inspection.
with open('out.txt', 'w') as f:
    print(location_list, file=f)

cluster = KMeansClustering([(loc['lat'], loc['lon']) for loc in location_list])
centroids = [centroid(c) for c in cluster.getclusters(CLUSTER_COUNT)]

kml_clusters = simplekml.Kml()
for i, c in enumerate(centroids):
    # Centroids are (lat, lon); the point is created with (lon, lat).
    kml_clusters.newpoint(name='Cluster {}'.format(i), coords=[(c[1], c[0])])
# Write the file once, after all points are added, instead of rewriting it on
# every iteration. As before, nothing is written when there are no centroids.
if centroids:
    kml_clusters.save('{}.kml'.format(trend['query']))
Ejemplo n.º 38
0
    ((lat, lon), f) = coords_freqs[label]
    expanded_coords.append((label, [(lon, lat)] * f))  # Flip lat/lon for Google Earth

# No need to clutter the map with unnecessary placemarks: each label gets a
# single placemark at the first of its (repeated) coordinates.

kml_items = [{'label': label, 'coords': '%s,%s' % coords[0]}
             for (label, coords) in expanded_coords]

# It could also be interesting to include names of your contacts on the map
# for display: first name plus the initial of the last name, one per line.

for item in kml_items:
    item['contacts'] = '\n'.join(['%s %s.' % (ec.first_name, ec.last_name[0])
                                  for ec in extended_connections
                                  if ec['location'] == item['label']])

# Cluster every individual coordinate (labels repeated by frequency) and add
# one CENTROID placemark per cluster.

cl = KMeansClustering([coords for (label, coords_list) in expanded_coords
                       for coords in coords_list])

centroids = [{'label': 'CENTROID', 'coords': '%s,%s' % centroid(c)}
             for c in cl.getclusters(K)]

kml_items.extend(centroids)
kml = createKML(kml_items)

if not os.path.isdir('out'):
    os.mkdir('out')

# The context manager closes the file even if the write fails.
with open("out/" + OUT, 'w') as f:
    f.write(kml)

# Written directly to the stream so the line behaves the same on Python 2
# and Python 3.
sys.stderr.write('Data pickled to out/' + OUT + '\n')
Ejemplo n.º 39
0
# Number of result directories processed so far.
num = 0

# Collecting attributes of each result dir
for item in os.listdir(path):
	# os.listdir() yields bare names, so they must be joined with `path`;
	# testing the bare name only worked when `path` was the working directory.
	item_dir = os.path.join(path, item)
	if os.path.isdir(item_dir):
		data = os.path.join(item_dir, "cluster_attr.pacc")
		# SECURITY: eval() executes whatever expression the first line of the
		# file contains -- only run this on trusted result directories.
		with open(data, 'r') as attr_file:
			content = eval(attr_file.readline())
		clst_i.append(content)
		# The second data set only receives the entries seen after more than
		# LTT_NU directories have been processed.
		if num > LTT_NU:
			clst_i_2.append(content)
		num = num + 1
		table[item] = content


# Use the requested cluster count when cl_param is a number, otherwise 2
# (an empty string is not a digit string, so it falls back to 2 as well).
n_clusters = int(cl_param) if cl_param.isdigit() else 2

# Starting cluster analysis process
cl = KMeansClustering(clst_i)
clusters = cl.getclusters(n_clusters)

# Starting cluster analysis process -- syscall only
cl_2 = KMeansClustering(clst_i_2)
clusters_2 = cl_2.getclusters(n_clusters)


# Cluster Density 
def add(x,y):
Ejemplo n.º 40
0
 def testClusterLen1(self):
     """A search space holding a single item comes back as the only cluster.

     Checked for requested cluster counts of 2 and 5.
     """
     single_item = [876]
     cl = KMeansClustering(single_item)
     for requested in (2, 5):
         self.assertEqual([876], cl.getclusters(requested))
# Read both inputs through context managers so the handles are closed.
with open("cat_less.txt", "r") as categories_file:
	f_categories = categories_file.read()
with open("data_less.txt", "r") as json_file:
	f_json = json_file.read()

json_data = json.loads(f_json)

categories = f_categories.split("\n")
# NOTE(review): this list is never cleared, so every category is clustered
# together with the points of all categories processed before it -- confirm
# that this accumulation is intended.
k_means_list = []


# category = "Advertising Agencies"
for category in categories:
	if not category:
		# Blank line in the categories file (e.g. after a trailing newline).
		continue
	try:
		if not json_data[category]:
			continue
		for cat in json_data[category]:
			# "latlon" is a "<lat>,<lon>" string; keep both parts as floats.
			parts = cat["latlon"].split(",")
			k_means_list.append((float(parts[0]), float(parts[1])))

		cl = KMeansClustering(k_means_list)
		clusters = cl.getclusters(12)
		# One output line per cluster, each point written as "lat,lon ".
		with open("./Output/" + category, "w") as cluster_file:
			for cluster in clusters:
				cluster_file.write(
					"".join(str(tup[0]) + "," + str(tup[1]) + " " for tup in cluster))
				cluster_file.write("\n")
		print(category + " Done")
	except Exception as e:
		# Best effort: one bad category must not stop the batch, but report
		# why it was skipped instead of hiding the error.
		print(category + " Failed: " + repr(e))
Ejemplo n.º 42
0
def clusterSet(traningEndDate): 
    con = common.getDBConnection()
    cur = con.cursor()
    
    finalClusterRecord = []
    stockList = ["MERVAL","MEXBOL","CHILE65","BVPSBVPS","COLCAP","CRSMBCT","IBOV","IGBVL"]
    finalOrderCluster = {}
    for stock in stockList:
        sql = "select embers_id,sub_sequence,date,last_price,one_day_change,round(one_day_change/(last_price-one_day_change),4),stock_index from t_daily_stockindex where stock_index=? and date<=?"
        cur.execute(sql,(stock,traningEndDate))
        rows = cur.fetchall()
        changes = [row[5] for row in rows]
        fdist = nltk.FreqDist(changes)
        clusterS = [(0,x) for x in fdist.keys()]
        
        print "StartTime: ",datetime.strftime(datetime.now(),"%Y-%m-%d %H:%M:%S")
        c1 = KMeansClustering(clusterS)
        print "MiddleTime: ",datetime.strftime(datetime.now(),"%Y-%m-%d %H:%M:%S")
        cluster = c1.getclusters(20)
#        cluster = [[(0, 0.0862), (0, 0.088), (0, 0.0914), (0, 0.094), (0, 0.0957), (0, 0.097), (0, 0.1017), (0, 0.1024), (0, 0.0774), (0, 0.0882), (0, 0.0783), (0, 0.11), (0, 0.0807), (0, 0.0813), (0, 0.1367), (0, 0.0831), (0, 0.0836), (0, 0.0855), (0, 0.0879), (0, 0.0912), (0, 0.0763), (0, 0.1046), (0, 0.0784), (0, 0.0815), (0, 0.1464), (0, 0.1987), (0, 0.1053), (0, 0.1101), (0, 0.1176), (0, 0.0868), (0, 0.1342), (0, 0.1466), (0, 0.0761), (0, 0.0772)], [(0, -0.0001), (0, 0.0), (0, 0.0001), (0, -0.0002), (0, -0.0003), (0, -0.0004), (0, -0.0005), (0, -0.0006), (0, 0.0002), (0, 0.0003), (0, 0.0004), (0, 0.0005), (0, 0.0006), (0, 0.0007), (0, 0.0008), (0, 0.0009), (0, 0.001), (0, 0.0011), (0, 0.0012), (0, 0.0013), (0, 0.0014), (0, 0.0015), (0, 0.0016), (0, 0.0017), (0, 0.0018), (0, 0.0019), (0, 0.002), (0, 0.0021), (0, 0.0022), (0, 0.0023), (0, 0.0024), (0, 0.0025), (0, 0.0026), (0, 0.0027), (0, 0.0028), (0, 0.0029), (0, 0.003), (0, 0.0031), (0, 0.0032), (0, 0.0033), (0, 0.0034), (0, 0.0035), (0, 0.0036), (0, 0.0037), (0, 0.0038), (0, 0.0039), (0, 0.004), (0, 0.0041), (0, 0.0042), (0, 0.0043), (0, 0.0044), (0, 0.0045), (0, 0.0046), (0, 0.0047), (0, 0.0048), (0, 0.0049), (0, 0.005), (0, -0.0007), (0, -0.0008)], [(0, 0.0297), (0, 0.0296), (0, 0.0298), (0, 0.0299), (0, 0.0301), (0, 0.03), (0, 0.0303), (0, 0.0302), (0, 0.0304), (0, 0.0305), (0, 0.0306), (0, 0.0308), (0, 0.0307), (0, 0.0309), (0, 0.031), (0, 0.0311), (0, 0.0313), (0, 0.0314), (0, 0.0312), (0, 0.0316), (0, 0.0315), (0, 0.0317), (0, 0.0318), (0, 0.032), (0, 0.0319), (0, 0.0322), (0, 0.0321), (0, 0.0324), (0, 0.0323), (0, 0.0326), (0, 0.0325), (0, 0.0328), (0, 0.033), (0, 0.0327), (0, 0.0332), (0, 0.0331), (0, 0.0333), (0, 0.0329), (0, 0.0335), (0, 0.0336), (0, 0.0334), (0, 0.0337), (0, 0.0338), (0, 0.0339), (0, 0.034), (0, 0.0341), (0, 0.0342), (0, 0.0343), (0, 0.0344), (0, 0.0345), (0, 0.0346), (0, 0.0348), (0, 0.0349), (0, 0.035), (0, 0.0351), (0, 0.0352), (0, 0.0355), (0, 0.0356), (0, 0.0358), (0, 
0.0357), (0, 0.0359), (0, 0.036), (0, 0.0361), (0, 0.0362), (0, 0.0363), (0, 0.0365)], [(0, 0.0559), (0, 0.0564), (0, 0.0568), (0, 0.0571), (0, 0.0573), (0, 0.0579), (0, 0.0578), (0, 0.0581), (0, 0.0587), (0, 0.0589), (0, 0.0595), (0, 0.0591), (0, 0.0594), (0, 0.0604), (0, 0.0598), (0, 0.06), (0, 0.0602), (0, 0.0609), (0, 0.0612), (0, 0.059), (0, 0.0606), (0, 0.0614), (0, 0.0619), (0, 0.0625), (0, 0.0628), (0, 0.0615), (0, 0.0637), (0, 0.0633), (0, 0.0634), (0, 0.0636), (0, 0.0654), (0, 0.0658), (0, 0.0659), (0, 0.0669), (0, 0.0667), (0, 0.0664), (0, 0.067), (0, 0.0675), (0, 0.0673), (0, 0.0676), (0, 0.0686), (0, 0.07), (0, 0.0697), (0, 0.0709), (0, 0.0716), (0, 0.0717), (0, 0.0738), (0, 0.0747)], [(0, -0.0133), (0, -0.0132), (0, -0.0135), (0, -0.0134), (0, -0.0137), (0, -0.0138), (0, -0.0136), (0, -0.014), (0, -0.0139), (0, -0.0142), (0, -0.0143), (0, -0.0144), (0, -0.0141), (0, -0.0145), (0, -0.0146), (0, -0.0147), (0, -0.0148), (0, -0.0149), (0, -0.015), (0, -0.0151), (0, -0.0152), (0, -0.0153), (0, -0.0154), (0, -0.0155), (0, -0.0156), (0, -0.0157), (0, -0.0158), (0, -0.0159), (0, -0.016), (0, -0.0161), (0, -0.0162), (0, -0.0163), (0, -0.0164), (0, -0.0165), (0, -0.0166), (0, -0.0167), (0, -0.0168), (0, -0.0169), (0, -0.017), (0, -0.0171), (0, -0.0172), (0, -0.0173), (0, -0.0174), (0, -0.0175), (0, -0.0176), (0, -0.0177), (0, -0.0178), (0, -0.0179), (0, -0.018), (0, -0.0181), (0, -0.0182), (0, -0.0183), (0, -0.0184), (0, -0.0185), (0, -0.0186), (0, -0.0187), (0, -0.0188), (0, -0.0189), (0, -0.019), (0, -0.0191), (0, -0.0192), (0, -0.0193), (0, -0.0194), (0, -0.0195)], [(0, 0.0448), (0, 0.0451), (0, 0.0452), (0, 0.0446), (0, 0.0447), (0, 0.0456), (0, 0.045), (0, 0.0455), (0, 0.0462), (0, 0.0459), (0, 0.0461), (0, 0.0466), (0, 0.046), (0, 0.0467), (0, 0.0445), (0, 0.0458), (0, 0.0464), (0, 0.0477), (0, 0.0463), (0, 0.0472), (0, 0.0478), (0, 0.0457), (0, 0.0476), (0, 0.0481), (0, 0.0484), (0, 0.0488), (0, 0.0483), (0, 0.0487), (0, 0.0471), (0, 0.0482), (0, 
0.0496), (0, 0.0474), (0, 0.0495), (0, 0.0485), (0, 0.0504), (0, 0.0505), (0, 0.0506), (0, 0.0501), (0, 0.0509), (0, 0.0508), (0, 0.051), (0, 0.0515), (0, 0.0516), (0, 0.052), (0, 0.0522), (0, 0.0524), (0, 0.053), (0, 0.0531), (0, 0.0534), (0, 0.0535), (0, 0.0536), (0, 0.0537), (0, 0.0538), (0, 0.0541), (0, 0.0542), (0, 0.0545), (0, 0.0546), (0, 0.0548), (0, 0.055)], [(0, 0.0172), (0, 0.017), (0, 0.0173), (0, 0.0174), (0, 0.0171), (0, 0.0177), (0, 0.0175), (0, 0.0178), (0, 0.0179), (0, 0.0176), (0, 0.0181), (0, 0.018), (0, 0.0183), (0, 0.0182), (0, 0.0186), (0, 0.0185), (0, 0.0187), (0, 0.0184), (0, 0.0189), (0, 0.0188), (0, 0.019), (0, 0.0191), (0, 0.0192), (0, 0.0194), (0, 0.0193), (0, 0.0196), (0, 0.0195), (0, 0.0197), (0, 0.0199), (0, 0.0198), (0, 0.02), (0, 0.0201), (0, 0.0202), (0, 0.0204), (0, 0.0205), (0, 0.0206), (0, 0.0203), (0, 0.0208), (0, 0.0207), (0, 0.021), (0, 0.0209), (0, 0.0211), (0, 0.0213), (0, 0.0212), (0, 0.0214), (0, 0.0215), (0, 0.0216), (0, 0.0217), (0, 0.0218), (0, 0.0219), (0, 0.022), (0, 0.0221), (0, 0.0222), (0, 0.0223), (0, 0.0224), (0, 0.0225), (0, 0.0226), (0, 0.0227), (0, 0.0228), (0, 0.0229), (0, 0.023), (0, 0.0231)], [(0, -0.0408), (0, -0.041), (0, -0.0411), (0, -0.0412), (0, -0.0413), (0, -0.0415), (0, -0.0416), (0, -0.0417), (0, -0.0419), (0, -0.042), (0, -0.0423), (0, -0.0424), (0, -0.0418), (0, -0.0425), (0, -0.0428), (0, -0.043), (0, -0.0431), (0, -0.0432), (0, -0.0433), (0, -0.0434), (0, -0.0436), (0, -0.0438), (0, -0.0439), (0, -0.044), (0, -0.0442), (0, -0.0441), (0, -0.0446), (0, -0.0443), (0, -0.0448), (0, -0.0447), (0, -0.045), (0, -0.0449), (0, -0.0453), (0, -0.0451), (0, -0.0454), (0, -0.0455), (0, -0.0458), (0, -0.0456), (0, -0.0459), (0, -0.0463), (0, -0.0461), (0, -0.046), (0, -0.0464), (0, -0.0465), (0, -0.0467), (0, -0.0462), (0, -0.0466), (0, -0.0472), (0, -0.0469), (0, -0.0475), (0, -0.0473), (0, -0.0478), (0, -0.0477), (0, -0.0476), (0, -0.0482), (0, -0.0481), (0, -0.0483), (0, -0.0487), (0, -0.0488), (0, 
-0.049), (0, -0.0492), (0, -0.0494)], [(0, -0.0261), (0, -0.0262), (0, -0.0263), (0, -0.0264), (0, -0.0266), (0, -0.0265), (0, -0.0267), (0, -0.0268), (0, -0.0269), (0, -0.0271), (0, -0.027), (0, -0.0273), (0, -0.0272), (0, -0.0275), (0, -0.0274), (0, -0.0277), (0, -0.0278), (0, -0.0276), (0, -0.0279), (0, -0.0281), (0, -0.028), (0, -0.0283), (0, -0.0282), (0, -0.0284), (0, -0.0285), (0, -0.0286), (0, -0.0287), (0, -0.0288), (0, -0.0289), (0, -0.0291), (0, -0.0292), (0, -0.0293), (0, -0.029), (0, -0.0294), (0, -0.0295), (0, -0.0297), (0, -0.0296), (0, -0.0299), (0, -0.03), (0, -0.0301), (0, -0.0302), (0, -0.0298), (0, -0.0303), (0, -0.0304), (0, -0.0307), (0, -0.0305), (0, -0.0308), (0, -0.031), (0, -0.0309), (0, -0.0312), (0, -0.0311), (0, -0.0313), (0, -0.0315), (0, -0.0314), (0, -0.0316), (0, -0.0317), (0, -0.0319), (0, -0.0318), (0, -0.032), (0, -0.0321), (0, -0.0322), (0, -0.0323), (0, -0.0325), (0, -0.0326), (0, -0.0327), (0, -0.0328), (0, -0.0329)], [(0, -0.0619), (0, -0.0622), (0, -0.0627), (0, -0.064), (0, -0.0645), (0, -0.065), (0, -0.0653), (0, -0.0651), (0, -0.0659), (0, -0.0663), (0, -0.0665), (0, -0.066), (0, -0.0666), (0, -0.0674), (0, -0.0671), (0, -0.0684), (0, -0.0672), (0, -0.0691), (0, -0.0689), (0, -0.0692), (0, -0.0701), (0, -0.0698), (0, -0.0709), (0, -0.0715), (0, -0.0717), (0, -0.0722), (0, -0.0734), (0, -0.0741), (0, -0.0749), (0, -0.0763), (0, -0.0772), (0, -0.0758), (0, -0.0762), (0, -0.0787), (0, -0.0788), (0, -0.0759), (0, -0.0775), (0, -0.0808)], [(0, -0.0905), (0, -0.1081), (0, -0.1018), (0, -0.094), (0, -0.0937), (0, -0.0936), (0, -0.0927), (0, -0.0919), (0, -0.0863), (0, -0.1593), (0, -0.1245), (0, -0.0847), (0, -0.1215), (0, -0.1139), (0, -0.1099), (0, -0.1068), (0, -0.0868), (0, -0.0856), (0, -0.0854), (0, -0.0837), (0, -0.0822), (0, -0.0877), (0, -0.1241), (0, -0.1073), (0, -0.1065), (0, -0.1011), (0, -0.0835)], [(0, -0.0196), (0, -0.0198), (0, -0.0197), (0, -0.0199), (0, -0.02), (0, -0.0201), (0, -0.0202), (0, -0.0204), (0, 
-0.0203), (0, -0.0205), (0, -0.0206), (0, -0.0208), (0, -0.0207), (0, -0.021), (0, -0.0209), (0, -0.0212), (0, -0.0211), (0, -0.0214), (0, -0.0215), (0, -0.0213), (0, -0.0217), (0, -0.0216), (0, -0.0219), (0, -0.0218), (0, -0.0221), (0, -0.022), (0, -0.0223), (0, -0.0222), (0, -0.0225), (0, -0.0224), (0, -0.0227), (0, -0.0226), (0, -0.0229), (0, -0.0228), (0, -0.023), (0, -0.0231), (0, -0.0232), (0, -0.0234), (0, -0.0233), (0, -0.0236), (0, -0.0235), (0, -0.0238), (0, -0.0237), (0, -0.024), (0, -0.0239), (0, -0.0242), (0, -0.0241), (0, -0.0244), (0, -0.0243), (0, -0.0245), (0, -0.0246), (0, -0.0247), (0, -0.0248), (0, -0.0249), (0, -0.025), (0, -0.0251), (0, -0.0252), (0, -0.0253), (0, -0.0254), (0, -0.0255), (0, -0.0256), (0, -0.0257), (0, -0.0258), (0, -0.0259), (0, -0.026)], [(0, -0.05), (0, -0.0504), (0, -0.0499), (0, -0.0507), (0, -0.0501), (0, -0.0509), (0, -0.0513), (0, -0.0505), (0, -0.051), (0, -0.0508), (0, -0.0517), (0, -0.0519), (0, -0.0516), (0, -0.052), (0, -0.0524), (0, -0.0525), (0, -0.0526), (0, -0.0528), (0, -0.0529), (0, -0.0533), (0, -0.0538), (0, -0.0535), (0, -0.0532), (0, -0.0542), (0, -0.0543), (0, -0.0546), (0, -0.054), (0, -0.055), (0, -0.0556), (0, -0.0545), (0, -0.056), (0, -0.0554), (0, -0.0567), (0, -0.0563), (0, -0.0571), (0, -0.0572), (0, -0.0576), (0, -0.0579), (0, -0.058), (0, -0.0584), (0, -0.0581), (0, -0.0588), (0, -0.0589), (0, -0.0591), (0, -0.0593), (0, -0.0596), (0, -0.0595), (0, -0.0601), (0, -0.0613), (0, -0.0614)], [(0, -0.001), (0, -0.0012), (0, -0.0017), (0, -0.0016), (0, -0.0013), (0, -0.0011), (0, -0.002), (0, -0.0018), (0, -0.0015), (0, -0.0014), (0, -0.0019), (0, -0.0021), (0, -0.0022), (0, -0.0023), (0, -0.0009), (0, -0.0024), (0, -0.0025), (0, -0.0026), (0, -0.0027), (0, -0.0028), (0, -0.0029), (0, -0.003), (0, -0.0031), (0, -0.0032), (0, -0.0033), (0, -0.0034), (0, -0.0035), (0, -0.0036), (0, -0.0037), (0, -0.0038), (0, -0.0039), (0, -0.004), (0, -0.0041), (0, -0.0042), (0, -0.0043), (0, -0.0044), (0, -0.0045), 
(0, -0.0046), (0, -0.0047), (0, -0.0048), (0, -0.0049), (0, -0.005), (0, -0.0051), (0, -0.0052), (0, -0.0053), (0, -0.0054), (0, -0.0055), (0, -0.0056), (0, -0.0057), (0, -0.0058), (0, -0.0059), (0, -0.006), (0, -0.0061), (0, -0.0062), (0, -0.0063), (0, -0.0064), (0, -0.0065), (0, -0.0066), (0, -0.0067), (0, -0.0068), (0, -0.0069)], [(0, -0.033), (0, -0.0332), (0, -0.0331), (0, -0.0334), (0, -0.0333), (0, -0.0336), (0, -0.0337), (0, -0.0335), (0, -0.0338), (0, -0.034), (0, -0.0339), (0, -0.0342), (0, -0.0343), (0, -0.0341), (0, -0.0344), (0, -0.0345), (0, -0.0346), (0, -0.0347), (0, -0.0348), (0, -0.035), (0, -0.0349), (0, -0.0351), (0, -0.0352), (0, -0.0353), (0, -0.0354), (0, -0.0355), (0, -0.0357), (0, -0.0356), (0, -0.0358), (0, -0.0359), (0, -0.0361), (0, -0.036), (0, -0.0363), (0, -0.0362), (0, -0.0365), (0, -0.0366), (0, -0.0364), (0, -0.0368), (0, -0.0369), (0, -0.0372), (0, -0.0371), (0, -0.0367), (0, -0.0375), (0, -0.0373), (0, -0.0376), (0, -0.0374), (0, -0.0378), (0, -0.038), (0, -0.0379), (0, -0.0377), (0, -0.0382), (0, -0.0384), (0, -0.0383), (0, -0.0386), (0, -0.0381), (0, -0.0387), (0, -0.0389), (0, -0.0385), (0, -0.039), (0, -0.0391), (0, -0.0388), (0, -0.0392), (0, -0.0395), (0, -0.0393), (0, -0.0397), (0, -0.0398), (0, -0.0396), (0, -0.0399), (0, -0.0402), (0, -0.0401), (0, -0.0403), (0, -0.0406), (0, -0.0407)], [(0, 0.0232), (0, 0.0233), (0, 0.0234), (0, 0.0235), (0, 0.0237), (0, 0.0236), (0, 0.0238), (0, 0.0239), (0, 0.024), (0, 0.0241), (0, 0.0242), (0, 0.0243), (0, 0.0244), (0, 0.0245), (0, 0.0247), (0, 0.0248), (0, 0.0246), (0, 0.0249), (0, 0.025), (0, 0.0251), (0, 0.0253), (0, 0.0252), (0, 0.0255), (0, 0.0254), (0, 0.0257), (0, 0.0256), (0, 0.0259), (0, 0.026), (0, 0.0258), (0, 0.0261), (0, 0.0262), (0, 0.0264), (0, 0.0265), (0, 0.0263), (0, 0.0267), (0, 0.0268), (0, 0.0266), (0, 0.027), (0, 0.0269), (0, 0.0271), (0, 0.0272), (0, 0.0274), (0, 0.0273), (0, 0.0276), (0, 0.0275), (0, 0.0277), (0, 0.0278), (0, 0.0279), (0, 0.0281), (0, 0.0282), 
(0, 0.0283), (0, 0.0284), (0, 0.0285), (0, 0.0286), (0, 0.0287), (0, 0.0288), (0, 0.0289), (0, 0.029), (0, 0.0291), (0, 0.0292), (0, 0.0293), (0, 0.0294)], [(0, 0.011), (0, 0.0112), (0, 0.0113), (0, 0.0111), (0, 0.0115), (0, 0.0114), (0, 0.0117), (0, 0.0116), (0, 0.0118), (0, 0.0119), (0, 0.0121), (0, 0.0122), (0, 0.0123), (0, 0.0124), (0, 0.012), (0, 0.0126), (0, 0.0125), (0, 0.0128), (0, 0.0127), (0, 0.013), (0, 0.0129), (0, 0.0131), (0, 0.0133), (0, 0.0132), (0, 0.0135), (0, 0.0134), (0, 0.0136), (0, 0.0137), (0, 0.0138), (0, 0.014), (0, 0.0139), (0, 0.0142), (0, 0.0141), (0, 0.0143), (0, 0.0144), (0, 0.0145), (0, 0.0146), (0, 0.0147), (0, 0.0148), (0, 0.0149), (0, 0.015), (0, 0.0151), (0, 0.0153), (0, 0.0152), (0, 0.0154), (0, 0.0155), (0, 0.0156), (0, 0.0157), (0, 0.0158), (0, 0.0159), (0, 0.016), (0, 0.0161), (0, 0.0162), (0, 0.0163), (0, 0.0164), (0, 0.0165), (0, 0.0166), (0, 0.0167), (0, 0.0168), (0, 0.0169)], [(0, -0.007), (0, -0.0071), (0, -0.0072), (0, -0.0073), (0, -0.0074), (0, -0.0075), (0, -0.0076), (0, -0.0077), (0, -0.0078), (0, -0.0079), (0, -0.0081), (0, -0.008), (0, -0.0082), (0, -0.0083), (0, -0.0084), (0, -0.0085), (0, -0.0086), (0, -0.0087), (0, -0.0088), (0, -0.0089), (0, -0.009), (0, -0.0091), (0, -0.0092), (0, -0.0093), (0, -0.0094), (0, -0.0095), (0, -0.0096), (0, -0.0097), (0, -0.0098), (0, -0.0099), (0, -0.01), (0, -0.0101), (0, -0.0102), (0, -0.0103), (0, -0.0104), (0, -0.0105), (0, -0.0106), (0, -0.0107), (0, -0.0108), (0, -0.0109), (0, -0.011), (0, -0.0111), (0, -0.0112), (0, -0.0113), (0, -0.0114), (0, -0.0115), (0, -0.0116), (0, -0.0117), (0, -0.0118), (0, -0.0119), (0, -0.012), (0, -0.0121), (0, -0.0122), (0, -0.0123), (0, -0.0124), (0, -0.0125), (0, -0.0126), (0, -0.0127), (0, -0.0128), (0, -0.0129), (0, -0.013), (0, -0.0131)], [(0, 0.0051), (0, 0.0052), (0, 0.0053), (0, 0.0055), (0, 0.0054), (0, 0.0057), (0, 0.0056), (0, 0.0059), (0, 0.0058), (0, 0.0061), (0, 0.006), (0, 0.0062), (0, 0.0063), (0, 0.0064), (0, 0.0065), (0, 
0.0066), (0, 0.0068), (0, 0.0069), (0, 0.0067), (0, 0.007), (0, 0.0072), (0, 0.0071), (0, 0.0073), (0, 0.0074), (0, 0.0075), (0, 0.0076), (0, 0.0077), (0, 0.0078), (0, 0.0079), (0, 0.008), (0, 0.0081), (0, 0.0082), (0, 0.0083), (0, 0.0084), (0, 0.0085), (0, 0.0086), (0, 0.0087), (0, 0.0088), (0, 0.0089), (0, 0.009), (0, 0.0091), (0, 0.0092), (0, 0.0093), (0, 0.0094), (0, 0.0095), (0, 0.0096), (0, 0.0097), (0, 0.0098), (0, 0.0099), (0, 0.01), (0, 0.0101), (0, 0.0102), (0, 0.0103), (0, 0.0104), (0, 0.0105), (0, 0.0106), (0, 0.0107), (0, 0.0108), (0, 0.0109)], [(0, 0.0369), (0, 0.0371), (0, 0.0367), (0, 0.037), (0, 0.0375), (0, 0.0373), (0, 0.0376), (0, 0.0372), (0, 0.0377), (0, 0.038), (0, 0.0379), (0, 0.0374), (0, 0.0381), (0, 0.0382), (0, 0.0378), (0, 0.0384), (0, 0.0386), (0, 0.0387), (0, 0.0385), (0, 0.0389), (0, 0.0391), (0, 0.039), (0, 0.0392), (0, 0.0394), (0, 0.0395), (0, 0.0396), (0, 0.0398), (0, 0.0399), (0, 0.04), (0, 0.0401), (0, 0.0404), (0, 0.0405), (0, 0.0406), (0, 0.0407), (0, 0.0408), (0, 0.0409), (0, 0.041), (0, 0.0411), (0, 0.0412), (0, 0.0414), (0, 0.0415), (0, 0.0416), (0, 0.0417), (0, 0.0419), (0, 0.042), (0, 0.0421), (0, 0.0422), (0, 0.0426), (0, 0.0428), (0, 0.0427), (0, 0.043), (0, 0.0429), (0, 0.0431), (0, 0.0433), (0, 0.0434), (0, 0.0435), (0, 0.0436), (0, 0.0438), (0, 0.0437), (0, 0.044), (0, 0.0442), (0, 0.0444)]]
        print "EndTime: ",datetime.strftime(datetime.now(),"%Y-%m-%d %H:%M:%S")
        namedCluster = {}
        i = 0
        orderCluster = {}
        for clu in cluster:
            i = i + 1
            namedCluster[i] = clu
            orderCluster[i] = [min(clu)[1],max(clu)[1]] 
        
        for m in orderCluster:
            min1 = orderCluster[m][0]
            max1 = orderCluster[m][1]
            for n in orderCluster:
                min2 = orderCluster[n][0]
                max2 = orderCluster[n][1]
                if (min1 > min2 and min1 < max2) or (max1 > min2 and max1 < max2):
                    print m," intersect with ", n, " values: ",min1,max1,min2,max2
        
        clusterR = []
        for row in rows:
            for nc in namedCluster:
                if (0,row[5]) in namedCluster[nc]:
                    newRow = list(row)
                    newRow.append(nc)
                    clusterR.append(newRow)
                    finalClusterRecord.append(newRow)

        #insert the clusterR into Database
        insertSql = "insert into t_daily_enrichedIndex (embers_id,derived_from,sub_sequence,stock_index,date,last_price,one_day_change,change_percent,trend_type)values (?,?,?,?,?,?,?,?,?)"
        m = 0
        for j in clusterR:
            contentStr = json.dumps(j)
            embersId = hashlib.sha1(contentStr).hexdigest()
            derivedFrom = "[" + str(j[0]) + "]"
            subsequenceId = j[1]
            postDate = j[2]
            lastPrice = j[3]
            oneDayChange = j[4]
            changePercent = j[5]
            stockIndex = j[6]
            trendType = j[7]
            cur.execute(insertSql,(embersId,derivedFrom,subsequenceId,stockIndex,postDate,lastPrice,oneDayChange,changePercent,trendType))
            m = m + 1
            if m%1000 == 0:
                con.commit()
        con.commit() 
        finalOrderCluster[stock] = orderCluster
        
    "Write the type range into a file"
    trendRangeFile = common.get_configuration("model", "TREND_RANGE_FILE")
    dataStr = json.dumps(finalOrderCluster)
    with open(trendRangeFile,"w") as output:
        output.write(dataStr)
    
    "Write the training data into file"
    trendSetRecordFile = common.get_configuration("model", "TRAINING_TREND_RECORDS")
    dataStr = json.dumps(finalClusterRecord)
    with open(trendSetRecordFile,"w") as output:
        output.write(dataStr)
    
    if con:
        con.close()
# Read both inputs through context managers so the handles are closed.
with open("cat_less.txt", "r") as categories_file:
    f_categories = categories_file.read()
with open("data_less.txt", "r") as json_file:
    f_json = json_file.read()

json_data = json.loads(f_json)

categories = f_categories.split("\n")
# NOTE(review): this list is never cleared, so every category is clustered
# together with the points of all categories processed before it -- confirm
# that this accumulation is intended.
k_means_list = []

# category = "Advertising Agencies"
for category in categories:
    if not category:
        # Blank line in the categories file (e.g. after a trailing newline).
        continue
    try:
        if not json_data[category]:
            continue
        for cat in json_data[category]:
            # "latlon" is a "<lat>,<lon>" string; keep both parts as floats.
            parts = cat["latlon"].split(",")
            k_means_list.append((float(parts[0]), float(parts[1])))

        cl = KMeansClustering(k_means_list)
        clusters = cl.getclusters(12)
        # One output line per cluster, each point written as "lat,lon ".
        with open("./Output/" + category, "w") as cluster_file:
            for cluster in clusters:
                cluster_file.write(
                    "".join(str(tup[0]) + "," + str(tup[1]) + " " for tup in cluster))
                cluster_file.write("\n")
        print(category + " Done")
    except Exception as e:
        # Best effort: one bad category must not stop the batch, but report
        # why it was skipped instead of hiding the error.
        print(category + " Failed: " + repr(e))
Ejemplo n.º 44
0
    'NetIncomeLoss',
    'OperatingIncomeLoss',
    'PropertyPlantAndEquipmentNet',
    'RetainedEarningsAccumulatedDeficit',
    'StockholdersEquity',
]

# Importing the dataset
dataset_path = os.path.join(os.path.abspath(os.getcwd()), 'output',
                            'xbrl_dataset', '2017.csv')
dataset = pd.read_csv(dataset_path, usecols=cols)
# Missing values are treated as 0.
dataset.fillna(0, inplace=True)
# dataset = dataset.transpose() # 'rotate' 90 degrees
# print(dataset)
# cor = dataset.corr() # Correlation of columns
#
# sns.heatmap(cor, square=True) # Plot the correlation as heat map
# plt.subplots_adjust(bottom=0.2, top=1, left=0.07, right=0.87)
# plt.show()

# Only the first 100 rows are clustered.
wh1 = dataset.head(100)

# fit_transform() returns the standardized values as a new array and leaves
# its input untouched, so the result must be captured; discarding it meant
# the clustering below ran on the raw, unscaled columns.
ss = StandardScaler()
scaled = ss.fit_transform(wh1)

# KMeansClustering is fed one tuple per row.
wh1 = [tuple(x) for x in scaled]

cl = KMeansClustering(wh1)
clusters = cl.getclusters(3)

print(clusters)
Ejemplo n.º 45
0
import json

# Repeat each location's coordinate once per occurrence so that the clustering
# below weights locations by their frequency.
expanded_coords = []
for label in coords_freqs:
    ((lat, lon), f) = coords_freqs[label]
    expanded_coords.append((label, [(lon, lat)] * f))  # flip lat/lon for google earth

# No need to clutter the map with unnecessary placemarks...

kml_items = [{'label': label, 'coords': '%s,%s' % coords[0]}
             for (label, coords) in expanded_coords]

# It could also be interesting to include names of your contacts on the map for display

for item in kml_items:
    item['contacts'] = '\n'.join(['%s %s.' % (ec.first_name, ec.last_name[0])
                                  for ec in extended_connections
                                  if ec.location == item['label']])

cl = KMeansClustering([coords for (label, coords_list) in expanded_coords
                       for coords in coords_list])

# One entry per cluster, positioned at the cluster's centroid.
centroids = [{'label': 'CENTROID', 'coords': '%s,%s' % centroid(c)}
             for c in cl.getclusters(K)]

#kml_items.extend(centroids)
#kml=createKML(kml_items)

if not os.path.isdir('out'):
    os.mkdir('out')

# file.write() accepts only a string; handing it the list of centroid dicts
# raised TypeError. Serialize the list to JSON first, and let the with-block
# close the file even if the write fails.
with open("out/" + OUT, 'w') as f:
    f.write(json.dumps(centroids))

print >> sys.stderr, "Data pickled to out/" +OUT
    
Ejemplo n.º 46
0
#                    MAIN                          #
####################################################
sses = [0] * 10  # SSE metric for each cluster count tried (5, 10, ..., 50)
num_users = 100
numsse = 0  # index of the next free slot in sses
numclusters = 5  # first cluster count to try
max_iterations = 10
start_time = datetime.datetime.now()
while numclusters <= 50:  # one clustering run per cluster count, in steps of 5
    # A fresh set of items (user profiles) is generated for every run.
    users = [createProfile() for _ in range(num_users)]
    print(" inicializing kmeans...")
    cl = KMeansClustering(users, HDdistItems, HDequals)
    print(" executing...", numclusters)
    st = datetime.datetime.now()
    print(st)
    solution = cl.HDgetclusters(numclusters, max_iterations)
    # Print the centroid of each of the first numclusters clusters.
    for i in range(numclusters):
        print(util.HDcentroid(solution[i]), ",")
    st = datetime.datetime.now()

    sses[numsse] = HDcomputeSSE(solution, numclusters)
    numsse += 1
    numclusters += 5
end_time = datetime.datetime.now()
print("start_time:", start_time)
Ejemplo n.º 47
0
from cluster import KMeansClustering
import random
import time
sample_space = [
	11, 21, 31, 41, 51, 61, 71, 81, 91,
	101, 201, 301, 401, 501, 601, 701, 801, 901,
	1001, 2001, 3001, 4001, 5001, 6001, 7001, 8001, 9001,
	10001,
]
# For each sample size, time a 9-cluster k-means run on value - 1 random 2-D points.
for value in sample_space:
	# Point i has both coordinates drawn independently from [0, i).
	a = [(i * random.random(), i * random.random()) for i in range(1, value)]
	start_time = time.time()
	cl = KMeansClustering(a)
	clusters = cl.getclusters(9)
	end_time = time.time()
	elapsed = end_time - start_time
	# len(a) equals the last index generated above, i.e. the number of points.
	print("total time " + str(elapsed) + " secs for " + str(len(a)) + " element")
Ejemplo n.º 48
0
 def testClusterCount(self):
     """Requesting fewer than two clusters must raise ClusteringError.

     Checked for requested cluster counts of 0 and 1.
     """
     def abs_diff(x, y):
         return abs(x - y)

     cl = KMeansClustering([876, 123, 344, 676], distance=abs_diff)
     for too_few in (0, 1):
         self.assertRaises(ClusteringError, cl.getclusters, too_few)