def _clip_to_bbox(trips, xlim, ylim):
    """Return the rows of ``trips`` whose pickup and dropoff lie inside the box.

    ``xlim`` and ``ylim`` are ``[low, high]`` pairs (inclusive) applied to the
    longitude and latitude columns respectively.
    """
    bounds = (
        ("pickup_longitude", xlim),
        ("dropoff_longitude", xlim),
        ("pickup_latitude", ylim),
        ("dropoff_latitude", ylim),
    )
    for column, (low, high) in bounds:
        trips = trips[(trips[column] >= low) & (trips[column] <= high)]
    return trips


def main():
    """Cluster the pickup/dropoff locations of the training trips with DBSCAN.

    Loads the data, keeps only training trips whose pickup and dropoff both
    fall inside a hard-coded longitude/latitude box, stacks pickups and
    dropoffs into one ``lons``/``lats`` frame, samples at most 50000 points
    and hands them to ``dbscan`` (results go to ``../data/CLUSTER/``).
    """
    # Only the training frame feeds the clustering; the test frame is unused.
    train, _ = load_train_and_test()
    xlim = [-74.03, -73.77]
    ylim = [40.6, 40.9]
    columns = ["pickup_longitude", "dropoff_longitude",
               "pickup_latitude", "dropoff_latitude"]
    train_copy = _clip_to_bbox(train[columns].copy(), xlim, ylim)

    # Pickups first, then dropoffs, so lons[i] and lats[i] stay paired.
    train_lons = list(train_copy.pickup_longitude) + list(train_copy.dropoff_longitude)
    train_lats = list(train_copy.pickup_latitude) + list(train_copy.dropoff_latitude)

    # Scale the coordinate *values* by 1000. Multiplying the Python lists
    # (as was done before) repeats them 1000 times and leaves the values
    # unscaled. Presumably this makes eps=1 mean 0.001 degrees -- confirm
    # against the dbscan helper.
    loc_df = pd.DataFrame(
        {"lons": train_lons, "lats": train_lats},
        columns=["lons", "lats"],
    ) * (10 ** 3)

    # sample() raises when asked for more rows than exist, so cap the size.
    loc_df = loc_df.sample(min(50000, len(loc_df)))
    dbscan(loc_df, eps=1, min_samples=100, save_path="../data/CLUSTER/")
def do_work(work_queue, output):
    """Drain ``work_queue``, clustering each record and pickling the prototypes.

    Each queue item ``q`` is a mapping with the keys ``'user'``, ``'record'``
    and ``'out_dir'``.  For every item the record's segments are read,
    clustered with ``cluster.dbscan``, reduced to prototypes with
    ``cluster.prototypeSelection`` and written, gzip-compressed, to
    ``q['out_dir'] + q['record'] + "-prototype-temp.pkl"``.

    The function returns (with ``None``) as soon as the queue is empty.
    ``output`` is accepted for interface compatibility but is not used.
    """
    while True:
        # Only the non-blocking get may signal "queue drained"; an Empty
        # raised by anything further down is a real error and must surface.
        try:
            q = work_queue.get(block=False)
        except Empty:
            break

        print(q['user'])
        t0 = time()

        # The outlier dictionary returned alongside the segments is not needed.
        segments, _outlierDict = readSegments(q['record'])

        # clustering
        res = cluster.dbscan(segments)
        # prototype creation
        resProto = cluster.prototypeSelection(segments, res)

        # save
        out = {
            'centroids': resProto["centroids"],
            'nsamples_in_cluster': resProto["nsamples_in_cluster"],
            'clusters': res["clusters"],
        }
        filename = q['out_dir'] + q['record'] + "-prototype-temp.pkl"
        fd = gzip.open(filename, 'wb')
        try:
            cPickle.dump(out, fd)
        finally:
            # Close the handle even when pickling fails so it is not leaked.
            fd.close()

        print("done in %0.3fs" % (time() - t0))
def customClusterQuestions(docs, algorithm, parameters, removeOutliers=True):
    """
    Customized question clustering methods. Implemented as a knock-off to
    prevent interrupting production builds.

    Parameters:
        docs (list): List of strings (questions)
        algorithm (str): The clustering algorithm that will be used
            ("DBSCAN", "K Means", "Gaussian K Means", "Agglomerative Clustering")
        parameters (list): List of dicts, each with a "param" key (the
            parameter name: "epsilon", "clusters", "strictness" or
            "threshold") and a "value" key.  Entries may come in any order;
            if a name is repeated, its first occurrence is used.
        removeOutliers (bool): A flag to determine whether to remove outliers
            or not (default is True).  Not passed on to DBSCAN.

    Returns:
        The result of the selected ``cluster`` routine, or None when
        ``algorithm`` is not one of the names listed above.

    Raises:
        KeyError: if the parameter required by ``algorithm`` is missing.
    """
    corpus = tagAndVectorizeCorpus(docs)

    # Merge every entry into one lookup table instead of keeping only the
    # first one, so the required parameter is found wherever it sits.
    params = {}
    for entry in parameters:
        params.setdefault(entry["param"], entry["value"])

    if algorithm == "DBSCAN":
        return cluster.dbscan(corpus, float(params["epsilon"]))
    if algorithm == "K Means":
        return cluster.kmeans(corpus, int(params["clusters"]), removeOutliers)
    if algorithm == "Gaussian K Means":
        # Strictness is capped at 4.
        strictness = min(4, int(params["strictness"]))
        return cluster.g_kmeans(corpus, strictness, removeOutliers)
    if algorithm == "Agglomerative Clustering":
        return cluster.agglomerate(corpus, float(params["threshold"]), removeOutliers)
    return None
def generate_clusteredpoints(filepath, methodType, dataIndex, recordList):
    """Read the points stored in ``filepath`` and cluster them.

    ``methodType`` selects the algorithm:

    * ``'kmeans'`` -- k is taken from ``__kmeansk__[dataIndex]`` and the run
      is recorded into ``recordList``.
    * ``'dbscan'`` -- eps and MinPts are taken from ``epsSelect[dataIndex]``
      and ``MinPtsSelect[dataIndex]``; ``recordList`` is not used.

    Returns the ``points`` attribute of the clustering object, or None for
    any other ``methodType`` (the file is still read in that case).
    """
    rawPoints = processtool.read_points_fromfile(filepath)
    points = processtool.generate_plist(rawPoints)

    if methodType == 'kmeans':
        clusterer = kmeans(__kmeansk__[dataIndex], points)
        clusterer.kmeansClusterWithRecord(recordList)
    elif methodType == 'dbscan':
        clusterer = dbscan(epsSelect[dataIndex], MinPtsSelect[dataIndex], points)
        clusterer.dbscanCluster()
    else:
        return None

    return clusterer.points
def sceneEval(inputObjectSet, params=ClusterParams(2, 0.9, 3, 0.05, 0.1, 1, 0, 10)):
    '''
    Propose cluster and line groupings for a scene and pick the best bundle.

    Clusters come from running dbscan over the (position, id) pairs of the
    objects and costing the result; lines come from findChains over the
    whole object set.  Both candidate lists are concatenated and handed to
    bundleSearch (fixed width 10), whose result is printed and returned.
    '''
    positionsAndIds = np.array([(obj.position, obj.id) for obj in inputObjectSet])
    clusterCandidates = clustercost(dbscan(positionsAndIds))
    lineCandidates = findChains(inputObjectSet, params)
    print('***')

    allCandidates = clusterCandidates + lineCandidates
    sceneTuple = cluster_util.totuple(inputObjectSet)
    evali = bundleSearch(sceneTuple, allCandidates, params.allow_intersection, 10)
    print(evali)
    return evali
def sceneEval(inputObjectSet, params=ClusterParams(2, 0.9, 3, 0.05, 0.1, 1, 0, 11, False)):
    '''
    Group the objects of a scene into clusters and lines.

    Steps:
      1. run dbscan over the (position, id) pairs and cost the result;
         clusterCandidates[0] holds the core clusters, clusterCandidates[1]
         the large clusters
      2. if params.attempt_dnc is set, search for lines inside each large
         cluster and drop the members of the core clusters from the set
         used for the general line search
      3. search the (possibly reduced) object set for lines
      4. run bundleSearch over all candidates

    Returns a list with one entry per item of the bundleSearch result: the
    candidate whose uuid equals that item, or None when no candidate has it.
    '''
    reducedObjectSet = copy(inputObjectSet)
    positionsAndIds = np.array([(obj.position, obj.id) for obj in inputObjectSet])
    clusterCandidates = clustercost(dbscan(positionsAndIds))
    coreClusters = clusterCandidates[0]
    largeClusters = clusterCandidates[1]

    innerLines = []
    # NOTE(review): both the inner line search and the core-cluster removal
    # are treated as part of the attempt_dnc branch -- confirm against the
    # intended behaviour.
    if params.attempt_dnc:
        # search for lines inside large clusters
        for members in largeClusters:
            innerObjects = [obj
                            for memberId in members
                            for obj in inputObjectSet
                            if obj.id == memberId]
            innerLines.extend(findChains(innerObjects, params))

        # remove core clusters: filter into place instead of calling
        # remove() on the list while it is being iterated.
        coreIds = [memberId for members in coreClusters for memberId in members]
        reducedObjectSet[:] = [obj for obj in reducedObjectSet
                               if obj.id not in coreIds]

    lineCandidates = findChains(reducedObjectSet, params)
    allCandidates = coreClusters + largeClusters + lineCandidates + innerLines

    groupDictionary = dict((candidate.uuid, candidate) for candidate in allCandidates)

    evali = bundleSearch(cluster_util.totuple(inputObjectSet), allCandidates,
                         params.allow_intersection, params.beam_width)

    # Items of evali without a matching candidate map to None; turning them
    # into singleton groups is not implemented here.
    return [groupDictionary.get(key) for key in evali]
def sceneEval(inputObjectSet, params=ClusterParams(2, 0.9, 3, 0.05, 0.1, 1, 0, 11, False)): ''' find the clusters evaulate the inside of the clusters as lines to see if they'd be better as lines than clusters evaluate the outside of clusters for lines concatenate the lists of clusters and lines evaluate the whole thing with bundle search ''' reducedObjectSet = copy(inputObjectSet) objectDict = dict() for i in inputObjectSet: objectDict[i.id] = i distanceMatrix = cluster_util.create_distance_matrix(inputObjectSet) dbtimestart = time() clusterCandidates = clustercost( dbscan(inputObjectSet, distanceMatrix, objectDict), objectDict) dbtimestop = time() print "dbscan time: \t\t\t", dbtimestop - dbtimestart # print 'clustercandidates',clusterCandidates innerLines = [] #search for lines inside large clusters if params.attempt_dnc == True: insideLineStart = time() for cluster in clusterCandidates[1]: innerObjects = [] for id in cluster: for x in inputObjectSet: if x.id == id: innerObjects.append(x) innerChains = findChains(innerObjects, params) for thing in innerChains: innerLines.append(thing) #remove core clusters for cluster in clusterCandidates[0]: for id in cluster: for x in reducedObjectSet: if x.id == id: reducedObjectSet.remove(x) ReducedDistanceMatrix = cluster_util.create_distance_matrix( reducedObjectSet) insideLineStop = time() print "inside linesearch time:\t\t", insideLineStop - insideLineStart outsideLineStart = time() lineCandidates = findChains(reducedObjectSet, params) outsideLineStop = time() # for i in scene: # groups.append(cluster_util.SingletonBundle([i[0]],1)) #need to implement singletons intelligently. 
print "general linesearch time:\t", outsideLineStop - outsideLineStart allCandidates = clusterCandidates[0] + clusterCandidates[ 1] + lineCandidates + innerLines groupDictionary = dict() for i in allCandidates: groupDictionary[i.uuid] = i for i in inputObjectSet: groupDictionary[i.uuid] = cluster_util.SingletonBundle([i.id], 1, i.uuid) bundleStart = time() evali = bundleSearch(inputObjectSet, allCandidates, params.allow_intersection, params.beam_width) bundleStop = time() print "bundlesearch time: \t\t", bundleStop - bundleStart #find the things in evali that aren't in the dictionary ,and make a singleton group out of them, and add it to the output #what the heck am i doing here? physicalobjects = [] for i in evali: try: physicalobjects.append(groupDictionary.get(i)) except: print "not in dictionary" output = map(lambda x: groupDictionary.get(x), evali) # print 'costs', map(lambda x: x.cost,output) return output
def sceneEval(inputObjectSet,params = ClusterParams(2,0.9,3,0.05,0.1,1,1,11,False)): ''' find the clusters evaulate the inside of the clusters as lines to see if they'd be better as lines than clusters evaluate the outside of clusters for lines concatenate the lists of clusters and lines evaluate the whole thing with bundle search ''' print "*",inputObjectSet reducedObjectSet = copy(inputObjectSet) objectDict = dict() for i in inputObjectSet: objectDict[i.uuid]=i distanceMatrix = cluster_util.create_distance_matrix(inputObjectSet) dbtimestart = time() clusterCandidates = clustercost(dbscan(inputObjectSet,distanceMatrix,objectDict),objectDict) dbtimestop = time() print "dbscan time: \t\t\t", dbtimestop-dbtimestart innerLines = [] #search for lines inside large clusters if params.attempt_dnc==True: insideLineStart= time() for cluster in clusterCandidates[1]: innerObjects = [] for id in cluster: for x in inputObjectSet: if x.id == id: innerObjects.append(x) innerChains = findChains(innerObjects,params) for thing in innerChains: innerLines.append(thing) #remove core clusters for cluster in clusterCandidates[0]: for id in cluster: for x in reducedObjectSet: if x.id == id: reducedObjectSet.remove(x) ReducedDistanceMatrix = cluster_util.create_distance_matrix(reducedObjectSet) insideLineStop = time() print "inside linesearch time:\t\t",insideLineStop-insideLineStart outsideLineStart = time() lineCandidates = findChains(reducedObjectSet,params,objectDict) outsideLineStop = time() print "general linesearch time:\t",outsideLineStop-outsideLineStart allCandidates = clusterCandidates[0]+clusterCandidates[1] + lineCandidates + innerLines allClusters = clusterCandidates[0]+clusterCandidates[1] allLines = lineCandidates + innerLines groupDictionary = dict() for i in allCandidates: groupDictionary[i.uuid]=i for i in inputObjectSet: groupDictionary[i.uuid]=i lineBundleStart = time() bestLines = bundleSearch(inputObjectSet, allLines, params.allow_intersection, params.beam_width) 
lineBundleStop = time() print "linebundle time: \t\t", lineBundleStop-lineBundleStart clusterBundleStart = time() bestClusters = bundleSearch(inputObjectSet, allClusters, params.allow_intersection, params.beam_width) print params.allow_intersection clusterBundleStop = time() print "clusterbundle time: \t\t", clusterBundleStop-clusterBundleStart bundleStart = time() evali = [] try: evali = evali + bestLines except: print "there aren't any lines." try: evali = evali + bestClusters except: print "there aren't any clusterss." print "evali",evali output = map(lambda x: groupDictionary.get(x),evali) bundleStop = time() print "bundlesearch cleanup time: \t",bundleStop-bundleStart print "output",output return output
def sceneEval(inputObjectSet,params = ClusterParams(2,0.9,3,0.05,0.1,1,0,11,False)): ''' find the clusters evaulate the inside of the clusters as lines to see if they'd be better as lines than clusters evaluate the outside of clusters for lines concatenate the lists of clusters and lines evaluate the whole thing with bundle search ''' reducedObjectSet = copy(inputObjectSet) objectDict = dict() for i in inputObjectSet: objectDict[i.id]=i distanceMatrix = cluster_util.create_distance_matrix(inputObjectSet) dbtimestart = time() clusterCandidates = clustercost(dbscan(inputObjectSet,distanceMatrix,objectDict),objectDict) dbtimestop = time() print "dbscan time: \t\t\t", dbtimestop-dbtimestart # print 'clustercandidates',clusterCandidates innerLines = [] #search for lines inside large clusters if params.attempt_dnc==True: insideLineStart= time() for cluster in clusterCandidates[1]: innerObjects = [] for id in cluster: for x in inputObjectSet: if x.id == id: innerObjects.append(x) innerChains = findChains(innerObjects,params) for thing in innerChains: innerLines.append(thing) #remove core clusters for cluster in clusterCandidates[0]: for id in cluster: for x in reducedObjectSet: if x.id == id: reducedObjectSet.remove(x) ReducedDistanceMatrix = cluster_util.create_distance_matrix(reducedObjectSet) insideLineStop = time() print "inside linesearch time:\t\t",insideLineStop-insideLineStart outsideLineStart = time() lineCandidates = findChains(reducedObjectSet,params) outsideLineStop = time() # for i in scene: # groups.append(cluster_util.SingletonBundle([i[0]],1)) #need to implement singletons intelligently. 
print "general linesearch time:\t",outsideLineStop-outsideLineStart allCandidates = clusterCandidates[0]+clusterCandidates[1] + lineCandidates + innerLines groupDictionary = dict() for i in allCandidates: groupDictionary[i.uuid]=i for i in inputObjectSet: groupDictionary[i.uuid]=cluster_util.SingletonBundle([i.id],1,i.uuid) bundleStart = time() evali = bundleSearch(inputObjectSet, allCandidates, params.allow_intersection, params.beam_width) bundleStop = time() print "bundlesearch time: \t\t",bundleStop-bundleStart #find the things in evali that aren't in the dictionary ,and make a singleton group out of them, and add it to the output #what the heck am i doing here? physicalobjects = [] for i in evali: try: physicalobjects.append(groupDictionary.get(i)) except: print "not in dictionary" output = map(lambda x: groupDictionary.get(x),evali) # print 'costs', map(lambda x: x.cost,output) return output