Python dbscan Beispiele, cluster.dbscan Python Beispiele

Beispiel #1

0

Datei anzeigen

Datei: figure.py Projekt: pratvi123/kaggle-TaxiTripDuration

def _clip_to_bbox(df, xlim, ylim):
    """Return the rows of *df* whose pickup and dropoff both lie in the box.

    xlim and ylim are (min, max) pairs for longitude and latitude; both
    bounds are inclusive.
    """
    inside = (
        df.pickup_longitude.between(xlim[0], xlim[1])
        & df.dropoff_longitude.between(xlim[0], xlim[1])
        & df.pickup_latitude.between(ylim[0], ylim[1])
        & df.dropoff_latitude.between(ylim[0], ylim[1])
    )
    return df[inside]


def main():
    """Run dbscan on a random sample of in-bounds training coordinates.

    Loads the training set, keeps the trips whose pickup and dropoff both
    fall inside the longitude/latitude box, stacks pickup and dropoff
    coordinates into one "lons"/"lats" frame scaled by 10 ** 3, samples at
    most 50000 rows and hands them to dbscan, which is given
    "../data/CLUSTER/" as its save path.
    """
    xlim = [-74.03, -73.77]
    ylim = [40.6, 40.9]
    scale = 10 ** 3
    sample_size = 50000

    # The test split is loaded by the helper but plays no part in clustering.
    train, _test = load_train_and_test()
    columns = ["pickup_longitude", "dropoff_longitude", "pickup_latitude", "dropoff_latitude"]
    trips = _clip_to_bbox(train[columns], xlim, ylim)

    # Multiply as Series so the values themselves are scaled; multiplying a
    # plain Python list by an int would repeat the list `scale` times instead.
    loc_df = pd.DataFrame()
    loc_df["lons"] = pd.concat([trips.pickup_longitude, trips.dropoff_longitude], ignore_index=True) * scale
    loc_df["lats"] = pd.concat([trips.pickup_latitude, trips.dropoff_latitude], ignore_index=True) * scale

    # DataFrame.sample raises when asked for more rows than exist.
    loc_df = loc_df.sample(min(sample_size, len(loc_df)))
    dbscan(loc_df, eps=1, min_samples=100, save_path="../data/CLUSTER/")

Beispiel #2

0

Datei anzeigen

def do_work(work_queue, output):
    """Drain *work_queue*, clustering each record and pickling its prototypes.

    Each queue item is indexed with the keys 'user', 'record' and 'out_dir'.
    For every item the record's segments are read, clustered with
    cluster.dbscan, reduced to prototypes with cluster.prototypeSelection,
    and the result is written gzip-compressed to
    <out_dir><record>-prototype-temp.pkl.

    work_queue -- queue supporting get(block=False) that raises Empty when
                  nothing is left
    output     -- not used by this function

    Returns None once the queue is empty.
    """
    while True:
        # Keep the try this narrow: only the non-blocking get should be able
        # to end the loop, not an Empty escaping from one of the helpers.
        try:
            q = work_queue.get(block=False)
        except Empty:
            break

        print(q['user'])
        t0 = time()
        (segments, outlierDict) = readSegments(q['record'])

        # clustering
        res = cluster.dbscan(segments)

        # prototype creation
        resProto = cluster.prototypeSelection(segments, res)

        out = {
            'centroids': resProto["centroids"],
            'nsamples_in_cluster': resProto["nsamples_in_cluster"],
            'clusters': res["clusters"]
        }

        filename = q['out_dir'] + q['record'] + "-prototype-temp.pkl"

        # Close the gzip handle even when pickling fails.
        fd = gzip.open(filename, 'wb')
        try:
            cPickle.dump(out, fd)
        finally:
            fd.close()
        print("done in %0.3fs" % (time() - t0))

Beispiel #3

0

Datei anzeigen

Datei: analysis.py Projekt: BeagleLearning/beagleNLP

def customClusterQuestions(docs, algorithm, parameters, removeOutliers=True):
    """
    Customized question clustering methods. Implemented as a knock-off to
    avoid interfering with production builds.

    Parameters:
        docs (list): List of strings (questions)
        algorithm (str): The clustering algorithm that will be used
            (DBSCAN, K Means, Gaussian K Means, Agglomerative Clustering)
        parameters (list): Mappings with a "param" key (the parameter name:
            epsilon, clusters, strictness, threshold) and a "value" key.
            When a name appears more than once, the first entry wins.
        removeOutliers (bool): A flag to determine whether to remove outliers
            or not (default is True). Not forwarded to DBSCAN.

    Returns:
        Whatever the selected cluster.* routine returns for the vectorized
        corpus, or None when `algorithm` is not one of the four names above.

    Raises:
        KeyError: if the parameter required by `algorithm` is not supplied.
    """

    corpus = tagAndVectorizeCorpus(docs)

    # Collect every supplied parameter so the one the algorithm needs is
    # found regardless of its position in the list.
    params = {}
    for entry in parameters:
        params.setdefault(entry["param"], entry["value"])

    if algorithm == "DBSCAN":
        return cluster.dbscan(corpus, float(params["epsilon"]))

    if algorithm == "K Means":
        return cluster.kmeans(corpus, int(params["clusters"]), removeOutliers)

    if algorithm == "Gaussian K Means":
        # The strictness handed to g_kmeans is capped at 4.
        strictness = min(4, int(params["strictness"]))
        return cluster.g_kmeans(corpus, strictness, removeOutliers)

    if algorithm == "Agglomerative Clustering":
        return cluster.agglomerate(corpus, float(params["threshold"]), removeOutliers)

    return None

Beispiel #4

0

Datei anzeigen

 def generate_clusteredpoints(filepath, methodType, dataIndex, recordList):
     """Read points from *filepath*, cluster them and return the clustered points.

     methodType selects the algorithm: 'kmeans' (k taken from
     __kmeansk__[dataIndex], run with recordList) or 'dbscan' (eps and MinPts
     taken from epsSelect[dataIndex] and MinPtsSelect[dataIndex]).
     Any other methodType yields None; the file is still read in that case.
     """
     plist = processtool.generate_plist(processtool.read_points_fromfile(filepath))

     if methodType == 'kmeans':
         km = kmeans(__kmeansk__[dataIndex], plist)
         km.kmeansClusterWithRecord(recordList)
         return km.points

     if methodType == 'dbscan':
         db = dbscan(epsSelect[dataIndex], MinPtsSelect[dataIndex], plist)
         db.dbscanCluster()
         return db.points

     return None

Beispiel #5

0

Datei anzeigen

Datei: sceneEval.py Projekt: colinwinslow/bolt_chainfinder

def sceneEval(inputObjectSet,params = ClusterParams(2,0.9,3,0.05,0.1,1,0,10)):
    '''
    Evaluate a scene by bundle-searching over cluster and line candidates.

    The (position, id) pairs of the input objects are clustered with dbscan
    and costed with clustercost; findChains supplies the line candidates.
    Both candidate lists are concatenated and handed to bundleSearch with
    params.allow_intersection and a fixed width argument of 10.

    Prints '***' and the search result, then returns that result.
    '''
    positionsAndIds = [(obj.position, obj.id) for obj in inputObjectSet]
    clusterCandidates = clustercost(dbscan(np.array(positionsAndIds)))
    lineCandidates = findChains(inputObjectSet, params)
    print('***')

    candidates = clusterCandidates + lineCandidates
    sceneTuple = cluster_util.totuple(inputObjectSet)
    evali = bundleSearch(sceneTuple, candidates, params.allow_intersection, 10)
    print(evali)
    return evali

Beispiel #6

0

Datei anzeigen

Datei: SceneEval.py Projekt: marcovzla/bolt

def sceneEval(inputObjectSet,params = ClusterParams(2,0.9,3,0.05,0.1,1,0,11,False)):
    '''
    Choose the best grouping of the scene's objects into clusters and lines.

    Steps:
      * cluster the (position, id) pairs with dbscan and cost them with
        clustercost, which yields two lists of clusters (indices 0 and 1)
      * when params.attempt_dnc is True, search for chains inside each
        cluster of list 1 and drop the members of the list-0 clusters
        ("core clusters") from the set used for the general line search
      * search the remaining objects for chains
      * run bundleSearch over all clusters and lines

    inputObjectSet -- sequence of objects exposing .position and .id
    params -- settings; attempt_dnc, allow_intersection and beam_width are
              read here and the whole object is passed on to findChains

    Returns a list with one entry per element of the bundleSearch result:
    the candidate registered under that uuid, or None when no candidate has
    it (objects left ungrouped are not yet turned into singleton groups).
    '''
    reducedObjectSet = copy(inputObjectSet)

    positionsAndIds = [(x.position, x.id) for x in inputObjectSet]
    clusterCandidates = clustercost(dbscan(np.array(positionsAndIds)))

    innerLines = []
    #search for lines inside large clusters
    if params.attempt_dnc==True:
        for cluster in clusterCandidates[1]:
            innerObjects = []
            for memberId in cluster:
                innerObjects.extend(x for x in inputObjectSet if x.id == memberId)
            innerLines.extend(findChains(innerObjects,params))

        #remove core clusters
        for cluster in clusterCandidates[0]:
            for memberId in cluster:
                # Walk a snapshot: removing from the list being iterated
                # makes the loop skip the element after each removal.
                for x in list(reducedObjectSet):
                    if x.id == memberId:
                        reducedObjectSet.remove(x)

    lineCandidates = findChains(reducedObjectSet,params)

    allCandidates = clusterCandidates[0]+clusterCandidates[1] + lineCandidates + innerLines
    groupDictionary = dict((candidate.uuid, candidate) for candidate in allCandidates)
    evali = bundleSearch(cluster_util.totuple(inputObjectSet), allCandidates, params.allow_intersection, params.beam_width)
    # TODO: entries of evali missing from the dictionary should become singleton groups
    output = [groupDictionary.get(x) for x in evali]
    return output

Beispiel #7

0

Datei anzeigen

Datei: SceneEval.py Projekt: arebgun/bolt-configurations

def sceneEval(inputObjectSet,
              params=ClusterParams(2, 0.9, 3, 0.05, 0.1, 1, 0, 11, False)):
    '''
    find the clusters
    evaulate the inside of the clusters as lines to see if they'd be better as lines than clusters
    evaluate the outside of clusters for lines
    concatenate the lists of clusters and lines
    evaluate the whole thing with bundle search
    '''
    reducedObjectSet = copy(inputObjectSet)
    objectDict = dict()
    for i in inputObjectSet:
        objectDict[i.id] = i
    distanceMatrix = cluster_util.create_distance_matrix(inputObjectSet)
    dbtimestart = time()
    clusterCandidates = clustercost(
        dbscan(inputObjectSet, distanceMatrix, objectDict), objectDict)
    dbtimestop = time()
    print "dbscan time: \t\t\t", dbtimestop - dbtimestart
    #    print 'clustercandidates',clusterCandidates

    innerLines = []
    #search for lines inside large clusters
    if params.attempt_dnc == True:
        insideLineStart = time()
        for cluster in clusterCandidates[1]:

            innerObjects = []
            for id in cluster:
                for x in inputObjectSet:
                    if x.id == id:
                        innerObjects.append(x)
            innerChains = findChains(innerObjects, params)
            for thing in innerChains:
                innerLines.append(thing)

        #remove core clusters
        for cluster in clusterCandidates[0]:
            for id in cluster:
                for x in reducedObjectSet:
                    if x.id == id:
                        reducedObjectSet.remove(x)
        ReducedDistanceMatrix = cluster_util.create_distance_matrix(
            reducedObjectSet)
        insideLineStop = time()
        print "inside linesearch time:\t\t", insideLineStop - insideLineStart

    outsideLineStart = time()
    lineCandidates = findChains(reducedObjectSet, params)
    outsideLineStop = time()

    #    for i in scene:
    #        groups.append(cluster_util.SingletonBundle([i[0]],1))

    #need to implement singletons intelligently.

    print "general linesearch time:\t", outsideLineStop - outsideLineStart
    allCandidates = clusterCandidates[0] + clusterCandidates[
        1] + lineCandidates + innerLines
    groupDictionary = dict()
    for i in allCandidates:
        groupDictionary[i.uuid] = i
    for i in inputObjectSet:
        groupDictionary[i.uuid] = cluster_util.SingletonBundle([i.id], 1,
                                                               i.uuid)
    bundleStart = time()
    evali = bundleSearch(inputObjectSet, allCandidates,
                         params.allow_intersection, params.beam_width)
    bundleStop = time()
    print "bundlesearch time: \t\t", bundleStop - bundleStart
    #find the things in evali that aren't in the dictionary ,and make a singleton group out of them, and add it to the output

    #what the heck am i doing here?
    physicalobjects = []

    for i in evali:
        try:
            physicalobjects.append(groupDictionary.get(i))
        except:
            print "not in dictionary"
    output = map(lambda x: groupDictionary.get(x), evali)

    #    print 'costs', map(lambda x: x.cost,output)
    return output

Beispiel #8

0

Datei anzeigen

Datei: SceneEval.py Projekt: colinwinslow/bolt

def sceneEval(inputObjectSet,params = ClusterParams(2,0.9,3,0.05,0.1,1,1,11,False)):
    
    '''
    find the clusters
    evaulate the inside of the clusters as lines to see if they'd be better as lines than clusters
    evaluate the outside of clusters for lines
    concatenate the lists of clusters and lines
    evaluate the whole thing with bundle search
    '''
    print "*",inputObjectSet
    reducedObjectSet = copy(inputObjectSet)
    objectDict = dict()
    for i in inputObjectSet:
        objectDict[i.uuid]=i
    distanceMatrix = cluster_util.create_distance_matrix(inputObjectSet)
    dbtimestart = time()
    clusterCandidates = clustercost(dbscan(inputObjectSet,distanceMatrix,objectDict),objectDict)
    dbtimestop = time()
    print "dbscan time: \t\t\t", dbtimestop-dbtimestart
    
    innerLines = []
    #search for lines inside large clusters
    if params.attempt_dnc==True:
        insideLineStart= time()
        for cluster in clusterCandidates[1]:

            innerObjects = []
            for id in cluster:
                for x in inputObjectSet:
                    if x.id == id:
                        innerObjects.append(x)
            innerChains = findChains(innerObjects,params)
            for thing in innerChains:
                innerLines.append(thing)
            
        #remove core clusters
        for cluster in clusterCandidates[0]:
            for id in cluster:
                for x in reducedObjectSet:
                    if x.id == id:
                        reducedObjectSet.remove(x)
        ReducedDistanceMatrix = cluster_util.create_distance_matrix(reducedObjectSet)
        insideLineStop = time()
        print "inside linesearch time:\t\t",insideLineStop-insideLineStart
        
    outsideLineStart = time()
    lineCandidates = findChains(reducedObjectSet,params,objectDict)
    outsideLineStop = time()
    
    print "general linesearch time:\t",outsideLineStop-outsideLineStart
    allCandidates = clusterCandidates[0]+clusterCandidates[1] + lineCandidates + innerLines
    allClusters = clusterCandidates[0]+clusterCandidates[1]
    allLines = lineCandidates + innerLines
    groupDictionary = dict()
    for i in allCandidates:
        groupDictionary[i.uuid]=i
    for i in inputObjectSet:
        groupDictionary[i.uuid]=i
    lineBundleStart = time()
    bestLines = bundleSearch(inputObjectSet, allLines, params.allow_intersection, params.beam_width)   
    lineBundleStop = time()
    print "linebundle time: \t\t", lineBundleStop-lineBundleStart
    
    clusterBundleStart = time()
    bestClusters = bundleSearch(inputObjectSet, allClusters, params.allow_intersection, params.beam_width) 
    print params.allow_intersection
    clusterBundleStop = time()
    print "clusterbundle time: \t\t", clusterBundleStop-clusterBundleStart
    bundleStart = time()
    evali = [] 

    try:
         evali = evali + bestLines
    except: print "there aren't any lines."
    try:
         evali = evali + bestClusters
    except: print "there aren't any clusterss."
    print "evali",evali
    
    
    output = map(lambda x: groupDictionary.get(x),evali)
    bundleStop = time()
    print "bundlesearch cleanup time: \t",bundleStop-bundleStart
    print "output",output
    return output

Beispiel #9

0

Datei anzeigen

Datei: SceneEval.py Projekt: arebgun/bolt-configurations

def sceneEval(inputObjectSet,params = ClusterParams(2,0.9,3,0.05,0.1,1,0,11,False)):
    
    '''
    find the clusters
    evaulate the inside of the clusters as lines to see if they'd be better as lines than clusters
    evaluate the outside of clusters for lines
    concatenate the lists of clusters and lines
    evaluate the whole thing with bundle search
    '''
    reducedObjectSet = copy(inputObjectSet)
    objectDict = dict()
    for i in inputObjectSet:
        objectDict[i.id]=i
    distanceMatrix = cluster_util.create_distance_matrix(inputObjectSet)
    dbtimestart = time()
    clusterCandidates = clustercost(dbscan(inputObjectSet,distanceMatrix,objectDict),objectDict)
    dbtimestop = time()
    print "dbscan time: \t\t\t", dbtimestop-dbtimestart
#    print 'clustercandidates',clusterCandidates
    
    innerLines = []
    #search for lines inside large clusters
    if params.attempt_dnc==True:
        insideLineStart= time()
        for cluster in clusterCandidates[1]:

            innerObjects = []
            for id in cluster:
                for x in inputObjectSet:
                    if x.id == id:
                        innerObjects.append(x)
            innerChains = findChains(innerObjects,params)
            for thing in innerChains:
                innerLines.append(thing)
            
        #remove core clusters
        for cluster in clusterCandidates[0]:
            for id in cluster:
                for x in reducedObjectSet:
                    if x.id == id:
                        reducedObjectSet.remove(x)
        ReducedDistanceMatrix = cluster_util.create_distance_matrix(reducedObjectSet)
        insideLineStop = time()
        print "inside linesearch time:\t\t",insideLineStop-insideLineStart
        
    outsideLineStart = time()
    lineCandidates = findChains(reducedObjectSet,params)
    outsideLineStop = time()
    
    

#    for i in scene:
#        groups.append(cluster_util.SingletonBundle([i[0]],1))

#need to implement singletons intelligently. 



    print "general linesearch time:\t",outsideLineStop-outsideLineStart
    allCandidates = clusterCandidates[0]+clusterCandidates[1] + lineCandidates + innerLines
    groupDictionary = dict()
    for i in allCandidates:
        groupDictionary[i.uuid]=i
    for i in inputObjectSet:
        groupDictionary[i.uuid]=cluster_util.SingletonBundle([i.id],1,i.uuid)
    bundleStart = time()
    evali = bundleSearch(inputObjectSet, allCandidates, params.allow_intersection, params.beam_width)   
    bundleStop = time()
    print "bundlesearch time: \t\t",bundleStop-bundleStart
    #find the things in evali that aren't in the dictionary ,and make a singleton group out of them, and add it to the output

    #what the heck am i doing here?
    physicalobjects = []

    for i in evali:
        try:
            physicalobjects.append(groupDictionary.get(i))
        except:
            print "not in dictionary"
    output = map(lambda x: groupDictionary.get(x),evali)

#    print 'costs', map(lambda x: x.cost,output)
    return output