def main():
    """Cluster a sample of in-bounds taxi pickup/dropoff coordinates with DBSCAN.

    Loads the train/test frames, keeps only the train trips whose pickup AND
    dropoff both lie inside the ``xlim``/``ylim`` bounding box (bounds
    inclusive), stacks pickup and dropoff coordinates into a single
    ``lons``/``lats`` frame, scales the coordinates by 1000, and runs
    ``dbscan`` on a random sample of at most 50000 points with results saved
    under ``../data/CLUSTER/``.

    The test frame is loaded but not used for clustering.
    """
    train, _ = load_train_and_test()
    xlim = [-74.03, -73.77]
    ylim = [40.6, 40.9]

    coords = train[
        ["pickup_longitude", "dropoff_longitude", "pickup_latitude", "dropoff_latitude"]
    ]

    # One combined mask instead of four successive filtered copies.
    # ``between`` is inclusive on both ends, matching the original >= / <= tests.
    in_box = (
        coords.pickup_longitude.between(xlim[0], xlim[1])
        & coords.dropoff_longitude.between(xlim[0], xlim[1])
        & coords.pickup_latitude.between(ylim[0], ylim[1])
        & coords.dropoff_latitude.between(ylim[0], ylim[1])
    )
    coords = coords[in_box]

    # Stack pickups followed by dropoffs so row i of ``lons`` pairs with row i of ``lats``.
    lons = pd.concat(
        [coords.pickup_longitude, coords.dropoff_longitude], ignore_index=True
    )
    lats = pd.concat(
        [coords.pickup_latitude, coords.dropoff_latitude], ignore_index=True
    )

    # Scale the coordinate VALUES by 1000. The previous code multiplied Python
    # lists, which repeats the list 1000 times rather than scaling each entry.
    scale = 10 ** 3
    loc_df = pd.DataFrame(
        {"lons": lons * scale, "lats": lats * scale}, columns=["lons", "lats"]
    )

    # Never ask for more rows than exist; DataFrame.sample raises otherwise.
    loc_df = loc_df.sample(min(50000, len(loc_df)))
    dbscan(loc_df, eps=1, min_samples=100, save_path="../data/CLUSTER/")
# Example #2
def do_work(work_queue, output):
    # Worker loop: takes job dicts from `work_queue` without blocking, reads the
    # segments for q['record'], clusters them with cluster.dbscan, selects
    # prototypes, and pickles the result to
    # q['out_dir'] + q['record'] + "-prototype-temp.pkl".
    # `output` is not referenced anywhere in the visible body.
    #
    # NOTE(review): this block looks damaged by text extraction and is not valid
    # Python as shown:
    #   * there is no `try:` matching the `except Empty:` at the bottom (the loop
    #     body is indented as if one used to sit directly under `while 1:`);
    #   * the `out = {` dict literal is never closed;
    #   * the right-hand side of `fd =` is truncated, so which open call was used
    #     cannot be told from here;
    #   * the `except Empty:` handler has no body.
    # Restore from the original source before running. The code uses Python 2
    # `print` statements and `cPickle`.
    while 1:
            # Non-blocking get: raises the queue's Empty exception when no job is left.
            q = work_queue.get(block=False)
            # data
            print q['user']
            t0 = time()
            # `outlierDict` is not used again in the visible code.
            (segments, outlierDict) = readSegments(q['record'])

            #classifier_results = classifier.classify(data, **q['classifier_parameters']['parameters'])
            res = cluster.dbscan(segments)

            resProto = cluster.prototypeSelection(segments, res)

            # save
            out = {
                'centroids': resProto["centroids"],
                'nsamples_in_cluster': resProto["nsamples_in_cluster"],
                'clusters': res["clusters"]

            filename = q['out_dir'] + q['record'] + "-prototype-temp.pkl"

            # NOTE(review): truncated line; the visible code also never closes `fd`.
            fd =, 'wb')
            cPickle.dump(out, fd)
            print "done in %0.3fs" % (time() - t0)

        # NOTE(review): handler body missing; presumably it ends the loop once the
        # queue is drained -- confirm against the original.
        except Empty:
# Example #3
def customClusterQuestions(docs, algorithm, parameters, removeOutliers=True):
    """Cluster questions with a caller-selected algorithm.

    Customized question clustering methods. Implemented as a knock-off to
    prevent interfering with production builds.

    Args:
        docs (list): List of strings (questions).
        algorithm (str): The clustering algorithm that will be used
            ("DBSCAN", "K Means", "Gaussian K Means", "Agglomerative Clustering").
        parameters (iterable of dict): Settings used to initiate the clustering
            algorithm. Each entry has a "param" key holding the setting name
            (epsilon, clusters, strictness, threshold) and a "value" key
            holding its value.
        removeOutliers (bool): A flag to determine whether to remove outliers
            or not (default is True). It is not passed to DBSCAN.

    Returns:
        The result of the selected ``cluster`` routine (the corpus with its
        clusters list attached), or None when ``algorithm`` is not one of the
        four supported names.

    Raises:
        KeyError: If the setting required by ``algorithm`` is not present in
            ``parameters``.
    """
    corpus = tagAndVectorizeCorpus(docs)

    # Merge every entry into one lookup. The previous code kept only the first
    # entry, so a required setting listed later was silently unavailable.
    # setdefault keeps the first occurrence of a name, so input that already
    # worked resolves to exactly the same value as before.
    params = {}
    for param in parameters:
        params.setdefault(param["param"], param["value"])

    if algorithm == "DBSCAN":
        return cluster.dbscan(corpus, float(params["epsilon"]))

    if algorithm == "K Means":
        return cluster.kmeans(corpus, int(params["clusters"]), removeOutliers)

    if algorithm == "Gaussian K Means":
        # Strictness is capped at 4 before being handed to g_kmeans.
        return cluster.g_kmeans(corpus, min(4, int(params["strictness"])), removeOutliers)

    if algorithm == "Agglomerative Clustering":
        return cluster.agglomerate(corpus, float(params["threshold"]), removeOutliers)

    return None
# Example #4
 def generate_clusteredpoints(filepath, methodType, dataIndex, recordList):
     """Read points from ``filepath`` and return them after clustering.

     ``methodType`` selects the algorithm: 'kmeans' builds ``kmeans`` with
     ``__kmeansk__[dataIndex]``; 'dbscan' builds ``dbscan`` with
     ``epsSelect[dataIndex]`` and ``MinPtsSelect[dataIndex]``. The ``points``
     attribute of the built object is returned. Any other ``methodType``
     yields None. ``recordList`` is accepted but not used.
     """
     raw_points = processtool.read_points_fromfile(filepath)
     plist = processtool.generate_plist(raw_points)

     if methodType == 'kmeans':
         return kmeans(__kmeansk__[dataIndex], plist).points

     if methodType == 'dbscan':
         eps = epsSelect[dataIndex]
         min_pts = MinPtsSelect[dataIndex]
         return dbscan(eps, min_pts, plist).points

     return None
def sceneEval(inputObjectSet,params = ClusterParams(2,0.9,3,0.05,0.1,1,0,10)):
    # Visible flow: compute cluster candidates (clustercost over dbscan), compute
    # line candidates with findChains, concatenate both lists, run bundleSearch
    # over them with a hard-coded final argument of 10, print and return the result.
    #
    # The default `params` object is created once, when the function is defined,
    # and is shared by every call that omits `params`.
    #
    # NOTE(review): the five prose lines below are docstring text whose quote
    # delimiters are missing, so the block does not parse as shown.
    find the clusters
    evaulate the inside of the clusters as lines to see if they'd be better as lines than clusters
    evaluate the outside of clusters for lines
    concatenate the lists of clusters and lines
    evaluate the whole thing with bundle search
    # NOTE(review): the lambda is truncated -- the second tuple element and its
    # closing parenthesis are missing (five opening vs four closing parentheses).
    clusterCandidates = clustercost(dbscan(np.array(map(lambda x: (x.position,,inputObjectSet))))
    lineCandidates = findChains(inputObjectSet,params)
    allCandidates = clusterCandidates + lineCandidates
    evali = bundleSearch(cluster_util.totuple(inputObjectSet), allCandidates, params.allow_intersection, 10)
    # Python 2 print statement.
    print evali
    return evali
# Example #6
def sceneEval(inputObjectSet,params = ClusterParams(2,0.9,3,0.05,0.1,1,0,11,False)):
    # Visible flow: copy the input set, compute cluster candidates (indexed below
    # as [0] and [1]), optionally look for chains inside the [1] clusters and
    # process the [0] clusters against the copied set, find chains on the copied
    # set, concatenate all candidates, run bundleSearch, and map the result
    # through groupDictionary.get.
    #
    # The default `params` object is created once, when the function is defined,
    # and is shared by every call that omits `params`.
    #
    # NOTE(review): this block looks damaged by text extraction and does not
    # parse as shown: the docstring quotes are missing, the lambda is truncated,
    # the left operand of both `== id` tests is missing, and several `if`/`for`
    # statements have lost their bodies. Restore from the original source.
    find the clusters
    evaulate the inside of the clusters as lines to see if they'd be better as lines than clusters
    evaluate the outside of clusters for lines
    concatenate the lists of clusters and lines
    evaluate the whole thing with bundle search
    reducedObjectSet = copy(inputObjectSet)
    # NOTE(review): truncated -- second tuple element and closing parenthesis missing.
    clusterCandidates = clustercost(dbscan(np.array(map(lambda x: (x.position,,inputObjectSet))))
    innerLines = []
    #search for lines inside large clusters
    if params.attempt_dnc==True:
        for cluster in clusterCandidates[1]:

            innerObjects = []
            # `id` shadows the builtin of the same name inside these loops.
            for id in cluster:
                for x in inputObjectSet:
                    # NOTE(review): left operand and the `if` body are missing;
                    # presumably matching objects were collected into innerObjects -- confirm.
                    if == id:
            innerChains = findChains(innerObjects,params)
            # NOTE(review): loop body missing; innerLines is never filled in the visible code.
            for thing in innerChains:
        #remove core clusters
        for cluster in clusterCandidates[0]:
            for id in cluster:
                for x in reducedObjectSet:
                    # NOTE(review): left operand and the `if` body are missing.
                    if == id:

    lineCandidates = findChains(reducedObjectSet,params)
    allCandidates = clusterCandidates[0]+clusterCandidates[1] + lineCandidates + innerLines
    groupDictionary = dict()
    # NOTE(review): loop body missing, so groupDictionary is never populated in the visible code.
    for i in allCandidates:
    evali = bundleSearch(cluster_util.totuple(inputObjectSet), allCandidates, params.allow_intersection, params.beam_width)   
    #find the things in evali that aren't in the dictionary ,and make a singleton group out of them, and add it to the output
    # dict.get yields None for any element of evali that is not a key of groupDictionary.
    output = map(lambda x: groupDictionary.get(x),evali)
    return output
def sceneEval(inputObjectSet,
              params=ClusterParams(2, 0.9, 3, 0.05, 0.1, 1, 0, 11, False)):
    # Visible flow: copy the input set, build objectDict and a distance matrix,
    # compute cluster candidates (clustercost over dbscan), optionally search
    # for chains inside the clusterCandidates[1] clusters and process the
    # clusterCandidates[0] clusters against the copied set, find chains on the
    # copied set, register every candidate (and a singleton bundle per input
    # object) in groupDictionary by uuid, run bundleSearch, and map the result
    # through groupDictionary.get. Each stage is timed and the timing printed
    # with Python 2 print statements.
    #
    # The default `params` object is created once, when the function is defined,
    # and is shared by every call that omits `params`.
    #
    # NOTE(review): this block looks damaged by text extraction and does not
    # parse as shown: the docstring quotes are missing, some subscripts and
    # comparison operands are blank, two calls are truncated, and several
    # `if`/`for` statements have lost their bodies. Restore from the original.
    find the clusters
    evaulate the inside of the clusters as lines to see if they'd be better as lines than clusters
    evaluate the outside of clusters for lines
    concatenate the lists of clusters and lines
    evaluate the whole thing with bundle search
    reducedObjectSet = copy(inputObjectSet)
    objectDict = dict()
    for i in inputObjectSet:
        # NOTE(review): the subscript key is missing from this line.
        objectDict[] = i
    distanceMatrix = cluster_util.create_distance_matrix(inputObjectSet)
    dbtimestart = time()
    clusterCandidates = clustercost(
        dbscan(inputObjectSet, distanceMatrix, objectDict), objectDict)
    dbtimestop = time()
    print "dbscan time: \t\t\t", dbtimestop - dbtimestart
    #    print 'clustercandidates',clusterCandidates

    innerLines = []
    #search for lines inside large clusters
    if params.attempt_dnc == True:
        insideLineStart = time()
        for cluster in clusterCandidates[1]:

            innerObjects = []
            # `id` shadows the builtin of the same name inside these loops.
            for id in cluster:
                for x in inputObjectSet:
                    # NOTE(review): left operand and the `if` body are missing.
                    if == id:
            innerChains = findChains(innerObjects, params)
            # NOTE(review): loop body missing; innerLines is never filled in the visible code.
            for thing in innerChains:

        #remove core clusters
        for cluster in clusterCandidates[0]:
            for id in cluster:
                for x in reducedObjectSet:
                    # NOTE(review): left operand and the `if` body are missing.
                    if == id:
        # NOTE(review): call truncated (argument and closing parenthesis missing);
        # ReducedDistanceMatrix is not read anywhere else in the visible code.
        ReducedDistanceMatrix = cluster_util.create_distance_matrix(
        insideLineStop = time()
        print "inside linesearch time:\t\t", insideLineStop - insideLineStart

    outsideLineStart = time()
    lineCandidates = findChains(reducedObjectSet, params)
    outsideLineStop = time()

    #    for i in scene:
    #        groups.append(cluster_util.SingletonBundle([i[0]],1))

    #need to implement singletons intelligently.

    print "general linesearch time:\t", outsideLineStop - outsideLineStart
    allCandidates = clusterCandidates[0] + clusterCandidates[
        1] + lineCandidates + innerLines
    groupDictionary = dict()
    # Index every candidate bundle by its uuid.
    for i in allCandidates:
        groupDictionary[i.uuid] = i
    for i in inputObjectSet:
        # NOTE(review): call truncated -- list contents and trailing argument(s) missing.
        groupDictionary[i.uuid] = cluster_util.SingletonBundle([], 1,
    bundleStart = time()
    evali = bundleSearch(inputObjectSet, allCandidates,
                         params.allow_intersection, params.beam_width)
    bundleStop = time()
    print "bundlesearch time: \t\t", bundleStop - bundleStart
    #find the things in evali that aren't in the dictionary ,and make a singleton group out of them, and add it to the output

    #what the heck am i doing here?
    # `physicalobjects` is assigned but not read in the visible code.
    physicalobjects = []

    for i in evali:
            # NOTE(review): the extra indentation suggests a conditional line is
            # missing above this print.
            print "not in dictionary"
    # dict.get yields None for any element of evali that is not a key of groupDictionary.
    output = map(lambda x: groupDictionary.get(x), evali)

    #    print 'costs', map(lambda x: x.cost,output)
    return output
# Example #8
def sceneEval(inputObjectSet,params = ClusterParams(2,0.9,3,0.05,0.1,1,1,11,False)):
    # Visible flow: copy the input set, build a distance matrix, compute cluster
    # candidates (clustercost over dbscan), optionally search for chains inside
    # the clusterCandidates[1] clusters and process the clusterCandidates[0]
    # clusters against the copied set, find chains on the copied set, then run
    # bundleSearch twice -- once over the line candidates and once over the
    # cluster candidates -- concatenate both results, and map them through
    # groupDictionary.get. Each stage is timed and the timing printed with
    # Python 2 print statements.
    #
    # The default `params` object is created once, when the function is defined,
    # and is shared by every call that omits `params`.
    #
    # NOTE(review): this block looks damaged by text extraction and does not
    # parse as shown: the docstring quotes are missing, comparison operands are
    # blank, several `if`/`for` statements have lost their bodies, and the two
    # `except:` clauses near the end have no matching `try:`. Restore from the
    # original source.
    find the clusters
    evaulate the inside of the clusters as lines to see if they'd be better as lines than clusters
    evaluate the outside of clusters for lines
    concatenate the lists of clusters and lines
    evaluate the whole thing with bundle search
    print "*",inputObjectSet
    reducedObjectSet = copy(inputObjectSet)
    objectDict = dict()
    # NOTE(review): loop body missing, so objectDict stays empty in the visible code.
    for i in inputObjectSet:
    distanceMatrix = cluster_util.create_distance_matrix(inputObjectSet)
    dbtimestart = time()
    clusterCandidates = clustercost(dbscan(inputObjectSet,distanceMatrix,objectDict),objectDict)
    dbtimestop = time()
    print "dbscan time: \t\t\t", dbtimestop-dbtimestart
    innerLines = []
    #search for lines inside large clusters
    if params.attempt_dnc==True:
        insideLineStart= time()
        for cluster in clusterCandidates[1]:

            innerObjects = []
            # `id` shadows the builtin of the same name inside these loops.
            for id in cluster:
                for x in inputObjectSet:
                    # NOTE(review): left operand and the `if` body are missing.
                    if == id:
            # Unlike the call further down, this findChains call does not pass objectDict.
            innerChains = findChains(innerObjects,params)
            # NOTE(review): loop body missing; innerLines is never filled in the visible code.
            for thing in innerChains:
        #remove core clusters
        for cluster in clusterCandidates[0]:
            for id in cluster:
                for x in reducedObjectSet:
                    # NOTE(review): left operand and the `if` body are missing.
                    if == id:
        # ReducedDistanceMatrix is not read anywhere else in the visible code.
        ReducedDistanceMatrix = cluster_util.create_distance_matrix(reducedObjectSet)
        insideLineStop = time()
        print "inside linesearch time:\t\t",insideLineStop-insideLineStart
    outsideLineStart = time()
    lineCandidates = findChains(reducedObjectSet,params,objectDict)
    outsideLineStop = time()
    print "general linesearch time:\t",outsideLineStop-outsideLineStart
    allCandidates = clusterCandidates[0]+clusterCandidates[1] + lineCandidates + innerLines
    allClusters = clusterCandidates[0]+clusterCandidates[1]
    allLines = lineCandidates + innerLines
    groupDictionary = dict()
    # NOTE(review): both loop bodies below are missing, so groupDictionary is
    # never populated in the visible code.
    for i in allCandidates:
    for i in inputObjectSet:
    lineBundleStart = time()
    bestLines = bundleSearch(inputObjectSet, allLines, params.allow_intersection, params.beam_width)   
    lineBundleStop = time()
    print "linebundle time: \t\t", lineBundleStop-lineBundleStart
    clusterBundleStart = time()
    bestClusters = bundleSearch(inputObjectSet, allClusters, params.allow_intersection, params.beam_width) 
    print params.allow_intersection
    clusterBundleStop = time()
    print "clusterbundle time: \t\t", clusterBundleStop-clusterBundleStart
    bundleStart = time()
    evali = [] 

    # NOTE(review): the `try:` lines that these two `except:` clauses belong to
    # are missing. Both handlers are bare `except:`, so they catch every
    # exception type and only print a message.
         evali = evali + bestLines
    except: print "there aren't any lines."
         evali = evali + bestClusters
    except: print "there aren't any clusterss."
    print "evali",evali
    # dict.get yields None for any element of evali that is not a key of groupDictionary.
    output = map(lambda x: groupDictionary.get(x),evali)
    bundleStop = time()
    print "bundlesearch cleanup time: \t",bundleStop-bundleStart
    print "output",output
    return output
def sceneEval(inputObjectSet,params = ClusterParams(2,0.9,3,0.05,0.1,1,0,11,False)):
    # Visible flow: copy the input set, build a distance matrix, compute cluster
    # candidates (clustercost over dbscan), optionally search for chains inside
    # the clusterCandidates[1] clusters and process the clusterCandidates[0]
    # clusters against the copied set, find chains on the copied set,
    # concatenate all candidates, run bundleSearch, and map the result through
    # groupDictionary.get. Each stage is timed and the timing printed with
    # Python 2 print statements.
    #
    # The default `params` object is created once, when the function is defined,
    # and is shared by every call that omits `params`.
    #
    # NOTE(review): this block looks damaged by text extraction and does not
    # parse as shown: the docstring quotes are missing, comparison operands are
    # blank, and several `if`/`for` statements have lost their bodies. Restore
    # from the original source.
    find the clusters
    evaulate the inside of the clusters as lines to see if they'd be better as lines than clusters
    evaluate the outside of clusters for lines
    concatenate the lists of clusters and lines
    evaluate the whole thing with bundle search
    reducedObjectSet = copy(inputObjectSet)
    objectDict = dict()
    # NOTE(review): loop body missing, so objectDict stays empty in the visible code.
    for i in inputObjectSet:
    distanceMatrix = cluster_util.create_distance_matrix(inputObjectSet)
    dbtimestart = time()
    clusterCandidates = clustercost(dbscan(inputObjectSet,distanceMatrix,objectDict),objectDict)
    dbtimestop = time()
    print "dbscan time: \t\t\t", dbtimestop-dbtimestart
#    print 'clustercandidates',clusterCandidates
    innerLines = []
    #search for lines inside large clusters
    if params.attempt_dnc==True:
        insideLineStart= time()
        for cluster in clusterCandidates[1]:

            innerObjects = []
            # `id` shadows the builtin of the same name inside these loops.
            for id in cluster:
                for x in inputObjectSet:
                    # NOTE(review): left operand and the `if` body are missing.
                    if == id:
            innerChains = findChains(innerObjects,params)
            # NOTE(review): loop body missing; innerLines is never filled in the visible code.
            for thing in innerChains:
        #remove core clusters
        for cluster in clusterCandidates[0]:
            for id in cluster:
                for x in reducedObjectSet:
                    # NOTE(review): left operand and the `if` body are missing.
                    if == id:
        # ReducedDistanceMatrix is not read anywhere else in the visible code.
        ReducedDistanceMatrix = cluster_util.create_distance_matrix(reducedObjectSet)
        insideLineStop = time()
        print "inside linesearch time:\t\t",insideLineStop-insideLineStart
    outsideLineStart = time()
    lineCandidates = findChains(reducedObjectSet,params)
    outsideLineStop = time()

#    for i in scene:
#        groups.append(cluster_util.SingletonBundle([i[0]],1))

#need to implement singletons intelligently. 

    print "general linesearch time:\t",outsideLineStop-outsideLineStart
    allCandidates = clusterCandidates[0]+clusterCandidates[1] + lineCandidates + innerLines
    groupDictionary = dict()
    # NOTE(review): both loop bodies below are missing, so groupDictionary is
    # never populated in the visible code.
    for i in allCandidates:
    for i in inputObjectSet:
    bundleStart = time()
    evali = bundleSearch(inputObjectSet, allCandidates, params.allow_intersection, params.beam_width)   
    bundleStop = time()
    print "bundlesearch time: \t\t",bundleStop-bundleStart
    #find the things in evali that aren't in the dictionary ,and make a singleton group out of them, and add it to the output

    #what the heck am i doing here?
    # `physicalobjects` is assigned but not read in the visible code.
    physicalobjects = []

    for i in evali:
            # NOTE(review): the extra indentation suggests a conditional line is
            # missing above this print.
            print "not in dictionary"
    # dict.get yields None for any element of evali that is not a key of groupDictionary.
    output = map(lambda x: groupDictionary.get(x),evali)

#    print 'costs', map(lambda x: x.cost,output)
    return output