Example #1
0
def main():
    '''
    Main function for PAM.

    Expects exactly two command-line parameters in ``sys.argv``: the path
    of the data file and ``k`` (converted with ``int``). The data is loaded
    with ``importData``, clustered with ``kmedoids(data, k)`` and the
    elapsed time, best cost, best choice and best medoids are printed.

    Returns 1 when the number of parameters is wrong; otherwise the
    function falls off the end and returns None.
    '''
    if len(sys.argv) != 3:
        print('Error: invalid number of parameters')
        return(1)

    # Get the parameters
    filePath = sys.argv[1]
    k = int(sys.argv[2])
    if debugEnabled == True:
        print('filePath: ', filePath)
        print('k: ', k)

    # Import the data set to cluster
    data = importData(filePath)
    if debugEnabled == True:
        # Preview at most the first 10 rows: a data set with fewer rows
        # must not raise IndexError just because debugging is enabled.
        for i in range(min(10, len(data))):
            print('data=', data[i])

    # Time only the clustering call itself
    startTime = time.time()
    best_cost, best_choice, best_medoids = kmedoids(data, k)
    endTime = time.time()

    print('best_time: ', endTime - startTime)
    print('best_cost: ', best_cost)
    print('best_choice: ', best_choice)
    print('best_medoids: ', best_medoids)
Example #2
0
def main():
    '''
    Entry point for PAM.

    Takes the data file path and ``k`` from the command line, imports the
    data, runs ``kmedoids`` on it and prints the timing together with the
    best cost, choice and medoids. Returns 1 if the parameter count is
    not exactly two; otherwise returns None.
    '''
    argv = sys.argv
    if len(argv) != 3:
        print('Error: invalid number of parameters')
        return 1

    # Command-line parameters: data file and number k
    filePath = argv[1]
    k = int(argv[2])
    if debugEnabled == True:
        print('filePath: ', filePath)
        print('k: ', k)

    # Load the data; in debug mode show no more than the first ten rows
    data = importData(filePath)
    if debugEnabled == True:
        for idx in range(min(len(data), 10)):
            print('data=', data[idx])

    # Measure how long the clustering takes
    startTime = time.time()
    best_cost, best_choice, best_medoids = kmedoids(data, k)
    elapsed = time.time() - startTime

    print('best_time: ', elapsed)
    print('best_cost: ', best_cost)
    print('best_choice: ', best_choice)
    print('best_medoids: ', best_medoids)
Example #3
0
def main():
    '''
    Main function for parallel PAM.

    Expects exactly three command-line parameters in ``sys.argv``: the
    path of the data file, ``k`` and ``t`` (both converted with ``int``).
    ``t`` is forwarded as the third argument of ``kmedoids_parallel``
    (presumably the degree of parallelism -- confirm against that
    function). Prints the elapsed time, best cost, best choice and best
    medoids.

    Returns 1 when the number of parameters is wrong; otherwise the
    function falls off the end and returns None.
    '''
    if len(sys.argv) != 4:
        print('Error: invalid number of parameters')
        return (1)

    # Get the parameters
    filePath = sys.argv[1]
    k = int(sys.argv[2])
    t = int(sys.argv[3])
    if debugEnabled == True:
        print('filePath: ', filePath)
        print('k: ', k)
        print('t: ', t)

    # Import the data set to cluster
    data = util.importData(filePath)
    if debugEnabled == True:
        # Preview at most the first 10 rows: a data set with fewer rows
        # must not raise IndexError just because debugging is enabled.
        for i in range(min(10, len(data))):
            print('data=', data[i])

    # Check the timing of the clustering call only
    startTime = time.time()
    best_cost, best_choice, best_medoids = kmedoids_parallel(data, k, t)
    endTime = time.time()

    print('best_time: ', endTime - startTime)
    print('best_cost: ', best_cost)
    print('best_choice: ', best_choice)
    print('best_medoids: ', best_medoids)
Example #4
0
def main():
    '''
    Main function for Clara
    '''
    print "sys.argv: ", sys.argv
    print "len(sys.argv): ", len(sys.argv)
    #if len(sys.argv) not in [4,6,7,8]:
    #    print('Error: invalid number of parameters. Your parameters should be: \n path_to_node_names  k  cost_type [path_to_distance_matrix [path_to_similarity_matrix]  path_to_edge_matrix_(affinity)]')
    #    return(1)

    if len(sys.argv) == 4 and sys.argv[3] == 3:
        print('Error: cost based on distance/similarity matrix without the matrix specified')
        return(1)

    # Get the parameters
    try:
        splittedSim = sys.argv[8]
    except:
        splittedSim = False
        print "no value for splittedSim parameter specified, so it is set to False"
    filePath = sys.argv[1]
    k = int(sys.argv[2])
    COST = int(sys.argv[3])
    if len(sys.argv) >= 6:
        if COST == 3:
            distDictPath = sys.argv[4]
            affinityPath = sys.argv[5]
            print "distDictPath: ", distDictPath
            print "affinityPath: ", affinityPath
        elif COST == 4:
            distDictPath = sys.argv[4]
            simDictPath = sys.argv[5]
            affinityPath = sys.argv[6]
            try:
                acceleration = int(sys.argv[7])
                print "acceleration degree: ", acceleration
            except: acceleration = 0
            print "distDictPath: ", distDictPath
            print "simDictPath: ", simDictPath
            print "affinityPath: ", affinityPath
        else:
            print "Error: I dunno if you pass 'similarity' (COST=4) or you pass 'distance matrix' (COST=3)"
            return(1)
        #affinityPath = sys.argv[5]

    if debugEnabled == True:
        print 'filePath: ', filePath
        print 'k: ', k
        print "Cost Function: ", COST

    # Run Clara
    if COST in [3,4]:
        affinitiesOur = importData(affinityPath, ifjson=1)
        print "\n affinities imported"
        data = importData(filePath, ifjson=1)
        distDictOur = importData(distDictPath, ifjson=1)
        #distDictOur = None
        print "\n pairwise distances imported"
        if COST == 4:
            #simDictOur = importData(simDictPath, ifjson=1)
            if not splittedSim:
                simDictOur = {}
                with open(simDictPath, 'r') as S:
                    for line in S:
                        simDictOur.update(json.loads(line))
            else:
                simDictOur = load_splitted_sim(simDictPath, coresLoaded)
            print "\n pairwise similarities imported\n"
    else:
        data = importData(filePath)
    if debugEnabled == True:
        for i in range(10):
            print('example_data=', data[i])

    # Add timing
    startTime = time.time()
    if COST == 3:
        best_cost, best_choice, best_medoids, cost_list, isolates = clara(
            data, k, COST, distDictClara=distDictOur, simDictClara={},
            affinities=affinitiesOur, saveAllResults=False, acceleration=acceleration,
            take_all_nodes=takeAllNodes)
    elif COST == 4:
        best_cost, best_choice, best_medoids, cost_list, isolates = clara(
            data, k, COST, distDictClara=distDictOur, simDictClara=simDictOur,
            affinities=affinitiesOur, saveAllResults=False, acceleration=acceleration,
            take_all_nodes=takeAllNodes)
    else:
        best_cost, best_choice, best_medoids, cost_list, isolates = clara(
            data, k, COST, saveAllResults=False,
            take_all_nodes=takeAllNodes)
    endTime = time.time()

    mod = None
    harmonic_centrality = None
    '''
    #don't delete!!!!!!!!!!!!~~~~~
    # Compute modularity and display it
    startMod = time.time()
    if COST in [3,4]:
        mod = modularity(data, COST=COST, distDict=distDictOur,
                         edgeDict=affinitiesOur, medoids=best_medoids)
    else:
        print "no modularity for this regime, sorry"
        pass
    endMod = time.time()
    # comment up to this ~~~~~~~~~~

    startCentr = time.time()
    harmonic_centrality =  intra_cluster_centrality(data, COST=COST,
                            distDict=distDictOur, medoids=best_medoids)
    endCentr = time.time()
    '''
    # Save the result
    fordump = [best_cost, best_choice, best_medoids, mod, harmonic_centrality, isolates]
    json_filename = "clara_json_version_" + str(int(time.time())/10)[3:] + "_one" #time.strftime("%d %b %Y %H:%M:%S", time.gmtime())
    json.dump(fordump, open(json_filename, "w"))

    # Print the result
    diff = (endTime - startTime)
    best_cluster_length = []
    for i in best_choice:
        best_cluster_length.append(len(best_medoids[i]))

    print '\n\n'
    print 'best_time: ', diff
    #print 'best_modularity_time: ', endMod - startMod
    #print 'best_centrality_time: ', endCentr - startCentr
    #print 'best_modularity: ', mod
    print 'best_cost: ', best_cost
    print 'best_choice: ', best_choice
    print 'best_cluster_lengths: ', best_cluster_length
    print 'clustered_nodes: ', sum(best_cluster_length)
    #print 'all_nodes: ', len(data)
    print 'isolates: ', len(isolates), isolates
    print '\n\n'
    print 'best_medoids: ', best_medoids
    print '\n\n'
Example #5
0
def main():
    '''
    Main function for Clara
    '''
    print "sys.argv: ", sys.argv
    print "len(sys.argv): ", len(sys.argv)
    #if len(sys.argv) not in [4,6,7,8]:
    #    print('Error: invalid number of parameters. Your parameters should be: \n path_to_node_names  k  cost_type [path_to_distance_matrix [path_to_similarity_matrix]  path_to_edge_matrix_(affinity)]')
    #    return(1)

    if len(sys.argv) == 4 and sys.argv[3] == 3:
        print(
            'Error: cost based on distance/similarity matrix without the matrix specified'
        )
        return (1)

    # Get the parameters
    try:
        splittedSim = sys.argv[8]
    except:
        splittedSim = False
        print "no value for splittedSim parameter specified, so it is set to False"
    filePath = sys.argv[1]
    k = int(sys.argv[2])
    COST = int(sys.argv[3])
    if len(sys.argv) >= 6:
        if COST == 3:
            distDictPath = sys.argv[4]
            affinityPath = sys.argv[5]
            print "distDictPath: ", distDictPath
            print "affinityPath: ", affinityPath
        elif COST == 4:
            distDictPath = sys.argv[4]
            simDictPath = sys.argv[5]
            affinityPath = sys.argv[6]
            try:
                acceleration = int(sys.argv[7])
                print "acceleration degree: ", acceleration
            except:
                acceleration = 0
            print "distDictPath: ", distDictPath
            print "simDictPath: ", simDictPath
            print "affinityPath: ", affinityPath
        else:
            print "Error: I dunno if you pass 'similarity' (COST=4) or you pass 'distance matrix' (COST=3)"
            return (1)
        #affinityPath = sys.argv[5]

    if debugEnabled == True:
        print 'filePath: ', filePath
        print 'k: ', k
        print "Cost Function: ", COST

    # Run Clara
    if COST in [3, 4]:
        affinitiesOur = importData(affinityPath, ifjson=1)
        print "\n affinities imported"
        data = importData(filePath, ifjson=1)
        distDictOur = importData(distDictPath, ifjson=1)
        #distDictOur = None
        print "\n pairwise distances imported"
        if COST == 4:
            #simDictOur = importData(simDictPath, ifjson=1)
            if not splittedSim:
                simDictOur = {}
                with open(simDictPath, 'r') as S:
                    for line in S:
                        simDictOur.update(json.loads(line))
            else:
                simDictOur = load_splitted_sim(simDictPath, coresLoaded)
            print "\n pairwise similarities imported\n"
    else:
        data = importData(filePath)
    if debugEnabled == True:
        for i in range(10):
            print('example_data=', data[i])

    # Add timing
    startTime = time.time()
    if COST == 3:
        best_cost, best_choice, best_medoids, cost_list, isolates = clara(
            data,
            k,
            COST,
            distDictClara=distDictOur,
            simDictClara={},
            affinities=affinitiesOur,
            saveAllResults=False,
            acceleration=acceleration,
            take_all_nodes=takeAllNodes)
    elif COST == 4:
        best_cost, best_choice, best_medoids, cost_list, isolates = clara(
            data,
            k,
            COST,
            distDictClara=distDictOur,
            simDictClara=simDictOur,
            affinities=affinitiesOur,
            saveAllResults=False,
            acceleration=acceleration,
            take_all_nodes=takeAllNodes)
    else:
        best_cost, best_choice, best_medoids, cost_list, isolates = clara(
            data, k, COST, saveAllResults=False, take_all_nodes=takeAllNodes)
    endTime = time.time()

    mod = None
    harmonic_centrality = None
    '''
    #don't delete!!!!!!!!!!!!~~~~~
    # Compute modularity and display it
    startMod = time.time()
    if COST in [3,4]:
        mod = modularity(data, COST=COST, distDict=distDictOur,
                         edgeDict=affinitiesOur, medoids=best_medoids)
    else:
        print "no modularity for this regime, sorry"
        pass
    endMod = time.time()
    # comment up to this ~~~~~~~~~~

    startCentr = time.time()
    harmonic_centrality =  intra_cluster_centrality(data, COST=COST,
                            distDict=distDictOur, medoids=best_medoids)
    endCentr = time.time()
    '''
    # Save the result
    fordump = [
        best_cost, best_choice, best_medoids, mod, harmonic_centrality,
        isolates
    ]
    json_filename = "clara_json_version_" + str(
        int(time.time()) /
        10)[3:] + "_one"  #time.strftime("%d %b %Y %H:%M:%S", time.gmtime())
    json.dump(fordump, open(json_filename, "w"))

    # Print the result
    diff = (endTime - startTime)
    best_cluster_length = []
    for i in best_choice:
        best_cluster_length.append(len(best_medoids[i]))

    print '\n\n'
    print 'best_time: ', diff
    #print 'best_modularity_time: ', endMod - startMod
    #print 'best_centrality_time: ', endCentr - startCentr
    #print 'best_modularity: ', mod
    print 'best_cost: ', best_cost
    print 'best_choice: ', best_choice
    print 'best_cluster_lengths: ', best_cluster_length
    print 'clustered_nodes: ', sum(best_cluster_length)
    #print 'all_nodes: ', len(data)
    print 'isolates: ', len(isolates), isolates
    print '\n\n'
    print 'best_medoids: ', best_medoids
    print '\n\n'
Example #6
0
def main():
    '''
    Main function for PAM
    '''
    print sys.argv
    print len(sys.argv)
    if len(sys.argv) == 4 and sys.argv[3] == 3:
        print 'Error: cost based on distance matrix without distance matrix specified'
        return (1)

    if len(sys.argv) not in [4, 5]:
        print 'Error: invalid number of parameters. Your parameters should be: \n path_to_node_names  k  cost_type  [pairwise_matrix_(distance_or_similarity)]'
        return (1)

    # Get the parameters
    filePath = sys.argv[1]
    k = int(sys.argv[2])
    COST = int(sys.argv[3]
               )  # here it's obligatory, but in kmedoids(), optional. FIX IT
    if len(sys.argv) == 5:
        if COST == 4:  # k-medoids based on similarity matrix (e.g Jaccard score)
            simDictPath = sys.argv[4]
            print simDictPath
        elif COST == 3:  # k-medoids based on distance matrix (e.g. Average shortest path)
            distDictPath = sys.argv[4]
            print distDictPath  # distDictPath is not a file - it's a path to file (string)
        else:
            print "Error: I dunno whether you pass affinities to compute 'similarity' (COST=4) or you pass 'distance matrix' (COST=3)"
            return (1)

    if debugEnabled == True:
        print('filePath: ', filePath)
        print('k: ', k)
        print('cost function number: ', COST
              )  # better yet, display the name, not number FIX IT

    # Run PAM for europe.txt
    distDictOur = {}
    simDictOur = {}
    if COST == 3:
        data = importData(
            filePath, ifjson=1
        )  # actually, ifjson=1 is not tantamount to  direct distance method FIX IT
        distDictOur = importData(distDictPath, ifjson=1)
        # print "distance: ", distDict.items()[0]
    elif COST == 4:
        data = importData(filePath, ifjson=1)
        simDictOur = importData(simDictPath, ifjson=1)
        print "pairwise similarities imported"
    else:
        data = importData(filePath)
    if debugEnabled:
        for i in range(10):
            print('data=', data[i])

    # Add timing here
    startTime = time.time()
    if COST not in [3, 4]:
        best_cost, best_choice, best_medoids = kmedoids(data, k, COST)
    elif COST == 3:
        best_cost, best_choice, best_medoids = kmedoids(data,
                                                        k,
                                                        COST,
                                                        distDictOur,
                                                        simDictKM={})
    elif COST == 4:
        best_cost, best_choice, best_medoids = kmedoids(data,
                                                        k,
                                                        COST,
                                                        distDictKM={},
                                                        simDictKM=simDictOur)
    endTime = time.time()

    # Saving the result into new file
    fordump = [best_cost, best_choice, best_medoids]
    json_filename = "pam_json_version " + str(
        int(time.time()) / 10)[3:]  # find normal time format FIX IT
    json.dump(fordump, open(json_filename, "w"))

    best_cluster_length = []
    for i in best_choice:
        try:
            best_cluster_length.append(len(best_medoids[i]))
        except KeyError:
            best_cluster_length.append("KeyError ;)")

    print 'best_time: ', (endTime - startTime)
    print 'best_cost: ', best_cost
    print 'best_choice: \n', best_choice
    print 'best_cluster_lengths: \n', best_cluster_length
    print 'best_cluster_contents: \n', best_medoids
Example #7
0
def main():
    # parse arguments
    args = read_arguments()

    data = []
    affinities = {}
    simDict = {}

    if args.distDict != None:
        distDict = importData(args.distDict, ifjson=1)
        print "\n pairwise distances imported"
    else:
        distDict = {}

    if not args.loadedCores:
        data = importData(args.data, ifjson=1)
        affinities = importData(args.edgeDict, ifjson=1)
        print "\n affinities imported"
        simDict = importData(args.simDict, ifjson=1)
        print "\n pairwise similarities imported\n"

    #~~~~~~~~~~~~ method for finding the optimal K ~~~~~~~~~~~#
    if args.method == 'kopt':
        if args.trials_decay: trials_decay = True
        else: trials_decay = False

        if args.KlastSeq:
            if args.Kupper:
                Klist = range(1, args.KlastSeq + 1) + args.Kupper
            else:
                Klist = range(1, args.KlastSeq + 1)
                Klist.append(args.Kmax)
        else:
            Klist = range(1, args.Kmax + 1)

        startTime = time.time()
        Kopt, mod_lists, modStdev, modMean, modMax = ModularityProfile(
            data,
            Kmin=args.Kmin,
            Kmax=args.Kmax,
            Klist=Klist,
            edgeDict=affinities,
            simDict=simDict,
            bagSize=args.bagSize,
            trials=args.trials,
            trials_decay=trials_decay)
        endTime = time.time()

        fordump = [Kopt, mod_lists, modStdev, modMean, modMax]
        json_filename = "kopt_json_version_" + Version
        json.dump(fordump, open(json_filename, "w"))

        print '\n\n'
        print 'saved as: ', json_filename
        print 'time: ', endTime - startTime
        print 'optimal K: ', Kopt
        print 'modMax: ', modMax
        for i in xrange(len(Klist)):
            print "mod_list for k=%d is:  %s" % (Klist[i], mod_lists[i])
        print '\n\n'

    #~~~~~~~~~~~ method for averaging clustering results ~~~~~~~~~~~#
    if args.method == "cores":
        print "gotcha"
        result = SGJRIcores(data=data,
                            K=args.K,
                            edgeDict=affinities,
                            simDict=simDict,
                            distDict=distDict,
                            bagSize=args.bagSize,
                            trials=args.trials,
                            threshold=args.threshold,
                            loadedCores=args.loadedCores,
                            filepathCores=args.filepathCores,
                            dendroFormat=args.dendroFormat,
                            acceleration=int(args.acceleration))

        if 0 == 0:  #args.distDict != None:
            clustData, commonCluster, treeStruct, segmentDict, harmCentr = result[:
                                                                                  -1]
            mongo = result[-1]
            if args.saveCommonClusters:
                fordump = [
                    clustData, commonCluster, treeStruct, segmentDict,
                    harmCentr, mongo
                ]
            else:
                fordump = [
                    clustData, ['commonCluster_placeholder'], treeStruct,
                    segmentDict, harmCentr, mongo
                ]
        else:
            clustData, commonCluster, treeStruct, segmentDict = result
            fordump = [clustData, commonCluster, treeStruct, segmentDict]

        pkl_filename = "cores_pickled_version_" + Version
        with open(pkl_filename, 'w') as f:
            pickle.dump(fordump, f, pickle.HIGHEST_PROTOCOL)

        print "Saved as: ", pkl_filename
Example #8
0
def main():
    '''
    Main function for PAM
    '''
    print sys.argv
    print len(sys.argv)
    if len(sys.argv) == 4 and sys.argv[3] == 3:
        print 'Error: cost based on distance matrix without distance matrix specified'
        return (1)

    if len(sys.argv) not in [4,5]:
        print 'Error: invalid number of parameters. Your parameters should be: \n path_to_node_names  k  cost_type  [pairwise_matrix_(distance_or_similarity)]'
        return (1)

    # Get the parameters
    filePath = sys.argv[1]
    k = int(sys.argv[2])
    COST = int(sys.argv[3])  # here it's obligatory, but in kmedoids(), optional. FIX IT
    if len(sys.argv) == 5:
        if COST == 4:   # k-medoids based on similarity matrix (e.g Jaccard score)
            simDictPath = sys.argv[4]
            print simDictPath
        elif COST == 3:   # k-medoids based on distance matrix (e.g. Average shortest path)
            distDictPath = sys.argv[4]
            print distDictPath # distDictPath is not a file - it's a path to file (string)
        else:
            print "Error: I dunno whether you pass affinities to compute 'similarity' (COST=4) or you pass 'distance matrix' (COST=3)"
            return(1)

    if debugEnabled == True:
        print('filePath: ', filePath)
        print('k: ', k)
        print('cost function number: ', COST)  # better yet, display the name, not number FIX IT

    # Run PAM for europe.txt
    distDictOur = {}
    simDictOur = {}
    if COST == 3:
        data = importData(filePath, ifjson=1)  # actually, ifjson=1 is not tantamount to  direct distance method FIX IT
        distDictOur = importData(distDictPath, ifjson=1)
        # print "distance: ", distDict.items()[0]
    elif COST == 4:
        data = importData(filePath, ifjson=1)
        simDictOur = importData(simDictPath, ifjson=1)
        print "pairwise similarities imported"
    else:
        data = importData(filePath)
    if debugEnabled:
        for i in range(10):
            print('data=', data[i])

    # Add timing here
    startTime = time.time()
    if COST not in [3,4]:
        best_cost, best_choice, best_medoids = kmedoids(data, k, COST)
    elif COST == 3:
        best_cost, best_choice, best_medoids = kmedoids(data, k, COST, distDictOur, simDictKM={})
    elif COST == 4:
        best_cost, best_choice, best_medoids = kmedoids(data, k, COST, distDictKM={}, simDictKM=simDictOur)
    endTime = time.time()

    # Saving the result into new file
    fordump = [best_cost, best_choice, best_medoids]
    json_filename = "pam_json_version " + str(int(time.time())/10)[3:]    # find normal time format FIX IT
    json.dump(fordump, open(json_filename, "w"))

    best_cluster_length = []
    for i in best_choice:
        try:
            best_cluster_length.append(len(best_medoids[i]))
        except KeyError:
            best_cluster_length.append("KeyError ;)")

    print 'best_time: ', (endTime - startTime)
    print 'best_cost: ', best_cost
    print 'best_choice: \n', best_choice
    print 'best_cluster_lengths: \n', best_cluster_length
    print 'best_cluster_contents: \n', best_medoids
from util import importData
from util import CLASS_MAP
import pandas as pd

# Datasets for machine learning tasks
#---------------------------------------------------------------------------------------------------------------------

# Load the raw review data through the project's importData helper
df = importData(
    "data-raw/winemag-data_first150k.csv",
    censor=True,
    filter=True,
    processDescriptions=True,
)

# Only these four columns are carried into the split data sets
df_keep = df[['index', 'color', 'class', 'description']]

# Split sizes: fixed-size test and dev sets, everything else is training
num_test = 20000
num_dev = 20000
num_train = df_keep.shape[0] - num_test - num_dev
assert num_train > 0

# Shuffle every row once with a fixed seed, then cut consecutive slices:
# [0, num_test) -> test, the next num_dev rows -> dev, the rest -> train
data = df_keep.sample(
    frac=1,
    replace=False,
    random_state=1415926,
).reset_index(drop=True)
data_test = data.iloc[:num_test]
data_dev = data.iloc[num_test:num_test + num_dev]
data_train = data.iloc[num_test + num_dev:num_test + num_dev + num_train]

# Write the test split to disk
data_test.to_csv('data-processed/data.test', index=False)