Example #1
def evaluate(datos, ponderaciones, silencios, carga_computacional=1, tipo_clustering="KMEANS", ncluster=3, metrica="SIL"):
    '''
    Segments the data frame using the given feature weights (ponderaciones)
    and silenced columns (silencios), then returns a clustering-quality
    score: a corrected Hopkins statistic (carga_computacional == 1) or a
    silhouette/entropy metric over the clustered segments
    (carga_computacional == 2).
    '''
    global light_segs
    
    fecha = False
    if len(ponderaciones) == 0:
        return 0.0
    # A negative carga_computacional switches to the cached light segmentation.
    ligero = np.sign(carga_computacional) < 0
    carga_computacional = abs(carga_computacional)
    
    # A fractional carga_computacional subsamples the data before segmenting.
    if carga_computacional < 1:
        datos = datos.sample(int(datos.shape[0] * carga_computacional))
        datos = datos.sort_index()
        carga_computacional = 1
        fecha = True
        
    if carga_computacional >= 1:
        if not ligero:
            segmentos, _ = sg.segmentate_data_frame(df=datos, montecarlo=1, min_size=4,
                                                    silence=silencios,
                                                    vector_importancias=ponderaciones,
                                                    verbose=False)
            mean = len(sg.ultra_light_segmentation(datos, fecha=fecha)) / 2
        else:
            if len(light_segs) == 0:
                light_segs = sg.ultra_light_segmentation(datos, fecha=fecha) 
                
            segmentos = light_segs
            mean = len(segmentos)/2
            #mean = int(datos.shape[0]/100)
    
        if carga_computacional == 1:
            segmentados = cl.apply_segmentation(datos, segmentos, silencios, ponderaciones, fecha)
            
            if segmentados.shape[0] <= 6:
                return 0.0
            else:
                std = np.sqrt(mean)
                nsegs = [[i, i + 1] for i in range(segmentados.shape[0])]
                segmentos = sg.join_segments(data=segmentados, o_segments=nsegs,
                                             distance=sg.interpretable_distance,
                                             threshold=0.5, minimum_size=1,
                                             silence=silencios,
                                             vector_importancias=ponderaciones)[0]
                segmentados = cl.apply_segmentation(segmentados, segmentos, silencios, ponderaciones, fecha)

                return (cl.hopkins_statistic(cl.filter_numerical(segmentados),
                                             m=int(segmentados.shape[0] * 0.5))
                        * normal_correction(mean, std, len(segmentados)))
            
        elif carga_computacional == 2:
            if tipo_clustering == "DTW":
                segmentados = sg.get_segments(datos, segmentos)
                asignaciones = pam.kmedoids(segmentados, n_clus=ncluster)
                segments_df = cl.apply_clustering(datos, segmentos, asignaciones[1], asignaciones[2])
            elif tipo_clustering == "KMEANS":
                segments_df = cl.full_clustering(datos, segmentos, n_clus=ncluster,
                                                 silencio=silencios, pesos=ponderaciones,
                                                 normalizar=False)
                
            if metrica == "SIL":    
                return cl.sil_metric(segments_df[0])
            elif metrica == "ENTROPY":
                return entropy_metric(segments_df)
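
Example #1 delegates its clustering-tendency score to cl.hopkins_statistic, which is not shown here. For reference, below is a minimal NumPy sketch of the standard Hopkins statistic (roughly 0.5 for uniform data, close to 1 for clustered data); the name and signature are illustrative, not the actual cl API.

import numpy as np

def hopkins_statistic(X, m=None, seed=None):
    # Compare nearest-neighbour distances of m uniform random probes
    # against those of m points sampled from the data itself.
    X = np.asarray(X, dtype=float)
    n, d = X.shape
    m = m or max(1, n // 10)
    rng = np.random.default_rng(seed)

    probes = rng.uniform(X.min(axis=0), X.max(axis=0), size=(m, d))
    idx = rng.choice(n, size=m, replace=False)

    def nearest(points, exclude=None):
        # Distance from each query point to its nearest neighbour in X.
        d2 = ((points[:, None, :] - X[None, :, :]) ** 2).sum(axis=2)
        if exclude is not None:
            d2[np.arange(len(points)), exclude] = np.inf  # skip self-matches
        return np.sqrt(d2.min(axis=1))

    u = nearest(probes)        # uniform probe -> nearest data point
    w = nearest(X[idx], idx)   # sampled point -> nearest other data point
    return u.sum() / (u.sum() + w.sum())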
Example #2
def clara(data, k):
    '''
    CLARA implementation:
    1. For i = 1 to 5, repeat the following steps:
    2. Draw a sample of 40 + 2k objects randomly from the
       entire data set, and call Algorithm PAM to find
       k medoids of the sample.
    3. For each object Oj in the entire data set, determine
       which of the k medoids is the most similar to Oj.
    4. Calculate the average dissimilarity of the clustering
       obtained in the previous step. If this value is less
       than the current minimum, use this value as the
       current minimum, and retain the k medoids found in
       Step 2 as the best set of medoids obtained so far.
    5. Return to Step 1 to start the next iteration.
    '''
    size = len(data)
    min_avg_cost = float('inf')
    best_choice = []
    best_res = {}

    # claraLoopNum, debugEnabled, random, kmedoids and averageCost are
    # module-level names defined elsewhere in this project.
    for i in range(claraLoopNum):
        # Construct the sampling subset
        sampling_idx = random.sample(range(size), 40 + k * 2)
        sampling_data = [data[idx] for idx in sampling_idx]

        # Run kmedoids on the sample
        pre_cost, pre_choice, pre_medoids = kmedoids(sampling_data, k)
        if debugEnabled:
            print('pre_cost: ', pre_cost)
            print('pre_choice: ', pre_choice)
            print('pre_medoids: ', pre_medoids)

        # Convert pre_choice from sample indexes to whole-data indexes
        pre_choice2 = [data.index(sampling_data[idx]) for idx in pre_choice]
        if debugEnabled:
            print('pre_choice2: ', pre_choice2)

        # Cluster the entire data set around the sampled medoids
        tmp_avg_cost, tmp_medoids = averageCost(data, 0, pre_choice2)
        if debugEnabled:
            print('tmp_avg_cost: ', tmp_avg_cost)
            print('tmp_medoids: ', tmp_medoids)

        # Update the best result so far
        if tmp_avg_cost <= min_avg_cost:
            min_avg_cost = tmp_avg_cost
            best_choice = list(pre_choice2)
            best_res = dict(tmp_medoids)

    return min_avg_cost, best_choice, best_res
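
The snippet above relies on kmedoids and averageCost, which are defined elsewhere. As a sketch of what steps 3-4 of the docstring describe, here is a hypothetical averageCost for plain Euclidean points, mirroring the (cost, medoids) return shape used above; the unused second parameter stands in for the 0 that clara passes.

import math

def averageCost(data, _cost_mode, medoid_idx):
    # Steps 3-4: assign every object to its most similar medoid and return
    # (average dissimilarity, {medoid index: list of member indexes}).
    clusters = {m: [] for m in medoid_idx}
    total = 0.0
    for j, obj in enumerate(data):
        best = min(medoid_idx, key=lambda m: math.dist(obj, data[m]))
        clusters[best].append(j)
        total += math.dist(obj, data[best])
    return total / len(data), clusters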
Example #3
def full_clustering(X,
                    segmentos,
                    n_clus=3,
                    mode="K-Means",
                    silencio=[],
                    pesos=None,
                    normalizar=True):
    '''
    Given a data frame and its segmentation (segment indexes), this function
    applies clustering to the segments and returns the data frame with the
    representation of the segments, together with the fitted model.
    '''
    # Drop any stale cluster column and default to uniform feature weights.
    X.drop('cluster', axis=1, errors='ignore', inplace=True)
    if pesos is None:
        pesos = [1] * (len(list(X._get_numeric_data())) - len(silencio) * 2)

    if mode == "K-Means":

        segments_df = apply_segmentation(X, segmentos, silencio, pesos)

        if normalizar:
            segments_df = minmax_norm(segments_df)

        fit = clustering(segments_df, n_clus)

        segments_df['cluster'] = fit.labels_
        segments_df['cluster'] = segments_df['cluster'].astype(str)

    else:
        X_num = filter_numerical(X)

        if normalizar:
            X_num = minmax_norm(X_num)

        X_num = filter_silence(X_num, silencio)
        X_np = np.array(X_num)

        X_segments = get_segments_nparray(X_np, segmentos)

        _, best_choice, best_res = pam.kmedoids(X_segments, n_clus)

        fit = FTWFit(best_choice, filter_numerical(X))

        segments_df = apply_segmentation(X, segmentos, silencio, pesos)
        segments_df = add_clustering_segments(segments_df, best_choice,
                                              best_res)
        if normalizar:
            segments_df = minmax_norm(segments_df)

    return segments_df, fit
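
minmax_norm, apply_segmentation and the other helpers used here live elsewhere in this project. For orientation, a plausible minimal version of the normalization step, assuming ordinary per-column min-max scaling of the numeric columns:

import pandas as pd

def minmax_norm(df):
    # Scale every numeric column to [0, 1]; constant columns map to 0.
    out = df.copy()
    num = out.select_dtypes('number').columns
    span = (out[num].max() - out[num].min()).replace(0, 1)
    out[num] = (out[num] - out[num].min()) / span
    return out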
Example #4
def clara(data, k, COST=0, distDictClara={}, simDictClara={},
          affinities={}, bagSize=BagSize, namedPoints=True,
          degreeSelection=degreeSelection, claraLoopNum=claraLoopNum,
          noIsolates=True, saveAllResults=False, acceleration=0, take_all_nodes=False):
    '''
    CLARA implementation:
    1. For i = 1 to 5, repeat the following steps:
    2. Draw a sample of 40 + 2k objects randomly from the
        entire data set, and call Algorithm PAM to find
        k medoids of the sample.
    3. For each object Oj in the entire data set, determine
        which of the k medoids is the most similar to Oj.
    4. Calculate the average dissimilarity of the clustering
        obtained in the previous step. If this value is less
        than the current minimum, use this value as the
        current minimum, and retain the k medoids found in
        Step 2 as the best set of medoids obtained so far.
    5. Return to Step 1 to start the next iteration.
    '''
    size = len(data)
    min_cost = float('inf')
    best_choice = []
    best_res = {}
    sampling_idx = []
    cost_list = []
    isolates = []

    print "clara COST: ", COST

    print "take all nodes: ", take_all_nodes
    if take_all_nodes:
        bagSize = len(affinities)

    if saveAllResults:
        allResults=[]

    def IDtoname(data, best_med):
        # Map medoid and member indexes back to the named data points.
        best_med_names = {}
        for medID in best_med.keys():
            best_med_names[data[medID]] = [data[pointID]
                                           for pointID in best_med[medID]]
        return best_med_names

    # If degreeSelection is True, the 4*k highest-degree nodes are always
    # included in the subsample; the rest of the bagSize nodes are sampled
    # randomly as usual.
    if degreeSelection:
        degree = {}
        sampling_data_permanent = []
        sampling_data = []
        def getDegree(item):
            return item[1]

        # Compute sorted list of node degrees
        for i in list(data): # in order to not run out of range
            # remove singletons (treat them as isolates)
            # first compute weighted degree
            # TODO: find out the reasons for it
            try:
                degree[i] = sum(affinities[i].values())
            except KeyError:
                print("not in affinities: ", i)
                degree[i] = 0
            if degree[i] == 0.0 and noIsolates:
                isolates.append(i)
                data.remove(i)

            # Then remove paired nodes the same way as singletons.
            # As algorithm doesn't require absolute connectivity,
            # we deliberately leave components with 3+ nodes, hoping
            # that they will appear to be separate clusters after
            # averaging multiple randomized clara's results with kopt.py.
        for i in list(data):
            try:
                first_neighbour = list(affinities[i].keys())[0]
                if len(affinities[i]) == 1 and len(affinities[first_neighbour]) == 1:
                    isolates.append(i)
                    data.remove(i)
            except KeyError:
                print("LOOK OUT: ", i, affinities.get(i))
        # list rather than dict because dict cannot be ordered
        degree = sorted(degree.items(), key=getDegree, reverse=True)
        print "degrees obtained"

        # Obtain the bag of most prominent nodes for Clara clustering
        limit = k * 4
        if k * 4 > len(data):
            print("used up all data points for degree selection: %d points instead of 4k = %d"
                  % (len(data), 4 * k))
            limit = len(data)
        for point in degree:
            if len(sampling_data_permanent) >= limit:
                break
            sampling_data_permanent.append(point[0])
        # Random pool: everything except the permanently pinned nodes.
        therest = [point for point in data if point not in sampling_data_permanent]
        print("len(therest): ", len(therest))
        print("len(data): ", len(data))
        print("bagSize , bagSize - k*4: ", bagSize, bagSize - k * 4)

    iterspot = 0
    for i in range(claraLoopNum):
        iterspot += 1
        print "\n\nRUN No.", iterspot
        # Construct the sampling subset
        if not degreeSelection:
            sampling_data = []
            sampling_idx = random.sample(range(size), bagSize)
            for idx in sampling_idx:
                sampling_data.append(data[idx])
        else:
            sampling_data = list(sampling_data_permanent)
            sampling_idx = random.sample(range(len(therest)), bagSize - k * 4)
            for idx in sampling_idx:
                sampling_data.append(therest[idx])
        print("all nodes/points:                  ", len(sampling_data))
        if degreeSelection:
            print("permanently selected nodes/points: ", len(sampling_data_permanent))
        # Run kmedoids for the sampling
        pre_cost, pre_choice, pre_medoids = kmedoids(sampling_data, k, COST, distDictKM=distDictClara,
                                                     simDictKM=simDictClara, namedPoints=False,
                                                     acceleration=acceleration)
        if debugEnabled:
            print('pre_cost: ', pre_cost)
            print('pre_choice: ', pre_choice)
            print('pre_medoids: ', pre_medoids)  # pre_medoids are short enough to display

        # Convert the pre_choice from sampling_data to the whole data
        pre_choice2 = []
        for idx in pre_choice:
            idx2 = data.index(sampling_data[idx])
            pre_choice2.append(idx2)
        if debugEnabled:
            print('pre_choice2: ', pre_choice2)

        # Clustering for all data set
        tmp_cost, tmp_medoids = targetFunction(data, COST, pre_choice2, distDict=distDictClara,
                                               simDict=simDictClara, affinities=affinities)
        cost_list.append(tmp_cost)
        if debugEnabled:
            print('tmp_cost: ', tmp_cost)
            print('tmp_medoids: ', 'OK')  # tmp_medoids can be too long to display

        # If the points are named, display the names (refactor it)
        if namedPoints:
            tmp_medoids = IDtoname(data, tmp_medoids)
            pre_choice2 = list(tmp_medoids.keys())

        # Update the best
        if tmp_cost <= min_cost:
            min_cost = tmp_cost
            best_choice = list(pre_choice2)
            best_res = dict(tmp_medoids)

        if saveAllResults:
            allResults.append(tmp_medoids)

    if saveAllResults:
        return min_cost, best_choice, best_res, cost_list, isolates, allResults
    return min_cost, best_choice, best_res, cost_list, isolates
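
The degreeSelection branch above pins the highest-degree nodes into every sample. A toy illustration of that ranking step, with made-up nodes and affinity weights:

# Rank nodes by weighted degree (sum of affinity weights) and pin the
# top 4*k into the permanent part of the CLARA sample.
affinities = {
    'a': {'b': 0.9, 'c': 0.4},
    'b': {'a': 0.9},
    'c': {'a': 0.4, 'd': 0.2},
    'd': {'c': 0.2},
}
k = 1
degree = {node: sum(nbrs.values()) for node, nbrs in affinities.items()}
pinned = sorted(degree, key=degree.get, reverse=True)[:4 * k]
print(pinned)  # ['a', 'b', 'c', 'd'], ordered by descending weighted degree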