コード例 #1
0
def evaluate_model(model, model_type, num_of_classes, candidate_feature_set, data_set):
    '''Cluster the data on a candidate feature subset and return its score.

    The score is whatever the model's calculate_performance() returns
    (an LDA-like objective function).

    Arguments:
    model -- fallback model object; it is only used when model_type is
        neither "Kmeans" nor "HAC", otherwise a fresh model replaces it
    model_type -- "Kmeans" or "HAC", selecting which model class is built
    num_of_classes -- value handed to the KMeans / HAC constructor
    candidate_feature_set -- sequence of 0/1 flags, one per feature
        (1 = use the feature, 0 = leave it out)
    data_set -- 2-D array indexed as data_set[:, column_indices]

    Returns the value of model.calculate_performance() after clustering.
    '''
    # Convert the flag representation f_1, ..., f_d into the list of indices
    # whose flag is 1 (for example, [1 0 0 1 0] -> [0 3]).
    # enumerate() avoids the Python-2-only xrange(len(...)) index loop.
    selected_columns = [
        idx for idx, flag in enumerate(candidate_feature_set) if flag == 1
    ]

    # Build a fresh model for the known types; any other model_type keeps
    # the caller-supplied model untouched.
    if model_type == "Kmeans":
        model = KMeans(num_of_classes)
    elif model_type == "HAC":
        model = HAC(num_of_classes)

    model.cluster(data_set[:, selected_columns])
    return model.calculate_performance()
コード例 #2
0
def perform_SFS_feature_selection(model, model_type, num_of_classes, data_set):
    '''Greedy sequential forward selection (SFS) of feature columns.

    Starting from an empty subset, each round tentatively adds every
    remaining feature in turn, clusters the data restricted to that
    candidate subset and scores it with model.calculate_performance().
    The best feature of the round is kept only if it improves on the best
    score of the previous rounds; otherwise the search stops.

    Arguments:
    model -- fallback model object; it is only used when model_type is
        neither "Kmeans" nor "HAC", otherwise a fresh model is built for
        every candidate subset
    model_type -- "Kmeans" or "HAC", selecting which model class is built
    num_of_classes -- value handed to the KMeans / HAC constructor
    data_set -- 2-D array; data_set.shape[1] is the number of features

    Returns a tuple (chosen_features, chosen_clusters): the selected column
    indices in the order they were added, and the value returned by
    model.cluster() for that final subset ([] if nothing was selected).
    '''
    # Indices of the features that have not been selected yet.
    feature_set = list(range(data_set.shape[1]))
    chosen_features = []
    chosen_clusters = []
    base_performance = float("-inf")

    # While there are still features to choose from...
    while feature_set:
        # Best candidate seen in this round.
        best_performance = float("-inf")
        best_feature = None
        best_clusters = []

        for feature in feature_set:
            # Candidate subset = features kept so far + the one on trial.
            candidate_columns = chosen_features + [feature]

            if model_type == "Kmeans":
                model = KMeans(num_of_classes)
            elif model_type == "HAC":
                model = HAC(num_of_classes)

            # Cluster on the candidate columns only. Clustering the full
            # data set here would give every candidate the same input and
            # make the selection meaningless.
            clusters = model.cluster(data_set[:, candidate_columns])
            # Score via the model's LDA-like objective function.
            current_performance = model.calculate_performance()

            if current_performance > best_performance:
                best_performance = current_performance
                best_feature = feature
                best_clusters = clusters

        # Keep the round's winner only if it beats the best score so far;
        # otherwise adding more features no longer helps, so stop.
        if best_performance > base_performance:
            base_performance = best_performance
            feature_set.remove(best_feature)
            chosen_features.append(best_feature)
            chosen_clusters = best_clusters
        else:
            break

    return chosen_features, chosen_clusters
コード例 #3
0
   data_file = open(test[0])
   print "Running with %s" % test[0]
   for line in data_file:
       line_split = line.split(',')
       data_instances.append(map(float, line_split))
   data_instances = np.array(data_instances)

   # Run SFS using k-means and HAC
   kmeans_model = KMeans(test[1])
   hac_model = HAC(test[1])

   # Glass dataset
   if "glass" in test[0]:
      kmeans_sfs_glass = np.array([1,3])
      kmeans_model.cluster(data_instances[:,kmeans_sfs_glass])
      print "Kmeans SFS glass performance = %f" % kmeans_model.calculate_performance()

      kmeans_ga_glass = np.array([0,1,2,3,4,5,6])
      kmeans_model = KMeans(test[1])
      kmeans_model.cluster(data_instances[:,kmeans_ga_glass])
      print "Kmeans GA glass performance = %f" % kmeans_model.calculate_performance()

      hac_sfs_glass = np.array([0])
      hac_model.cluster(data_instances[:,hac_sfs_glass])
      print "HAC SFS glass performance = %f" % hac_model.calculate_performance()

   # Iris dataset
   elif "iris" in test[0]:
      kmeans_sfs_iris = np.array([1])
      kmeans_model = KMeans(test[1])
      kmeans_model.cluster(data_instances[:,kmeans_sfs_iris])
コード例 #4
0
ファイル: main.py プロジェクト: sungwons/Machine_Learning
    print "Running with %s" % test[0]
    for line in data_file:
        line_split = line.split(',')
        data_instances.append(map(float, line_split))
    data_instances = np.array(data_instances)

    # Run SFS using k-means and HAC
    kmeans_model = KMeans(test[1])
    hac_model = HAC(test[1])

    # Glass dataset
    if "glass" in test[0]:
        kmeans_sfs_glass = np.array([1, 3])
        kmeans_model.cluster(data_instances[:, kmeans_sfs_glass])
        print("K-means SFS glass performance = %f" %
              kmeans_model.calculate_performance())

        kmeans_ga_glass = np.array([0, 1, 2, 3, 4, 5, 6])
        kmeans_model = KMeans(test[1])
        kmeans_model.cluster(data_instances[:, kmeans_ga_glass])
        print("K-means GA glass performance = %f" %
              kmeans_model.calculate_performance())

        hac_sfs_glass = np.array([0])
        hac_model.cluster(data_instances[:, hac_sfs_glass])
        print("HAC SFS glass performance = %f" %
              hac_model.calculate_performance())

    # Iris dataset
    elif "iris" in test[0]:
        kmeans_sfs_iris = np.array([1])