def test_kmodes_predict_soybean(self):
    kmodes_cao = KModes(n_clusters=4, init='Cao', verbose=2)
    kmodes_cao = kmodes_cao.fit(SOYBEAN)
    result = kmodes_cao.predict(SOYBEAN2)
    expected = np.array([2, 1, 3, 0])
    assert_cluster_splits_equal(result, expected)
    self.assertTrue(result.dtype == np.dtype(np.uint16))
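The test above relies on the SOYBEAN / SOYBEAN2 fixtures and an assert_cluster_splits_equal helper that are not shown in this snippet. As a rough sketch (not the kmodes project's actual implementation), such a helper only needs to verify that two labelings induce the same partition of the data up to a renaming of cluster ids:

import numpy as np

def _canonical(labels):
    # Relabel clusters by order of first appearance so label ids are comparable.
    order = {}
    return np.array([order.setdefault(l, len(order)) for l in labels])

def assert_cluster_splits_equal(result, expected):
    # Two labelings describe the same clustering iff their canonical forms match.
    np.testing.assert_array_equal(_canonical(result), _canonical(expected))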
Example #2
def test_kmodes_predict_soybean_ng(self):
    kmodes_cao = KModes(n_clusters=4, init='Cao', verbose=2, cat_dissim=ng_dissim)
    kmodes_cao = kmodes_cao.fit(SOYBEAN)
    result = kmodes_cao.predict(SOYBEAN2)
    expected = np.array([2, 1, 3, 0])
    assert_cluster_splits_equal(result, expected)
    self.assertTrue(result.dtype == np.dtype(np.uint8))
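This variant swaps in the Ng et al. dissimilarity via cat_dissim. Outside the test harness, a self-contained usage sketch might look like the following (synthetic categorical data stands in for the SOYBEAN fixtures; this is only an illustration, not part of the original tests):

import numpy as np
from kmodes.kmodes import KModes
from kmodes.util.dissim import ng_dissim

rng = np.random.RandomState(42)
data = rng.randint(0, 4, size=(100, 6))  # 100 rows, 6 categorical attributes

km = KModes(n_clusters=4, init='Cao', n_init=2, cat_dissim=ng_dissim, verbose=0)
labels = km.fit_predict(data)
print(labels[:10], km.cost_)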
Example #3
#print(housing_binary)
#print(len(housing_binary))
#print(u_housing[i_housing])
#print(len(u_housing[i_housing]))


# LIFT

# Cluster the case-level categorical features with k-modes.
km = KModes(n_clusters=5)
#kmeans = KMeans(n_clusters = 5)
X = np.vstack((i_plaintiff, i_judgment_type, i_judgment_method, d_a, g_num))
X = np.transpose(X)
X = np.hstack((X, low))
#print(X)
km.fit(X)
y_km = km.predict(X)
#print(X[0,:])
#print(X[1,:])

# Plot the cases by location, coloured by cluster assignment.
plt.scatter(latitude, longitude, c=y_km, s=50, cmap='winter')
plt.xlabel('Latitude')
plt.ylabel('Longitude')
plt.show()

#print(y_km)


units = np.array(df['units'])
number = np.nonzero(units)
print(number)
number = np.array(number)
units_final = units[number]
Example #4
def fit(qual_id, count):
    # More than 500 documents results in slow training
    if count >= 500:
        count = 500
    # Query for the passed qual_id, restricted to incorrect answers
    # May take a lengthy amount of time. Recommend optimizing the query.
    # print("Querying for", qual_id)
    data = collection.find({"qual_id": qual_id, "correct": False})[:count]
    # print("Query complete.")

    # Compile dictionary of all possible features in given list of records
    # print("Compiling dictionary of features.")
    features = {}
    for doc in data:
        doc_features = {}
        if doc['response'] is None:
            continue
        doc_features = retrieveKeys(doc['response'], doc_features)
        features = mergeFeatures(doc_features, features)
    # print("Feature compilation complete.")

    # Count number of features
    length = countFeatures(features)
    if length == 0:
        return

    # Reuse queried documents.
    data = data.rewind()

    # Append missing features to all records and assign common benign value.
    # Current benign value is an empty string.
    # print("Appending features to documents.")
    student_data = np.array([])
    for doc in data:
        if doc['response'] is None:
            continue
        else:
            temp = np.array([])
            temp = addFeatures(features, temp, doc['response'])
            if len(student_data) == 0:
                student_data = np.append(student_data, temp)
                student_data = np.reshape(student_data, (-1, length))
            else:
                student_data = np.append(student_data, [temp], axis=0)
    # print("Finished appending features to documents.")

    # Perform k-modes clustering
    # print("Clustering...")
    clusters = len(student_data)
    # K-modes implementation can't generate more than 255 centroids
    if clusters > 255:
        clusters = 255
    km = KModes(n_clusters=clusters, init='Cao', n_init=4, verbose=False)
    # print("Finished.")
    km.fit(student_data)

    # Print important information from clustering
    # Centroids are the most common values within each cluster
    centroids = km.cluster_centroids_
    # print("Centroids")
    # print(centroids)

    # Labels is a list indicating which cluster each record belongs to
    labels = km.labels_
    # print("Labels")
    # print(labels)

    # Cost is a value indicating possible error in the clusters. The ideal
    # value is 0.0. If the value is greater than 0.0, then the maximum number
    # of clusters was generated and some responses were assigned to an inexact
    # cluster. This would result in the largest cluster containing documents
    # it shouldn't. Recommend re-clustering with fewer documents or more
    # clusters if possible.
    cost = km.cost_
    # print("Cost")
    # print(cost)

    # The 5 largest cluster labels and the number of records per cluster.
    most_common = Counter(labels).most_common(5)
    # print("Most populated centroids")
    # print(most_common)

    # Generate cluster dictionary to be inserted in the centroid_db.
    # Qual_id: qual_id of given documents
    # Features: Dictionary of all possible features in passed documents.
    # Centroids: List of generated centroids.
    # Cluster_sizes: Number of documents in each cluster.
    # Behavioral_traits: Behavioral traits associated with at least one
    # document assigned to the given centroid.
    # Screenshot_urls: A screenshot from one document within each cluster.
    # Centroids and behavioral_traits have the same lengths. The behavioral
    # traits at a given index of behavioral_traits are associated with the
    # same index of centroids.
    post = {
        'qual_id': qual_id,
        'features': features,
        'centroids': {},
        'cluster_sizes': {},
        'behavioral_traits': {},
        'screenshot_urls': {}
    }

    for i in Counter(labels).most_common(len(centroids)):
        if str(i[0]) not in post['cluster_sizes']:
            post['cluster_sizes'][str(i[0])] = str(i[1])

    for i in range(len(centroids.tolist())):
        if str(i) not in post['centroids']:
            post['centroids'][str(i)] = centroids.tolist()[i]

    # Reuse queried documents.
    data = data.rewind()
    label = 0
    for doc in data:
        if doc['response'] is None:
            continue
        elif str(labels[label]) not in post['screenshot_urls']:
            post['screenshot_urls'][str(labels[label])] = doc['screenshot_url']
            label += 1
        else:
            label += 1

    # Reuse queried documents.
    data = data.rewind()

    # Add associated behavioral traits to cluster dictionary.
    for doc in data:
        if doc['response'] is None:
            continue
        else:
            temp = np.array([])
            temp = addFeatures(features, temp, doc['response'])
            temp = np.reshape(temp, (-1, length))
            label = km.predict(temp)[0]
            if str(label) not in post['behavioral_traits']:
                post['behavioral_traits'][str(
                    label)] = doc['behavioral_traits']

    # Add generated cluster dictionary to centroid_db.
    # If a record shares the same qual_id as the generated cluster dictionary,
    # then the stored record will be overwritten.
    # print("Posting centroids to database centroids.")
    centroid_db.replace_one({'qual_id': qual_id}, post, upsert=True)
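The fit() snippet above calls four helpers (retrieveKeys, mergeFeatures, countFeatures, addFeatures) whose definitions are not included. The following is only a sketch of what they might look like, inferred from how they are called here: responses are treated as possibly nested dicts, and every document is flattened into a fixed-length vector whose missing entries get the benign value '' (empty string).

import numpy as np

def retrieveKeys(response, doc_features):
    # Record every (possibly nested) key present in one response.
    for key, value in response.items():
        doc_features[key] = retrieveKeys(value, {}) if isinstance(value, dict) else {}
    return doc_features

def mergeFeatures(doc_features, features):
    # Union of all feature keys seen so far across documents.
    for key, value in doc_features.items():
        features.setdefault(key, {})
        if isinstance(value, dict) and value:
            features[key] = mergeFeatures(value, features[key])
    return features

def countFeatures(features):
    # Number of leaf features in the nested feature dictionary.
    return sum(countFeatures(v) if v else 1 for v in features.values())

def addFeatures(features, vector, response):
    # Flatten one response into a vector aligned with the feature dictionary,
    # using '' for features the response does not contain.
    for key, value in features.items():
        if value:
            sub = response.get(key, {}) if isinstance(response, dict) else {}
            vector = addFeatures(value, vector, sub if isinstance(sub, dict) else {})
        else:
            leaf = response.get(key, "") if isinstance(response, dict) else ""
            vector = np.append(vector, str(leaf))
    return vector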
def test_kmodes_predict_unfitted(self):
    kmodes_cao = KModes(n_clusters=4, init='Cao', verbose=2)
    with self.assertRaises(AssertionError):
        kmodes_cao.predict(SOYBEAN)
    with self.assertRaises(AttributeError):
        kmodes_cao.cluster_centroids_
    def create_data(
        X: dt.Frame = None
    ) -> Union[str, List[str], dt.Frame, List[dt.Frame], np.ndarray,
               List[np.ndarray], pd.DataFrame, List[pd.DataFrame]]:
        if X is None:
            return []

        # check the datatype of user-defined columns
        if not isinstance(include_columns, list):
            raise ValueError("Variable: 'include_columns' should be <list>")
        if not isinstance(ignore_columns, list):
            raise ValueError("Variable: 'ignore_columns' should be <list>")
        if not isinstance(num_clusters, int):
            raise ValueError("Variable: 'num_clusters' should be <int>")

        ## validate user-inputs and override the columns given by user
        features = list(X.names)
        if len(include_columns) > 0:
            for _ in include_columns:
                if _ not in list(X.names):
                    raise ValueError("Column: '" + str(_) +
                                     "' is not present in the dataset")
            features = include_columns

        ## list to ignore specific columns given by user
        features = [_f for _f in features if _f not in ignore_columns]

        ## handle columns with missing values
        ignore_ = []
        X_df = X.to_pandas()
        for col in features:
            # label encode categorical columns
            # refer - https://github.com/h2oai/driverlessai-recipes/pull/68#discussion_r365133392

            if X_df[col].dtype == "object":
                X_df[f"{col}_enc"] = LabelEncoder().fit_transform(
                    X_df[col].to_numpy())
                ignore_.append(col)

            miss_percent = X_df[col].isna().sum() / X_df.shape[0]
            if miss_percent >= 0.3:  # ignore columns having more than 30% missing values
                ignore_.append(col)
            elif miss_percent > 0.0:  # impute by mean for other columns with missing values
                X_df[col] = X_df[col].fillna(X_df[col].mean())

        features = [f for f in features if f not in ignore_]
        features += [_f for _f in X_df.columns if "_enc" in _f]
        if len(features) == 0:
            raise ValueError("Unable to cluster: No useful features available")

        X_clust = X_df[features].values

        # Apply min max scaling
        X_clust = MinMaxScaler().fit_transform(X_clust)

        # Go through possible numbers of clusters
        best_score = None
        best_n_clust = None
        best_clust_ids = None

        ## if the number of clusters is pre-defined by the user, don't search for the optimal one
        if num_clusters > 1:
            model = KModes(n_clusters=num_clusters,
                           n_jobs=NUM_JOBS).fit(X_clust)
            clust_ids = model.predict(X_clust).astype(np.int32)
            score = my_davies_bouldin_score(X_clust, clust_ids)
            best_score = score
            best_n_clust = num_clusters
            best_clust_ids = clust_ids

        else:
            for n_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS,
                                    CLUSTER_STEP_SIZE):
                model = KModes(n_clusters=n_clusters,
                               n_jobs=NUM_JOBS).fit(X_clust)
                clust_ids = model.predict(X_clust).astype(np.int32)
                score = my_davies_bouldin_score(X_clust, clust_ids)
                improve = False
                if best_score is None:
                    improve = True
                elif best_score > score:
                    improve = True

                if improve:
                    best_score = score
                    best_n_clust = n_clusters
                    best_clust_ids = clust_ids

        if best_score is None:
            return []
        else:
            X[:, f'kmodes{best_n_clust}'] = dt.Frame(best_clust_ids)
        return X
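my_davies_bouldin_score is not defined in this snippet. A plausible minimal stand-in (purely an assumption, not the recipe's actual helper) would wrap scikit-learn's Davies-Bouldin index, which the search above minimizes, and guard the degenerate single-cluster case:

import numpy as np
from sklearn.metrics import davies_bouldin_score

def my_davies_bouldin_score(X, labels):
    # Lower is better; the loop above keeps the clustering with the smallest score.
    if len(np.unique(labels)) < 2:
        return float("inf")  # the index is undefined for a single cluster
    return davies_bouldin_score(X, labels)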
Example #7
def fit(qual_id, count):
    # More than 500 documents results in slow training
    if count >= 500:
        count = 500
    # Query for the passed qual_id, restricted to incorrect answers
    # May take a lengthy amount of time. Recommend optimizing the query.
    if FLAG_VERBOSE:
        print("Querying for", qual_id)
    data = collection.find({"qual_id": qual_id, "correct": False})[:count]
    if FLAG_VERBOSE:
        print("Query complete.")

    # Compile dictionary of all possible features in given list of records
    if FLAG_VERBOSE:
        print("Compiling dictionary of features.")
    num_examples = 0
    num_empty = 0
    features = {}
    for doc in data:
        doc_features = {}
        if doc['response'] is None:
            num_empty += 1
            continue
        doc_features = retrieveKeys(doc['response'], doc_features)
        features = mergeFeatures(doc_features, features, "")
        num_examples += 1
    if FLAG_VERBOSE:
        print("Feature compilation complete.")

    # Count number of features
    num_features = countFeatures(features)
    if FLAG_VERBOSE:
        print("*** Number of features: {}".format(num_features))
        print(
            "*** Number of non-empty records for [Q_ID:{}]: {}. (dropped {} with empty resp)"
            .format(qual_id, num_examples, num_empty))
    if num_features == 0:
        return

    # Reuse queried documents.
    data = data.rewind()

    # Append missing features to all records and assign common benign value.
    # Current benign value is an empty string.
    # print("Appending features to documents.")
    # faster to create a zeroed np array first, rather than appending
    student_data = np.zeros((num_examples, num_features), dtype='<U32')
    i = 0
    for doc in data:
        if doc['response'] is None:
            continue
        else:
            temp = addFeatures(features, [], doc['response'])
            student_data[i, :] = temp
            i += 1
    if FLAG_VERBOSE:
        print("Finished appending features to documents.")
        print(student_data)
    #print("*** Features: ***")
    #pprint(interpretFeatures(features, []))

    # print feature vectors
    #print("*** FEATURE VECTOR: ***")
    #i = 0
    #for row in student_data:
    #    print("[{}]: {}".format(i, row))
    #    i += 1
    #print(repr(student_data))

    # Perform k-modes clustering
    print("Clustering...")
    clusters = NUM_CLUSTERS
    # K-modes implementation can't generate more than 255 centroids
    if clusters > 255:
        clusters = 255
    if clusters > len(student_data):
        clusters = len(student_data)
    km = KModes(n_clusters=clusters, init='Cao', n_init=4, verbose=False)
    km.fit(student_data)
    print("Finished.")

    # Print important information from clustering
    # Centroids are common values to each cluster
    centroids = km.cluster_centroids_
    if FLAG_VERBOSE:
        print("*** CENTROIDS: ***")
        print(centroids)

    # Labels is a list indicating which cluster each record belongs to
    labels = km.labels_
    if FLAG_VERBOSE:
        print("*** LABELS: ***")
        print(labels)

    # Cost is a value indicating possible error in the clusters. The ideal value is 0.0.
    if FLAG_VERBOSE:
        cost = km.cost_
        print("*** COST: ***")
        print(cost)

    # Prints 5 largest cluster labels and number of records per cluster.
    if FLAG_VERBOSE:
        most_common = Counter(labels).most_common(5)
        print("Most populated centroids")
        print(most_common)

    # Generate cluster dictionary to be inserted in the centroid_db.
    # Qual_id: qual_id of given documents
    # Features: Dictionary of all possible features in passed documents.
    # Centroids: List of generated centroids.
    # Behavioral_traits: Behavioral traits associated with at least one
    # document assigned to the given centroid.
    # Centroids and behavioral_traits have the same lengths. The behavioral
    # traits at a given index of behavioral_traits are associated with the
    # same index of centroids.
    if FLAG_USE_CENTROID_DB:
        post = {
            'qual_id': qual_id,
            'features': features,
            'centroids': centroids.tolist(),
            'behavioral_traits': {}
        }

        # Reuse queried documents.
        data = data.rewind()

        # Add associated behavioral traits to cluster dictionary.
        for doc in data:
            if doc['response'] is None:
                continue
            else:
                temp = np.array([])
                temp = addFeatures(features, temp, doc['response'])
                temp = np.reshape(temp, (-1, num_features))
                label = km.predict(temp)[0]
                if str(label) not in post['behavioral_traits']:
                    post['behavioral_traits'][str(
                        label)] = doc['behavioral_traits']

        # Add generated cluster dictionary to centroid_db.
        # If a record shares the same qual_id as the generated cluster dictionary,
        # then the stored record will be overwritten.
        print("Posting centroids to database centroids.")
        centroid_db.replace_one({'qual_id': qual_id}, post, upsert=True)
        print(qual_id, "complete.")
        print()

    if FLAG_DO_ANALYSIS:
        # perform some automatic EDA on largest clusters and save
        # collect ids of examples
        data = data.rewind()
        X_ids = []
        for doc in data:
            if doc['response'] is None:
                continue
            else:
                X_ids.append(doc['_id'])
        out_dir = ANALYS_OUT_DIR
        if out_dir is None:
            out_dir = "./out/" + str(qual_id)
        analys = cluster_analyzer(collection, out_dir)
        analys.analyze(student_data, labels, centroids, X_ids, qual_id,
                       interpretFeatures(features, []))
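interpretFeatures, used to label the feature vectors for the analysis step, is also not shown. Assuming the same nested feature dictionary as in the helper sketch above, a hypothetical version would simply flatten it into the ordered list of feature names:

def interpretFeatures(features, names, prefix=""):
    # Flatten the nested feature dictionary into an ordered list of
    # human-readable feature names, matching the order used by addFeatures.
    for key, value in features.items():
        if value:
            names = interpretFeatures(value, names, prefix + key + ".")
        else:
            names.append(prefix + key)
    return names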
Example #8
def test_kmodes_predict_unfitted(self):
    kmodes_cao = KModes(n_clusters=4, init='Cao', verbose=2)
    with self.assertRaises(AssertionError):
        kmodes_cao.predict(SOYBEAN)
    with self.assertRaises(AttributeError):
        kmodes_cao.cluster_centroids_
Example #9
print()

## e)
dataFrame['Type'] = dataFrame['Type'].astype('category')
dataFrame['Origin'] = dataFrame['Origin'].astype('category')
dataFrame['DriveTrain'] = dataFrame['DriveTrain'].astype('category')
dataFrame['Cylinders'] = dataFrame['Cylinders'].astype('category')

cat_col = dataFrame.select_dtypes(['category']).columns
df = dataFrame[cat_col].apply(lambda x: x.cat.codes)

km = KModes(n_clusters=3, init='Huang', random_state=555)
clusters = km.fit(df)

cents = km.cluster_centroids_
predict_results = km.predict(df)
unique, counts = np.unique(predict_results, return_counts=True)
num_obs_in_each_cluster = dict(zip(unique, counts))


def showResult(i):
    # Report the sizes of the three clusters starting at label i.
    print("The number of observations in cluster 1: %d" %
          num_obs_in_each_cluster[i])
    print("The number of observations in cluster 2: %d" %
          num_obs_in_each_cluster[i + 1])
    print("The number of observations in cluster 3: %d" %
          num_obs_in_each_cluster[i + 2])


showResult(0)
Example #10
#train['total_miss_square']=train['total_miss']**2
#test['total_miss_square']=test['total_miss']**2
#for n in bincol+normcol04+normcol59+monthday:
##    _,test[n+'_freq']=FreqEncode(train[n],test[n])
#    _,test[n+'_target']=TargetEncode(train[n],test[n],target)
##    train[n+'_miss']=train[n].isna()
##    test[n+'_miss']=test[n].isna()
#te=ce.TargetEncoder(smoothing=0.3)
#te.fit(train,target)
#test=te.transform(test)
#==================k mode clustering========
from kmodes.kmodes import KModes

km = KModes(n_clusters=2, init="Cao", n_init=1, verbose=1, random_state=1990)
train['cluster'] = km.fit_predict(train_cluster)
test['cluster'] = km.predict(test_cluster)
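train_cluster and test_cluster are not built in this fragment; presumably they are the categorical subsets of train and test. A hypothetical construction (the column selection and fill value are assumptions, not the original code):

# Hypothetical construction of the clustering inputs used above.
cat_cols = [c for c in train.columns if train[c].dtype == "object"]
train_cluster = train[cat_cols].fillna("missing").astype(str)
test_cluster = test[cat_cols].fillna("missing").astype(str)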
##==========test independence
#import scipy.stats as scs
#
#def chi_square_of_df_cols(df, col1, col2):
#    df_col1, df_col2 = df[col1], df[col2]
#
#    result = [[sum((df_col1 == cat1) & (df_col2 == cat2))
#               for cat2 in df_col2.unique()]
#              for cat1 in df_col1.unique()]
#
#    return scs.chi2_contingency(result)
#
#chi_matrix=np.zeros([len(train_cluster.columns),len(train_cluster.columns)])
#for i,r in enumerate(train_cluster.columns):
#    for j,c in enumerate(train_cluster.columns):
Example #11
# Elbow-method search for the number of clusters (disabled here). For k-modes
# the quantity to track is the clustering cost_, not the centroids themselves.
#wcss = []
#for i in range(1, 30):
#    kmodes = KModes(n_clusters=i, init='Huang', n_init=5, verbose=1)
#    kmodes.fit(data1)
#    wcss.append(kmodes.cost_)

#plt.plot(range(1, 30), wcss)
#plt.title("The elbow method")
#plt.xlabel("The number of clusters")
#plt.ylabel("WCSS")
#plt.show()

#wcss  # undefined unless the elbow search above is re-enabled
"""**Kmode Model Creation and prediction**"""

km = KModes(n_clusters=23, init='Huang', n_init=5, verbose=1)

km = km.fit(data1)

clusters = km.predict(data1)
# Print the cluster centroids
print(km.cluster_centroids_)
"""**Storing My Prediction to CSV file**"""

k = pd.DataFrame()

k['output'] = clusters

k.to_csv("outpt.csv")