def answer(test_path):
    import warnings
    warnings.filterwarnings("ignore")
    import time
    t0 = time.time()

    from learning import process_test_data, training_data, training_answers
    # The private modules sklearn.cluster.k_means_ and sklearn.linear_model.logistic
    # were removed in scikit-learn 0.24; import from the public API instead.
    from sklearn.cluster import KMeans
    from sklearn.linear_model import LogisticRegression

    test_data = process_test_data(test_path)

    km = KMeans()
    km.fit(training_data, training_answers)  # KMeans ignores y; it is accepted only for API compatibility
    myNum = km.predict(test_data).item()

    numX = [1, 2, 4, 2, 7, 0, 2, 7, 4, 3, 2, 1, 4, 5, 5, 1, 3, 0, 4, 2]
    numbers = [[num] for num in numX]
    letX = ['a', 'a', 'o', 'a', 'o', 'o', 'a', 'a', 'o', 'a',
            'a', 'o', 'a', 'o', 'o', 'o', 'a', 'a', 'o', 'a']

    lr = LogisticRegression()
    lr.fit(numbers, letX)  # y must be 1-D; wrapping each label in a list breaks the expected shape
    ans = lr.predict([[myNum]]).item()  # predict expects a 2-D array of samples, not a bare scalar

    t1 = time.time()
    return [ans, t1 - t0]
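# A minimal, self-contained sketch of the same two-stage pattern (a k-means
# cluster id fed into a classifier), with hypothetical stand-ins for the
# `learning` module's training_data / training_answers:
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression

training_data = [[0.1], [0.2], [3.9], [4.1], [7.8], [8.2]]  # made-up 1-D samples

km = KMeans(n_clusters=3, n_init=10, random_state=0).fit(training_data)
cluster_id = km.predict([[4.0]]).item()  # predict takes a 2-D array; .item() unwraps the scalar

numbers = [[1], [2], [4], [7], [0], [5]]
labels = ['a', 'a', 'o', 'a', 'o', 'o']  # labels stay 1-D
lr = LogisticRegression().fit(numbers, labels)
print(lr.predict([[cluster_id]]).item())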
# Assumed import: the wrapper below delegates to scikit-learn's KMeans.
from sklearn.cluster import KMeans as SKLModel


class KMeansImpl():

    def __init__(self, n_clusters=8, init='k-means++', n_init=10,
                 max_iter=300, tol=0.0001, precompute_distances='auto',
                 verbose=0, random_state=None, copy_x=True, n_jobs=None,
                 algorithm='auto'):
        # Note: precompute_distances and n_jobs were deprecated in
        # scikit-learn 0.23 and removed in 1.0; drop them when targeting
        # newer releases.
        self._hyperparams = {
            'n_clusters': n_clusters,
            'init': init,
            'n_init': n_init,
            'max_iter': max_iter,
            'tol': tol,
            'precompute_distances': precompute_distances,
            'verbose': verbose,
            'random_state': random_state,
            'copy_x': copy_x,
            'n_jobs': n_jobs,
            'algorithm': algorithm
        }
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def transform(self, X):
        return self._wrapped_model.transform(X)

    def predict(self, X):
        return self._wrapped_model.predict(X)
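# A short usage sketch for the wrapper, with made-up 2-D points. It assumes a
# scikit-learn version (< 1.0) that still accepts the precompute_distances
# and n_jobs parameters passed through by KMeansImpl.
import numpy as np

X = np.array([[1.0, 2.0], [1.5, 1.8], [5.0, 8.0],
              [8.0, 8.0], [1.0, 0.6], [9.0, 11.0]])

model = KMeansImpl(n_clusters=2, random_state=42).fit(X)
print(model.predict(X))          # cluster index per sample
print(model.transform(X).shape)  # (6, 2): distance from each sample to each center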
# Import service center data
center = pd.read_csv('service_center.csv', encoding='utf-8')

# Import customer data
customer = pd.read_csv('customer.csv', encoding='utf-8')
customer['coordinate'] = customer[['latitude', 'longitude']].apply(tuple, axis=1)

# Standardize longitude and latitude (fit the scaler once and reuse the result
# instead of calling fit_transform separately for each column)
scaler = StandardScaler()
std_coords = scaler.fit_transform(customer[['longitude', 'latitude']])
customer['longitude_std'] = std_coords[:, 0]
customer['latitude_std'] = std_coords[:, 1]

# K-means clustering
kmeans = KMeans(n_clusters=9, init='k-means++', n_init=10, max_iter=300,
                verbose=1, random_state=123)
kmeans.fit(customer[['longitude_std', 'latitude_std']])
customer['groups'] = kmeans.predict(customer[['longitude_std', 'latitude_std']])

# Coordinates of each cluster center, mapped back to the original scale
cluster_center = scaler.inverse_transform(kmeans.cluster_centers_)
cluster_center = pd.DataFrame(cluster_center, columns=['longitude', 'latitude'])
cluster_center['group'] = range(0, 9)

# Plot map (markers for one cluster, group 6)
init_location = [customer.loc[0, 'latitude'], customer.loc[0, 'longitude']]
cluster_map = folium.Map(location=init_location, zoom_start=10)
customer[customer.groups == 6].apply(
    lambda row: folium.CircleMarker(
        location=(row['latitude'], row['longitude']),
        radius=6, fill=True, color='blue').add_to(cluster_map),
    axis=1)
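# The snippet above fixes n_clusters=9; a common way to sanity-check that
# choice is the elbow method on the standardized coordinates. A minimal
# sketch, using hypothetical toy coordinates in place of the customer data:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import pandas as pd

coords = pd.DataFrame({'longitude': [0.0, 0.1, 5.0, 5.1, 9.0, 9.2],
                       'latitude':  [0.0, 0.2, 5.0, 4.9, 9.1, 9.0]})
std = StandardScaler().fit_transform(coords)

for k in range(1, 6):
    inertia = KMeans(n_clusters=k, n_init=10, random_state=123).fit(std).inertia_
    print(k, inertia)  # look for the "elbow" where inertia stops dropping sharply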
class KMeansEstimator:
    """
    This class reads the tweets of users from a file and builds cluster
    centers on that data. It also provides a method for finding the closest
    cluster center for unseen data.

    Note: this code targets Python 2 (binary file mode for csv, str.decode,
    dict.items() concatenation).
    """
    ADJECTIVE = 'JJ'

    # Feature keys used in clustering.
    POLARITY_FEATURE_KEY = 'polarity'
    SUBJECTIVITY_FEATURE_KEY = 'subjectivity'
    TWEET_COUNT_FEATURE_KEY = 'tweetCount'

    # Features not considered for clustering.
    USER_ID_FEATURE_KEY = 'userId'
    LONGITUDE_FEATURE_KEY = 'longitude'
    LATITUDE_FEATURE_KEY = 'latitude'

    # Predicted label feature name.
    LABEL_FEATURE_KEY = 'label'

    RELEVENT_FEATURE_LIST = [USER_ID_FEATURE_KEY, LATITUDE_FEATURE_KEY,
                             LONGITUDE_FEATURE_KEY, LABEL_FEATURE_KEY]

    def __init__(self, tweet_file_path, no_of_clusters):
        """The constructor reads the csv file and builds the data matrix."""
        self.np_extractor = ConllExtractor()
        self.pos_tagger = NLTKTagger()
        self.tweet_file_path = tweet_file_path
        self.data_matrix = self.__get_data_matrix_from_file(tweet_file_path)
        self.vectorizer = DictVectorizer(sparse=True)
        self.k_means_estimator = KMeans(init="random", n_clusters=no_of_clusters)

    @time_it
    def __get_data_matrix_from_file(self, tweet_file_path):
        """
        Reads tweets from the csv file at "tweet_file_path", extracts features
        from the tweets and returns a list of all feature vectors.
        """
        file_reader = csv.reader(open(tweet_file_path, "rb"), delimiter=',')
        next(file_reader)  # skip the header row
        data_matrix = []
        for row in file_reader:
            logging.info("Extracting features for user_id:%s", row[0])
            feature_vector = {}
            feature_vector[self.USER_ID_FEATURE_KEY] = int(row[0])
            feature_vector[self.LATITUDE_FEATURE_KEY] = float(row[2])
            feature_vector[self.LONGITUDE_FEATURE_KEY] = float(row[3])
            feature_vector[self.TWEET_COUNT_FEATURE_KEY] = int(row[4])
            feature_vector.update(
                self.__get_features_from_tweet_text(row[1].decode('utf-8')))
            data_matrix.append(feature_vector)
            logging.info("Successfully extracted features for user_id:%s", row[0])
        return data_matrix

    @time_it
    def __get_features_from_tweet_text(self, tweet_text):
        """Returns the following features extracted from the tweet text:

        - Adjectives and their corresponding frequencies found in the tweet.
          Each adjective is a separate feature.
        - Subjectivity and polarity as determined by TextBlob.

        :returns: (key, value) map of all features found.
        """
        text_blob = TextBlob(tweet_text, np_extractor=self.np_extractor,
                             pos_tagger=self.pos_tagger)
        adjective_map = dict(Counter(ele[0] for ele in set(text_blob.pos_tags)
                                     if ele[1] == self.ADJECTIVE))
        polarity = text_blob.sentiment[0]
        subjectivity = text_blob.sentiment[1]
        # dict.items() concatenation works in Python 2 only; in Python 3 use
        # dict(adjective_map, **{...}) instead.
        return dict(adjective_map.items() +
                    {self.POLARITY_FEATURE_KEY: polarity,
                     self.SUBJECTIVITY_FEATURE_KEY: subjectivity}.items())

    @time_it
    def __get_clustering_data_matrix(self, data_matrix):
        """
        Removes features that are not relevant for building cluster centers
        (such as user_id) from the data matrix and returns a copy of it.
        """
        data_matrix_copy = copy.deepcopy(data_matrix)
        for feature_vector in data_matrix_copy:
            feature_vector.pop(self.USER_ID_FEATURE_KEY)
            feature_vector.pop(self.LATITUDE_FEATURE_KEY)
            feature_vector.pop(self.LONGITUDE_FEATURE_KEY)
        return data_matrix_copy

    @time_it
    def perform_clustering(self, features_to_include=None):
        """
        Performs k-means clustering with "no_of_clusters" clusters on the data
        in the file at "tweet_file_path". Returns a list of feature vectors,
        where each feature vector contains only "features_to_include", or all
        features if "features_to_include" is None.
        """
        clustering_data_matrix = self.__get_clustering_data_matrix(self.data_matrix)
        transformed_data_matrix = self.vectorizer.fit_transform(clustering_data_matrix)
        self.k_means_estimator.fit(transformed_data_matrix, y=None)
        return self.__get_predicted_labels(self.data_matrix, features_to_include)

    @time_it
    def __get_predicted_labels(self, data_matrix, features_to_include):
        """
        Finds the nearest cluster for all data points and adds a new "label"
        feature to every feature vector of the data matrix. The data matrix is
        modified in place. Returns a new copy of data_matrix with
        "features_to_include" features.
        """
        feature_names = self.vectorizer.get_feature_names()
        for feature_vector in data_matrix:
            row = [0] * len(feature_names)
            column = range(len(feature_names))
            data = map(lambda feature_name: feature_vector[feature_name]
                       if feature_name in feature_vector else 0,
                       feature_names)
            feature_csr_matrix = csr_matrix(coo_matrix((data, (row, column))))
            predicted_label = self.k_means_estimator.predict(feature_csr_matrix)
            feature_vector[self.LABEL_FEATURE_KEY] = predicted_label[0]
        expanded_data_matrix = self.__get_expanded_data_matrix(data_matrix)
        if features_to_include:
            return self.__get_filtered_data_matrix(expanded_data_matrix,
                                                   features_to_include)
        else:
            return expanded_data_matrix

    @time_it
    def __get_filtered_data_matrix(self, data_matrix, features_to_include):
        """Removes all features except features_to_include."""
        filtered_data_matrix = []
        for feature_vector in data_matrix:
            filtered_feature_vector = {}
            for feature_name in features_to_include:
                filtered_feature_vector[feature_name] = feature_vector[feature_name]
            filtered_data_matrix.append(filtered_feature_vector)
        return filtered_data_matrix

    @time_it
    def __get_expanded_data_matrix(self, data_matrix):
        """
        Adds keys (with value 0) for missing features to all feature vectors
        of data_matrix. The data matrix is not modified; a new modified copy
        is returned.
        """
        feature_names = self.vectorizer.get_feature_names()
        expanded_data_matrix = copy.deepcopy(data_matrix)
        for feature_vector in expanded_data_matrix:
            for feature_name in feature_names:
                if feature_name not in feature_vector:
                    feature_vector[feature_name] = 0
        return expanded_data_matrix

    @time_it
    def predict_labels_for_data(self, file_path, features_to_include=None):
        """
        Reads the tweets of different users from the file at file_path and
        assigns the closest cluster center to each user. Returns a list of
        tuples of (user_id, predicted_label, latitude, longitude).
        """
        data_matrix = self.__get_data_matrix_from_file(file_path)
        return self.__get_predicted_labels(data_matrix, features_to_include)
print "output_file -- path to save the k-means model" exit(1) mfcc_csv_file = sys.argv[1] output_file = sys.argv[3] cluster_num = int(sys.argv[2]) mfcc_vectors = np.genfromtxt(mfcc_csv_file, delimiter=";") kmeans_model = KMeans(n_clusters=cluster_num, init='k-means++', n_init=10) kmeans_model.fit(mfcc_vectors) pickle.dump(kmeans_model, open('kmeans_model.pickle', 'wb')) print "K-means trained successfully!" # create_kmeans.py mfcc_file = "HVC2403.mfcc.csv" kmeans_model = pickle.load(open('kmeans_model.pickle', 'rb')) array = np.genfromtxt(mfcc_file, delimiter=";") print len(array) words = kmeans_model.predict(np.genfromtxt(mfcc_file, delimiter=";")) print "length of words in this audio " + str(len(words)) print "words " + str(words) freq_per_cluster = np.bincount(words) print "freq_per_cluster " + str( len(freq_per_cluster)) + " " + str(freq_per_cluster) non_zero_clusters = np.nonzero(freq_per_cluster)[0] print "non zero cluster freq " + str( zip(non_zero_clusters, freq_per_cluster[non_zero_clusters]))