def get_data_social(self, ids):
    ''' Read the social features from the database. '''
    # data = self.get_social_features(ids)

    # First get some aggregated values
    boards_info = self.get_boards_info()
    repinned_info = self.get_repinned_info()
    uncateg, categ_entropy = self.get_users_categories_features()

    query = """SELECT p.id as pin_id, u.id as user_id,
                      p.nComments as comments, p.category as category,
                      p.description as description, p.isRepin as is_repin,
                      p.date as date, u.gender as gender,
                      u.nFollowers as followers, u.nFollowing as following,
                      u.nPins as pins, u.nBoards as boards,
                      (u.website != "null") as has_website,
                      p.board_id as board_id
               FROM pins p JOIN users u ON p.user_id = u.id"""

    # Make query, get results and represent as map {pin_id: data} for quick access
    c = self.db.cursor()
    c.execute(query)
    rows_map = {row[0]: row[1:] for row in c.fetchall()}
    c.close()

    # Store concepts as a dict per row (pin)
    data = []
    for pin_id in ids:
        (user_id, ncomments, categ, desc, is_repin, date, gender,
         nfollowers, nfollowing, npins, nboards, has_web,
         board_id) = rows_map[pin_id]
        f = {}
        # Convert to string to emphasize that this feature is categorical
        # f["ncomments"] = ncomments
        f["category"] = categ
        f["description_len"] = len(desc)
        f["is_repin"] = is_repin
        f["gender"] = gender
        # f["user_followers"] = nfollowers
        f["user_following"] = nfollowing
        f["users_pins"] = npins
        f["users_boards"] = nboards
        f["has_website"] = has_web
        f["is_product"] = (1 if '$' in desc else 0)
        f["day_of_the_week"] = (date.strftime("%a") if date else "")
        if nfollowers == 0:
            nfollowers = 1
        # f["follow_ratio"] = float(nfollowing) / nfollowers
        board_pins, board_followers = boards_info[board_id]
        f["board_pins"] = board_pins  # Total pins of the board
        # f["board_followers"] = board_followers  # Total followers of the board
        f["category_entropy"] = categ_entropy[user_id]
        f["uncategorized"] = uncateg[user_id]
        f["repinned"] = repinned_info[user_id]
        data.append(f)
    # data = data[0:4,:]

    # Convert categorical features to numerical representation
    vec = DictVectorizer()
    data = vec.fit_transform(data).toarray()
    return vec.get_feature_names(), data
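# A minimal, self-contained sketch of the DictVectorizer step above with
# toy feature dicts (keys and values here are illustrative, not real pin
# data): string-valued entries such as "category" and "gender" are one-hot
# encoded, while numeric entries pass through unchanged.
from sklearn.feature_extraction import DictVectorizer

toy_rows = [
    {"category": "food", "description_len": 42, "is_repin": 1, "gender": "female"},
    {"category": "diy", "description_len": 7, "is_repin": 0, "gender": "male"},
]
vec = DictVectorizer()
X = vec.fit_transform(toy_rows).toarray()
print(vec.get_feature_names())  # renamed to get_feature_names_out() in newer scikit-learn
print(X)  # one row per dict, one column per feature/value pair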
class KMeansEstimator:
    """ This class reads the tweets of users from a file and builds cluster
    centers on that data. It also provides a method for finding the closest
    cluster center for unseen data.
    """
    ADJECTIVE = 'JJ'

    """ Feature keys used in clustering... """
    POLARITY_FEATURE_KEY = 'polarity'
    SUBJECTIVITY_FEATURE_KEY = 'subjectivity'
    TWEET_COUNT_FEATURE_KEY = 'tweetCount'

    """ Features not considered for clustering... """
    USER_ID_FEATURE_KEY = 'userId'
    LONGITUDE_FEATURE_KEY = 'longitude'
    LATITUDE_FEATURE_KEY = 'latitude'

    """ Predicted label feature name. """
    LABEL_FEATURE_KEY = 'label'

    RELEVENT_FEATURE_LIST = [USER_ID_FEATURE_KEY, LATITUDE_FEATURE_KEY,
                             LONGITUDE_FEATURE_KEY, LABEL_FEATURE_KEY]

    def __init__(self, tweet_file_path, no_of_clusters):
        """ The constructor reads the csv file and builds the data matrix. """
        self.np_extractor = ConllExtractor()
        self.pos_tagger = NLTKTagger()
        self.tweet_file_path = tweet_file_path
        self.data_matrix = self.__get_data_matrix_from_file(tweet_file_path)
        self.vectorizer = DictVectorizer(sparse=True)
        self.k_means_estimator = KMeans(init="random", n_clusters=no_of_clusters)

    @time_it
    def __get_data_matrix_from_file(self, tweet_file_path):
        """ Reads tweets from the csv file at "tweet_file_path", extracts
        features from the tweets and returns the list of all feature vectors.
        """
        file_reader = csv.reader(open(tweet_file_path, "rb"), delimiter=',')
        next(file_reader)  # Skip the header row.
        data_matrix = []
        for row in file_reader:
            logging.info("Extracting features for user_id:%s", row[0])
            feature_vector = {}
            feature_vector[self.USER_ID_FEATURE_KEY] = int(row[0])
            feature_vector[self.LATITUDE_FEATURE_KEY] = float(row[2])
            feature_vector[self.LONGITUDE_FEATURE_KEY] = float(row[3])
            feature_vector[self.TWEET_COUNT_FEATURE_KEY] = int(row[4])
            feature_vector.update(
                self.__get_features_from_tweet_text(row[1].decode('utf-8')))
            data_matrix.append(feature_vector)
            logging.info("Successfully extracted features for user_id:%s", row[0])
        return data_matrix

    @time_it
    def __get_features_from_tweet_text(self, tweet_text):
        """This function returns the following features from the tweet text:
        - Adjectives and their corresponding frequencies found in the tweet.
          Each adjective is a separate feature.
        - Subjectivity and polarity as determined by TextBlob.

        :returns: (key, value) map of all features found.
        """
        text_blob = TextBlob(tweet_text, np_extractor=self.np_extractor,
                             pos_tagger=self.pos_tagger)
        adjective_map = dict(Counter(ele[0] for ele in set(text_blob.pos_tags)
                                     if ele[1] == self.ADJECTIVE))
        polarity = text_blob.sentiment[0]
        subjectivity = text_blob.sentiment[1]
        return dict(adjective_map.items() +
                    {self.POLARITY_FEATURE_KEY: polarity,
                     self.SUBJECTIVITY_FEATURE_KEY: subjectivity}.items())

    @time_it
    def __get_clustering_data_matrix(self, data_matrix):
        """ This method removes features that are not relevant for building
        cluster centers (e.g. user_id) from the data matrix and returns a
        copy of the data matrix.
        """
        data_matrix_copy = copy.deepcopy(data_matrix)
        for feature_vector in data_matrix_copy:
            feature_vector.pop(self.USER_ID_FEATURE_KEY)
            feature_vector.pop(self.LATITUDE_FEATURE_KEY)
            feature_vector.pop(self.LONGITUDE_FEATURE_KEY)
        return data_matrix_copy

    @time_it
    def perform_clustering(self, features_to_include=None):
        """ This function performs k-means clustering with "no_of_clusters"
        clusters on the data in the file at "tweet_file_path". It returns a
        list of feature vectors, where each feature vector contains only the
        "features_to_include", or all features if "features_to_include" is
        None.
        """
        clustering_data_matrix = self.__get_clustering_data_matrix(self.data_matrix)
        transformed_data_matrix = self.vectorizer.fit_transform(clustering_data_matrix)
        self.k_means_estimator.fit(transformed_data_matrix, y=None)
        return self.__get_predicted_labels(self.data_matrix, features_to_include)

    @time_it
    def __get_predicted_labels(self, data_matrix, features_to_include):
        """ Finds the nearest cluster for all data points and adds a new
        "label" feature to every feature vector of the data matrix. The data
        matrix is modified in place. It returns a new copy of data_matrix
        with the "features_to_include" features.
        """
        feature_names = self.vectorizer.get_feature_names()
        for feature_vector in data_matrix:
            row = [0] * len(feature_names)
            column = range(len(feature_names))
            data = map(lambda feature_name: feature_vector[feature_name]
                       if feature_name in feature_vector else 0, feature_names)
            feature_csr_matrix = csr_matrix(coo_matrix((data, (row, column))))
            predicted_label = self.k_means_estimator.predict(feature_csr_matrix)
            feature_vector[self.LABEL_FEATURE_KEY] = predicted_label[0]
        expanded_data_matrix = self.__get_expanded_data_matrix(data_matrix)
        if features_to_include:
            return self.__get_filtered_data_matrix(expanded_data_matrix,
                                                   features_to_include)
        else:
            return expanded_data_matrix

    @time_it
    def __get_filtered_data_matrix(self, data_matrix, features_to_include):
        """ Removes all features except features_to_include. """
        filtered_data_matrix = []
        for feature_vector in data_matrix:
            filtered_feature_vector = {}
            for feature_name in features_to_include:
                filtered_feature_vector[feature_name] = feature_vector[feature_name]
            filtered_data_matrix.append(filtered_feature_vector)
        return filtered_data_matrix

    @time_it
    def __get_expanded_data_matrix(self, data_matrix):
        """ Adds new keys for missing features to all feature vectors of
        data_matrix. The data matrix is not modified; a new, modified copy
        is returned.
        """
        feature_names = self.vectorizer.get_feature_names()
        expanded_data_matrix = copy.deepcopy(data_matrix)
        for feature_vector in expanded_data_matrix:
            for feature_name in feature_names:
                if feature_name not in feature_vector:
                    feature_vector[feature_name] = 0
        return expanded_data_matrix

    @time_it
    def predict_labels_for_data(self, file_path, features_to_include=None):
        """ This function reads the tweets of different users from the file
        at file_path and assigns the closest cluster center to each user.
        It returns a list of feature vectors with (user_id, predicted_label,
        latitude, longitude).
        """
        data_matrix = self.__get_data_matrix_from_file(file_path)
        return self.__get_predicted_labels(data_matrix, features_to_include)
id_features[id][id_location[id]] = '1'
id_features[id]['feature_count'] = float(id_feature_count[id])
id_features[id]['event_count'] = id_event_count[id]
id_features[id]['resource_count'] = id_resource_count[id]

train_ids = sorted(id_location_train.keys())
test_ids = sorted(id_location_test.keys())
train_features = [id_features[id] for id in train_ids]
test_features = [id_features[id] for id in test_ids]

labels = {'0': 0, '1': 1, '2': 2}
train_labels = [labels[id_severity_train[id]] for id in train_ids]
# Placeholder labels for the test split (true severities are unknown).
test_fake_labels = [train_labels[0]] * len(test_ids)

# Fit the vectorizer on the training dicts only, then reuse the fitted
# vocabulary to transform the test dicts.
vectorizer = DictVectorizer()
X_train = vectorizer.fit_transform(train_features)
features = vectorizer.get_feature_names()
save_train_features = False
if save_train_features:
    np.savetxt('x_train.txt', X_train.toarray(), delimiter=',',
               header=','.join(features))
X_test = vectorizer.transform(test_features)

# scaler = prep.MinMaxScaler(feature_range=(0, 1), copy=True)
scaler = prep.StandardScaler(copy=True, with_mean=True, with_std=True)
X_train = scaler.fit_transform(X_train.toarray())
X_test = scaler.transform(X_test.toarray())

do_feature_elimination = False
if do_feature_elimination:
    # The source snippet breaks off mid-call; the remaining keyword
    # arguments are unrecoverable, so the call is closed here and they
    # take their defaults.
    estimator = RandomForestClassifier(
        n_estimators=2000, criterion='entropy', max_depth=None,
        min_samples_split=16, min_samples_leaf=1,
        min_weight_fraction_leaf=0.0)
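# The do_feature_elimination branch stops at constructing the forest. A
# hedged sketch of one way it might continue is recursive feature
# elimination via scikit-learn's RFE, which ranks columns by the fitted
# forest's feature_importances_; n_features_to_select and step below are
# assumptions, not values from the source.
from sklearn.feature_selection import RFE

if do_feature_elimination:
    selector = RFE(estimator, n_features_to_select=50, step=0.1)
    X_train = selector.fit_transform(X_train, train_labels)
    X_test = selector.transform(X_test)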