Example #1
	def get_data_social(self, ids) :
		'''
		Read the social features from the database.
		'''
#		data = self.get_social_features(ids)
		
		# First get some aggregated values
		boards_info = self.get_boards_info()
		repinned_info = self.get_repinned_info()
		
		uncateg, categ_entropy =  self.get_users_categories_features()

		query = """SELECT p.id as pin_id, 
							 u.id as user_id, 
							 p.nComments as comments, 
							 p.category as category, 
							 p.description as description, 
							 p.isRepin as is_repin,
							 p.date as date,
							 u.gender as gender, 
							 u.nFollowers as followers, 
							 u.nFollowing as following, 
							 u.nPins as pins,
							 u.nBoards as boards,
							 (u.website != "null") as has_website, 
							 p.board_id as board_id
							 FROM pins p JOIN users u ON p.user_id = u.id"""

		# Make query, get results and represent as map {pin_id: data} for quick access
		c = self.db.cursor()
		c.execute(query)
		rows_map = {row[0]: row[1:] for row in c.fetchall()}
		c.close()

		# Store concepts as a dict per row (pin) 
		data = [] 
		for pin_id in ids:

			(user_id, ncomments, categ, desc, is_repin, date, gender, nfollowers, nfollowing, npins, nboards, has_web, board_id) = rows_map[pin_id]

			f = {}

			# Categorical features are left as strings so that DictVectorizer one-hot encodes them
#			f["ncomments"] = ncomments
			f["category"] = categ
			f["description_len"] = len(desc)
			f["is_repin"] = is_repin
			f["gender"] = gender
#			f["user_followers"] = nfollowers
			f["user_following"] = nfollowing
			f["users_pins"] = npins
			f["users_boards"] = nboards
			f["has_website"] = has_web

			f["is_product"] = (1 if '$' in desc else 0)
			f["day_of_the_week"] = (date.strftime("%a") if (date) else "")

			if nfollowers == 0 : 
				nfollowers = 1

#			f["follow_ratio"] = float(nfollowing)/nfollowers

			board_pins, board_followers = boards_info[board_id]
			f["board_pins"] = board_pins            # Total pins of the board
#			f["board_followers"] = board_followers  # Total followers of the board

			f["category_entropy"] = categ_entropy[user_id]
			f["uncategorized"] = uncateg[user_id]
			f["repinned"] = repinned_info[user_id]

			data.append(f)
			
	# 	data = data[0:4,:]
	
		# Convert categorical features to numerical representation 
		vec = DictVectorizer()
		data = vec.fit_transform(data).toarray()
		return vec.get_feature_names(), data
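A minimal standalone sketch (not from the original project; the feature values are made up) of what the final DictVectorizer step above does: string-valued entries such as "category" and "gender" are one-hot encoded into "key=value" columns, while numeric entries pass through unchanged. Note that in newer scikit-learn releases (1.2+) get_feature_names() has been replaced by get_feature_names_out().

from sklearn.feature_extraction import DictVectorizer

# Two made-up pin dicts shaped like the ones built in the loop above.
sample = [
    {"category": "diy_crafts", "gender": "female", "description_len": 42, "is_repin": 1},
    {"category": "food_drink", "gender": "male", "description_len": 7, "is_repin": 0},
]

vec = DictVectorizer()
X = vec.fit_transform(sample).toarray()

# Columns such as 'category=diy_crafts', 'gender=male', plus the numeric ones.
print(vec.get_feature_names_out())
print(X)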
Example #2
class KMeansEstimator:
    """
    This class reads the tweets of users from a file and builds cluster centers on that data. It also provides
    a method for finding the closest cluster center for unseen data.
    """

    ADJECTIVE = 'JJ'
    """
    Feature keys used in clustering...
    """
    POLARITY_FEATURE_KEY = 'polarity'
    SUBJECTIVITY_FEATURE_KEY = 'subjectivity'
    TWEET_COUNT_FEATURE_KEY = 'tweetCount'
    """
    Features not considered for clustering...
    """
    USER_ID_FEATURE_KEY = 'userId'
    LONGITUDE_FEATURE_KEY = 'longitude'
    LATITUDE_FEATURE_KEY = 'latitude'
    """
    Predicted label feature name.
    """
    LABEL_FEATURE_KEY = 'label'

    RELEVENT_FEATURE_LIST = [
        USER_ID_FEATURE_KEY, LATITUDE_FEATURE_KEY, LONGITUDE_FEATURE_KEY,
        LABEL_FEATURE_KEY
    ]

    def __init__(self, tweet_file_path, no_of_clusters):
        """
        The constructor reads the csv file and builds the data matrix.
        """
        self.np_extractor = ConllExtractor()
        self.pos_tagger = NLTKTagger()
        self.tweet_file_path = tweet_file_path
        self.data_matrix = self.__get_data_matrix_from_file(tweet_file_path)
        self.vectorizer = DictVectorizer(sparse=True)
        self.k_means_estimator = KMeans(init="random",
                                        n_clusters=no_of_clusters)

    @time_it
    def __get_data_matrix_from_file(self, tweet_file_path):
        """
        Reads tweets from the csv file at path "tweet_file_path", extracts features from the tweets and returns a list
        of all feature vectors.
        """
        file_reader = csv.reader(open(tweet_file_path, "rb"), delimiter=',')
        next(file_reader)
        data_matrix = []
        for row in file_reader:
            logging.info("Extracting features for user_id:%s", row[0])
            feature_vector = {}
            feature_vector[self.USER_ID_FEATURE_KEY] = int(row[0])
            feature_vector[self.LATITUDE_FEATURE_KEY] = float(row[2])
            feature_vector[self.LONGITUDE_FEATURE_KEY] = float(row[3])
            feature_vector[self.TWEET_COUNT_FEATURE_KEY] = int(row[4])
            feature_vector.update(
                self.__get_features_from_tweet_text(row[1].decode('utf-8')))
            data_matrix.append(feature_vector)
            logging.info("Successfully extracted features for user_id:%s",
                         row[0])
        return data_matrix

    @time_it
    def __get_features_from_tweet_text(self, tweet_text):
        """This function returns the following features from the tweet text:
        - Adjectives and their corresponding frequencies found in the tweet. Each adjective is a separate feature.
        - Subjectivity and polarity as determined by TextBlob.
        :returns:  (key,value) map of all features found. 
        """
        text_blob = TextBlob(tweet_text,
                             np_extractor=self.np_extractor,
                             pos_tagger=self.pos_tagger)
        adjective_map = dict(
            Counter((ele[0] for ele in set(text_blob.pos_tags)
                     if ele[1] == self.ADJECTIVE)))
        polarity = text_blob.sentiment[0]
        subjectivity = text_blob.sentiment[1]
        return dict(
            adjective_map.items() + {
                self.POLARITY_FEATURE_KEY: polarity,
                self.SUBJECTIVITY_FEATURE_KEY: subjectivity
            }.items())

    @time_it
    def __get_clustering_data_matrix(self, data_matrix):
        """
        This method removes unnecessary features (features like user_id which are not relevant for building cluster centers) from
        the data matrix and returns a copy of the data matrix.
        """
        data_matrix_copy = copy.deepcopy(data_matrix)
        for feature_vector in data_matrix_copy:
            feature_vector.pop(self.USER_ID_FEATURE_KEY)
            feature_vector.pop(self.LATITUDE_FEATURE_KEY)
            feature_vector.pop(self.LONGITUDE_FEATURE_KEY)
        return data_matrix_copy

    @time_it
    def perform_clustering(self, features_to_include=None):
        """
        This function performs k-means clustering with "no_of_clusters" clusters on the data present in the file at
        "tweet_file_path".
        It returns a list of feature vectors, where each feature vector contains only the "features_to_include" features,
        or all features if "features_to_include" is None.
        """
        clustering_data_matrix = self.__get_clustering_data_matrix(
            self.data_matrix)
        transformed_data_matrix = self.vectorizer.fit_transform(
            clustering_data_matrix)

        self.k_means_estimator.fit(transformed_data_matrix, y=None)
        return self.__get_predicted_labels(self.data_matrix,
                                           features_to_include)

    @time_it
    def __get_predicted_labels(self, data_matrix, features_to_include):
        """
        Finds the nearest cluster for all data points and adds a new "label" feature to every feature vector of the data matrix.
        The data matrix is modified in place.
        It returns a new copy of data_matrix restricted to the "features_to_include" features (or all features if None).
        """
        feature_names = self.vectorizer.get_feature_names()
        for feature_vector in data_matrix:
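            # Build a 1 x n_features sparse row in the same column order the
            # vectorizer learned during fit; features missing from this user's
            # dict are treated as 0.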
            row = [0] * len(feature_names)
            column = range(len(feature_names))
            data = map(
                lambda feature_name: feature_vector[feature_name]
                if feature_name in feature_vector else 0, feature_names)
            feature_csr_matrix = csr_matrix(coo_matrix((data, (row, column))))
            predicted_label = self.k_means_estimator.predict(
                feature_csr_matrix)
            feature_vector[self.LABEL_FEATURE_KEY] = predicted_label[0]

        expanded_data_matrix = self.__get_expanded_data_matrix(data_matrix)
        if features_to_include:
            return self.__get_filtered_data_matrix(expanded_data_matrix,
                                                   features_to_include)
        else:
            return expanded_data_matrix

    @time_it
    def __get_filtered_data_matrix(self, data_matrix, features_to_include):
        """
        Removes all features except features_to_include
        """
        filtered_data_matrix = []
        for feature_vector in data_matrix:
            filtered_feature_vector = {}
            for feature_name in features_to_include:
                filtered_feature_vector[feature_name] = feature_vector[
                    feature_name]
            filtered_data_matrix.append(filtered_feature_vector)
        return filtered_data_matrix

    @time_it
    def __get_expanded_data_matrix(self, data_matrix):
        """
        Adds new keys for missing features to all feature vectors of data_matrix. The data matrix is not modified, but a new 
        modified copy is returned.
        """
        feature_names = self.vectorizer.get_feature_names()
        expanded_data_matrix = copy.deepcopy(data_matrix)
        for feature_vector in expanded_data_matrix:
            for feature_name in feature_names:
                if feature_name not in feature_vector:
                    feature_vector[feature_name] = 0
        return expanded_data_matrix

    @time_it
    def predict_labels_for_data(self, file_path, features_to_include=None):
        """
        This function reads the tweets of different users from the file at file_path and assigns the closest
        cluster center to each user.
        It returns a list of feature dicts, one per user, each containing the predicted cluster label; if
        "features_to_include" is given, only those features (e.g. user_id, label, latitude, longitude) are kept.
        """
        data_matrix = self.__get_data_matrix_from_file(file_path)
        return self.__get_predicted_labels(data_matrix, features_to_include)
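A hypothetical usage sketch for the class above. The file names and the csv column layout (user_id, tweet text, latitude, longitude, tweet count) are assumptions read off __get_data_matrix_from_file; the original is Python 2 style code, as the "rb" file mode and .decode('utf-8') suggest.

# Build cluster centers from one file, then assign those centers to unseen users.
estimator = KMeansEstimator("tweets_train.csv", no_of_clusters=5)

# Keep only the id/label/location features for each clustered user.
clustered = estimator.perform_clustering(
    features_to_include=KMeansEstimator.RELEVENT_FEATURE_LIST)

# Map users from a second, held-out file onto the learned clusters.
unseen = estimator.predict_labels_for_data(
    "tweets_test.csv",
    features_to_include=KMeansEstimator.RELEVENT_FEATURE_LIST)

for fv in clustered[:3]:
    print(fv["userId"], fv["label"], fv["latitude"], fv["longitude"])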
Example #3
        id_features[id][id_location[id]] = '1'
    id_features[id]['feature_count'] = float(id_feature_count[id])
    id_features[id]['event_count'] = id_event_count[id]
    id_features[id]['resource_count'] = id_resource_count[id]

train_ids = sorted(id_location_train.keys())
test_ids = sorted(id_location_test.keys())
train_features = [id_features[id] for id in train_ids]
test_features = [id_features[id] for id in test_ids]
labels = {'0': 0, '1': 1, '2': 2}
train_labels = [labels[id_severity_train[id]] for id in train_ids]
test_fake_labels = [train_labels[0]] * len(test_ids)
vectorizer = DictVectorizer()

X_train = vectorizer.fit_transform(train_features)
features = vectorizer.get_feature_names()
save_train_features = False
if save_train_features:
    np.savetxt('x_train.txt',
               X_train.toarray(),
               delimiter=',',
               header=','.join(features))

X_test = vectorizer.transform(test_features)

#scaler = prep.MinMaxScaler(feature_range=(0, 1), copy=True)
scaler = prep.StandardScaler(copy=True, with_mean=True, with_std=True)
X_train = scaler.fit_transform(X_train.toarray())
X_test = scaler.transform(X_test.toarray())

do_feature_elimination = False
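A minimal standalone sketch (made-up dicts, not the original data) of the fit/transform split used above: both the DictVectorizer and the StandardScaler are fitted on the training rows only, and the test rows are mapped into that same feature space, so keys unseen during fit are dropped and missing keys become zeros.

from sklearn.feature_extraction import DictVectorizer
import sklearn.preprocessing as prep

train_features = [
    {"feature_count": 3.0, "event_count": 5, "resource_count": 2, "location_1": 1},
    {"feature_count": 1.0, "event_count": 2, "resource_count": 1, "location_7": 1},
]
test_features = [
    {"feature_count": 2.0, "event_count": 4, "resource_count": 3, "location_9": 1},  # unseen key
]

vectorizer = DictVectorizer()
X_train = vectorizer.fit_transform(train_features)   # learns the column layout
X_test = vectorizer.transform(test_features)          # same columns; 'location_9' is dropped

scaler = prep.StandardScaler(copy=True, with_mean=True, with_std=True)
X_train = scaler.fit_transform(X_train.toarray())     # scaler statistics come from train only
X_test = scaler.transform(X_test.toarray())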
Example #5
    id_features[id]['feature_count'] = float(id_feature_count[id])
    id_features[id]['event_count'] = id_event_count[id]
    id_features[id]['resource_count'] = id_resource_count[id]
    
    
train_ids = sorted(id_location_train.keys())
test_ids = sorted(id_location_test.keys())
train_features = [id_features[id] for id in train_ids]
test_features = [id_features[id] for id in test_ids]
labels = {'0':0, '1':1, '2':2}
train_labels = [labels[id_severity_train[id]] for id in train_ids]
test_fake_labels = [train_labels[0]] * len(test_ids)
vectorizer = DictVectorizer()

X_train = vectorizer.fit_transform(train_features)
features = vectorizer.get_feature_names()
save_train_features = False
if save_train_features:
    np.savetxt('x_train.txt', X_train.toarray(), delimiter=',', header=','.join(features))

X_test = vectorizer.transform(test_features)

#scaler = prep.MinMaxScaler(feature_range=(0, 1), copy=True)
scaler = prep.StandardScaler(copy=True, with_mean=True, with_std=True)
X_train = scaler.fit_transform(X_train.toarray())
X_test = scaler.transform(X_test.toarray())

do_feature_elimination = False
if do_feature_elimination:
    estimator =  RandomForestClassifier(n_estimators=2000, criterion='entropy', max_depth=None, 
                                 min_samples_split=16, min_samples_leaf=1, min_weight_fraction_leaf=0.0,