Example #1
    def __init__(self, settings):
        """
        Initializing class variables.
        """
        # the mention network that will store inferred locations in node_data
        self.mention_network = MultiLocationMethod.dataset.bi_mention_network()
        self.nodes = set(self.mention_network.nodes())

        #self.u_n, self.u_star are the sets of users with unknown and known locations respectively.
        self.u_n = set()
        self.u_star = set()

        #the set of all known venues
        self.venues = set()

        #counter of all locations and their co-occurrences with users.
        self.psi = Counter()

        #alpha and beta are the coefficients for eq.1 as per the paper
        self.alpha = -0.55
        self.beta = 0.0045

        #K is the total number of tweeting relationships
        self.K = 0

        #N_squared is the total number of user pairs
        self.N_squared = 0

        #S is the number of following relationships
        self.S = 0

        #geocoder is a forward/reverse geocoder for location -> lat/long and lat/lon -> location.
        if 'location_source' in settings:
            self.geocoder = Geocoder(dataset=settings['location_source'])
        else:
            self.geocoder = Geocoder()


        #F_r is the random following model Bernoulli distribution parameter
        self.F_r = None

        #T_r is the random tweeting model Bernoulli distribution parameter
        self.T_r = Counter()

        #mu and nu are the model selectors according to a bernoulli distribution
        self.mu = defaultdict(bool)
        self.nu = defaultdict(bool)

        #the multi-location list generated by the MLP
        self.user_multi_locations = defaultdict(list)

        #runs the model, populates all the variables and generates user_multi_locations
        self.run_model()
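The constructor above does all the work: run_model() is invoked at the end of __init__, so simply instantiating the class populates the mention network with inferred locations. A minimal usage sketch, assuming the surrounding framework has already attached a dataset to MultiLocationMethod and that the 'location_source' value below is a valid gazetteer key for Geocoder (the value and variable names are illustrative, not from the original code):

# Hypothetical driver; only MultiLocation itself and its return_network() accessor
# (defined in the full class later on this page) come from the original code.
settings = {'location_source': 'geonames'}   # assumed gazetteer name; omit the key to use the default Geocoder()
mlp = MultiLocation(settings)                # __init__ runs the full multi-location profiling model
network_with_locations = mlp.return_network()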
Example #2
    def train_model(self, settings, dataset, model_dir):

        # Initialize the geocoder, which we'll use to resolve location strings.
        # We use the default name-to-location mapping unless the user has
        # specified otherwise.
        if 'location_source' in settings:
            self.geocoder = Geocoder(dataset=settings['location_source'])
        else:
            self.geocoder = Geocoder()


        # NOTE: The original paper used the directional friends/followers
        # network.  However, the paper was tested on a much smaller network
        # (9.8M edges), which doesn't scale when including the full network.  We
        # opt for using the bi-directional networks as these (1) provide a
        # stronger signal of social relationships and (2) significantly reduce
        # the memory requirement.
        LOGGER.debug('Loading mention network')        
        mention_network = dataset.bi_mention_network()

        # This dict will contain a mapping from user ID to an associated home
        # location, which is derived either from the location field (as in the
        # original paper), from GPS-tagged tweets, or from both
        user_to_home_loc = {}
        
        # For each of the users that we have in the network, see if we can
        # associate that user with a home location.
        all_users = set(mention_network.nodes_iter())
        
        LOGGER.debug('Calculating users with recognizable home location')
        num_users_processed = 0

        # Keep track of how many times each location occurred.  We'll filter
        # this down to only the most common locations
        location_counts = collections.Counter() 

        for user_id, home_loc in dataset.user_home_location_iter():
            
            if not user_id in all_users:
                continue
            
            # home_loc is a (lat,lon) tuple.  While this is accurate, we want to
            # coarsen the location data to decrease sparsity (i.e., more people
            # located in the same city location, despite slightly different
            # underlying lat/lon values).  Here, use the Geocoder to map the
            # lat/lon to a name and then back to a canonical lat/lon for that
            # name           
            canonical_lat_lon = self.geocoder.canonicalize(home_loc[0], home_loc[1])

            location_counts[canonical_lat_lon] += 1

            user_to_home_loc[user_id] = canonical_lat_lon
            num_users_processed += 1
            if num_users_processed % 500000 == 0:
                LOGGER.debug('Processed %s of the %s users, associated %s with a known location (%s)'
                             % (num_users_processed, len(all_users), len(user_to_home_loc),
                                len(user_to_home_loc) / float(num_users_processed)))

        # Iterate through the locations pruning out those that do not occur more
        # than some threshold number of times
        num_locs_removed = 0
        for lat_lon, count in location_counts.iteritems():
            if count >= 20:
                self.unique_locations.add(lat_lon)
            else:
                num_locs_removed += 1
        LOGGER.debug('Saw %d locations, %d with at least 20 users, %d to be pruned'
                     % (len(location_counts), len(self.unique_locations), num_locs_removed))


        # Remove the home locations of users whose locations aren't in the
        # pruned list of minimum-frequency locations
        num_user_home_locs_removed = 0
        for user_id, loc in user_to_home_loc.items():
            if not loc in self.unique_locations:
                del user_to_home_loc[user_id]
                num_user_home_locs_removed += 1
        LOGGER.debug('After pruning removed home locations of %d users, %d still have homes'
                     % (num_user_home_locs_removed, len(user_to_home_loc)))
                

        # Create a bi-directional mapping from locations to unique
        # numeric identifiers.  This mapping will be used when
        # representing locations in the classifier feature space and
        # when converting classifier output to specific locations
        location_to_id = {}
        for loc in self.unique_locations:
            id_ = len(location_to_id)
            location_to_id[loc] = id_
            self.id_to_location[id_] = loc

        # Associate each location with its set of features
        n = len(self.unique_locations)

        # Each location has 7 features associated with it for classifying a
        # user's location.  The seven features per location are arranged next to
        # each other in the feature space.
        feature_offset = 0
        for loc in self.unique_locations:
            # Feat1: its population bin (size approx.)
            self.pop_bin_feature_indices[loc] = feature_offset
            # Feat2: the number of reciprocal friends
            self.reciprocal_feature_indices[loc] = feature_offset + 1
            # Feat3-7: the bins indicating how many friends were in reciprocal
            # triads in that city
            for bin_num in range(0, 5):
                feat = "%s,%s:%s" % (loc[0], loc[1], bin_num)
                self.triad_feature_indices[feat] = feature_offset + bin_num + 2
            # Increment the feature offset so the next city's features don't
            # collide with this city's indices 
            feature_offset += 7
        
        # Set the total number of features seen 
        self.total_num_features = feature_offset
        LOGGER.debug('Saw %d unique locations, %d total features'
                     % (len(self.unique_locations), feature_offset))

        LOGGER.debug('Associated %s of the %s users with a known location (%s unique)'
                     % (len(user_to_home_loc), len(all_users), len(self.unique_locations)))

        # The list of locations for each corresponding user in X
        B = []
        
        # Train the classifier based on users with known home locations
        LOGGER.debug("Generating feature vectors for training")
        X = scipy.sparse.lil_matrix((len(user_to_home_loc), 
                                     self.total_num_features), dtype=numpy.float64)
        row = 0
        total_nz = 0
        for user_id, location in user_to_home_loc.iteritems():

            # Skip users whose locations were omitted due to frequency filtering
            # or who have home locations but are not in the mention network
            #if not location in self.unique_locations or not user_id in all_users:
            #    continue

            # Fill the row in the matrix corresponding to this user's features
            nz = self.fill_user_vector(user_id, mention_network,
                                       user_to_home_loc, X, row)
            total_nz += nz
            
            # Get the index of this user's location
            location_id = location_to_id[location]
            B.append(location_id)
            row += 1
        X = X.tocsr()
        #X = X.toarray()

        LOGGER.debug("Generated training data for %d users, %d nz features, %f on average"
                     % (row, total_nz, float(total_nz) / row))
        

        # Convert the location list into a numpy array for use with scikit
        Y = numpy.asarray(B)

        if len(X.nonzero()[0]) == 0:
            LOGGER.warning("Too little training data seen and no user had non-zero feature "+
                           "values.  Cowardly aborting classification")
        else:
            # Use SVM classifier with a linear kernel.
            #
            # NOTE NOTE NOTE NOTE
            #
            # The original paper uses an RBF kernel with their SVM.  However,
            # this proved impossibly slow during testing, so a linear kernel was
            # used instead.  
            #
            # NOTE NOTE NOTE NOTE
            #
            # slow: self.location_classifier = svm.SVC(kernel='rbf')
            #self.location_classifier = svm.LinearSVC(dual=False)
            #self.location_classifier = svm.NuSVC(kernel='rbf', verbose=True, max_iter=1000)
            #self.location_classifier = naive_bayes.BernoulliNB()
            self.location_classifier = svm.LinearSVC(dual=False, loss='l2', penalty="l2",
                                                     tol=1e-2)

            # Note: we expect the vector representations to be sparse, so avoid mean
            # scaling since it would create dense vectors, which would blow up the
            # memory consumption of the model
            self.location_vector_scaler = preprocessing.StandardScaler(with_mean=False)
            
            # Learn the scaling parameters and then rescale the input            
            LOGGER.debug("Scaling feature vectors for training")
            X_scaled = self.location_vector_scaler.fit_transform(X.astype(numpy.float64))

            LOGGER.debug("Training classifier")
            self.location_classifier.fit(X_scaled, Y)
            LOGGER.debug("Finished training classifier")

            # Assign all the users some location, if we can figure it out
            users_assigned = 0
            users_seen = 0
            for user_id in all_users:
                users_seen += 1
                # If we know where to place this user, assign it to their home location
                if user_id in user_to_home_loc:
                    self.user_id_to_location[user_id] = user_to_home_loc[user_id]
                # Otherwise try to infer the location
                else:
                    location = self.infer_location(user_id, mention_network,
                                                   user_to_home_loc)
                    if not location is None:
                        self.user_id_to_location[user_id] = location
                        users_assigned += 1

                if users_seen % 100000 == 0:
                    LOGGER.debug((("Saw %d/%d users, knew location of %d, " +
                                   "inferred the location of %d (total: %d)")
                                  % (users_seen, len(all_users),
                                     len(self.user_id_to_location) - users_assigned,
                                     users_assigned,
                                     len(self.user_id_to_location))))

        LOGGER.debug((("Ultimately saw %d/%d users, knew location of %d, " +
                       "inferred the location of %d (total: %d)")
                      % (users_seen, len(all_users),
                         len(self.user_id_to_location) - users_assigned,
                         users_assigned,
                         len(self.user_id_to_location))))
                        

        # Short circuit early if the caller has specified that the model is not
        # to be saved into a directory
        if model_dir is None:
            return Wheres_Wally_Model(self.user_id_to_location)

        if not os.path.exists(model_dir):
            os.mkdir(model_dir)         

        # Write the .tsv for human debuggability too
        fh = gzip.open(os.path.join(model_dir, 'user-to-lat-lon.tsv.gz'), 'w')
        for user_id, loc in self.user_id_to_location.iteritems():
            fh.write("%s\t%s\t%s\n" % (user_id, loc[0], loc[1]));
        fh.close()

        return Wheres_Wally_Model(self.user_id_to_location)
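To make the feature layout above concrete: each pruned location owns a block of seven consecutive columns (one population-bin feature, one reciprocal-friend feature and five triad-bin features), with feature_offset advancing by 7 per location. A small self-contained sketch of that indexing, using two made-up canonical lat/lon pairs (the coordinates are illustrative only):

# Illustrative sketch of the 7-columns-per-location layout; the two locations are made up.
unique_locations = [(40.71, -74.0), (34.05, -118.24)]

pop_bin_feature_indices = {}
reciprocal_feature_indices = {}
triad_feature_indices = {}

feature_offset = 0
for loc in unique_locations:
    pop_bin_feature_indices[loc] = feature_offset          # Feat1: population bin
    reciprocal_feature_indices[loc] = feature_offset + 1   # Feat2: reciprocal friends
    for bin_num in range(0, 5):                            # Feat3-7: triad bins
        feat = "%s,%s:%s" % (loc[0], loc[1], bin_num)
        triad_feature_indices[feat] = feature_offset + bin_num + 2
    feature_offset += 7

# The first location occupies columns 0-6 and the second columns 7-13,
# so total_num_features would be 7 * len(unique_locations) = 14 here.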
Example #3
class Wheres_Wally(GIMethod):
    def __init__(self):
        # Location is represented as a lat/lon geopy Point
        self.user_id_to_location = {}
        self.geocoder = None;
        self.unique_locations = set()
        self.id_to_location = {}

        # Mappings from feature names to their corresponding indices in a
        # feature vector
        self.pop_bin_feature_indices = {}
        self.reciprocal_feature_indices = {}
        self.triad_feature_indices = {}
        self.total_num_features = 0
        
        # The SVM classifier and feature vector scaler
        self.location_classifier = None
        self.location_vector_scaler = None



    def train_model(self, settings, dataset, model_dir):

        # Initialize the geocoder, which we'll use to resolve location strings.
        # We use the default name-to-location mapping unless the user has
        # specified otherwise.
        if 'location_source' in settings:
            self.geocoder = Geocoder(dataset=settings['location_source'])
        else:
            self.geocoder = Geocoder()


        # NOTE: The original paper used the directional friends/followers
        # network.  However, the paper was tested on a much smaller network
        # (9.8M edges), which doesn't scale when including the full network.  We
        # opt for using the bi-directional networks as these (1) provide a
        # stronger signal of social relationships and (2) significantly reduce
        # the memory requirement.
        LOGGER.debug('Loading mention network')        
        mention_network = dataset.bi_mention_network()

        # This dict will contain a mapping from user ID to an associated home
        # location, which is derived either from the location field (as in the
        # original paper), from GPS-tagged tweets, or from both
        user_to_home_loc = {}
        
        # For each of the users that we have in the network, see if we can
        # associate that user with a home location.
        all_users = set(mention_network.nodes_iter())
        
        LOGGER.debug('Calculating users with recognizable home location')
        num_users_processed = 0

        # Keep track of how many times each location occurred.  We'll filter
        # this down to only the most common locations
        location_counts = collections.Counter() 

        for user_id, home_loc in dataset.user_home_location_iter():
            
            if not user_id in all_users:
                continue
            
            # home_loc is a (lat,lon) tuple.  While this is accurate, we want to
            # coarsen the location data to decrease sparsity (i.e., more people
            # located in the same city location, despite slightly different
            # underlying lat/lon values).  Here, use the Geocoder to map the
            # lat/lon to a name and then back to a canonical lat/lon for that
            # name           
            canonical_lat_lon = self.geocoder.canonicalize(home_loc[0], home_loc[1])

            location_counts[canonical_lat_lon] += 1

            user_to_home_loc[user_id] = canonical_lat_lon
            num_users_processed += 1
            if num_users_processed % 500000 == 0:
                LOGGER.debug('Processed %s of the %s users, associated %s with a known location (%s)'
                             % (num_users_processed, len(all_users), len(user_to_home_loc),
                                len(user_to_home_loc) / float(num_users_processed)))

        # Iterate through the locations pruning out those that do not occur more
        # than some threshold number of times
        num_locs_removed = 0
        for lat_lon, count in location_counts.iteritems():
            if count >= 20:
                self.unique_locations.add(lat_lon)
            else:
                num_locs_removed += 1
        LOGGER.debug('Saw %d locations, %d with at least 20 users, %d to be pruned'
                     % (len(location_counts), len(self.unique_locations), num_locs_removed))


        # Remove the home locations of users whose locations aren't in the
        # pruned list of minimum-frequency locations
        num_user_home_locs_removed = 0
        for user_id, loc in user_to_home_loc.items():
            if not loc in self.unique_locations:
                del user_to_home_loc[user_id]
                num_user_home_locs_removed += 1
        LOGGER.debug('After pruning removed home locations of %d users, %d still have homes'
                     % (num_user_home_locs_removed, len(user_to_home_loc)))
                

        # Create a bi-directional mapping from locations to unique
        # numeric identifiers.  This mapping will be used when
        # representing locations in the classifier feature space and
        # when converting classifier output to specific locations
        location_to_id = {}
        for loc in self.unique_locations:
            id_ = len(location_to_id)
            location_to_id[loc] = id_
            self.id_to_location[id_] = loc

        # Associate each location with its set of features
        n = len(self.unique_locations)

        # Each location has 7 features associated with it for classifying a
        # user's location.  The seven features per location are arranged next to
        # each other in the feature space.
        feature_offset = 0
        for loc in self.unique_locations:
            # Feat1: its population bin (size approx.)
            self.pop_bin_feature_indices[loc] = feature_offset
            # Feat2: the number of reciprocal friends
            self.reciprocal_feature_indices[loc] = feature_offset + 1
            # Feat3-7: the bins indicating how many friends were in reciprocal
            # triads in that city
            for bin_num in range(0, 5):
                feat = "%s,%s:%s" % (loc[0], loc[1], bin_num)
                self.triad_feature_indices[feat] = feature_offset + bin_num + 2
            # Increment the feature offset so the next city's features don't
            # collide with this city's indices 
            feature_offset += 7
        
        # Set the total number of features seen 
        self.total_num_features = feature_offset
        LOGGER.debug('Saw %d unique locations, %d total features'
                     % (len(self.unique_locations), feature_offset))

        LOGGER.debug('Associated %s of the %s users with a known location (%s unique)'
                     % (len(user_to_home_loc), len(all_users), len(self.unique_locations)))

        # The list of locations for each corresponding user in X
        B = []
        
        # Train the classifier based on users with known home locations
        LOGGER.debug("Generating feature vectors for training")
        X = scipy.sparse.lil_matrix((len(user_to_home_loc), 
                                     self.total_num_features), dtype=numpy.float64)
        row = 0
        total_nz = 0
        for user_id, location in user_to_home_loc.iteritems():

            # Skip users whose locations were omitted due to frequency filtering
            # or who have home locations but are not in the mention network
            #if not location in self.unique_locations or not user_id in all_users:
            #    continue

            # Fill the row in the matrix corresponding to this user's features
            nz = self.fill_user_vector(user_id, mention_network,
                                       user_to_home_loc, X, row)
            total_nz += nz
            
            # Get the index of this user's location
            location_id = location_to_id[location]
            B.append(location_id)
            row += 1
        X = X.tocsr()
        #X = X.toarray()

        LOGGER.debug("Generated training data for %d users, %d nz features, %f on average"
                     % (row, total_nz, float(total_nz) / row))
        

        # Convert the location list into a numpy array for use with scikit
        Y = numpy.asarray(B)

        if len(X.nonzero()[0]) == 0:
            LOGGER.warning("Too little training data seen and no user had non-zero feature "+
                           "values.  Cowardly aborting classification")
        else:
            # Use SVM classifier with a linear kernel.
            #
            # NOTE NOTE NOTE NOTE
            #
            # The original paper uses an RBF kernel with their SVM.  However,
            # this proved impossibly slow during testing, so a linear kernel was
            # used instead.  
            #
            # NOTE NOTE NOTE NOTE
            #
            # slow: self.location_classifier = svm.SVC(kernel='rbf')
            #self.location_classifier = svm.LinearSVC(dual=False)
            #self.location_classifier = svm.NuSVC(kernel='rbf', verbose=True, max_iter=1000)
            #self.location_classifier = naive_bayes.BernoulliNB()
            self.location_classifier = svm.LinearSVC(dual=False, loss='l2', penalty="l2",
                                                     tol=1e-2)

            # Note: we expect the vector representations to be sparse, so avoid mean
            # scaling since it would create dense vectors, which would blow up the
            # memory consumption of the model
            self.location_vector_scaler = preprocessing.StandardScaler(with_mean=False)
            
            # Learn the scaling parameters and then rescale the input            
            LOGGER.debug("Scaling feature vectors for training")
            X_scaled = self.location_vector_scaler.fit_transform(X.astype(numpy.float64))

            LOGGER.debug("Training classifier")
            self.location_classifier.fit(X_scaled, Y)
            LOGGER.debug("Finished training classifier")

            # Assign all the users some location, if we can figure it out
            users_assigned = 0
            users_seen = 0
            for user_id in all_users:
                users_seen += 1
                # If we know where to place this user, assign it to their home location
                if user_id in user_to_home_loc:
                    self.user_id_to_location[user_id] = user_to_home_loc[user_id]
                # Otherwise try to infer the location
                else:
                    location = self.infer_location(user_id, mention_network,
                                                   user_to_home_loc)
                    if not location is None:
                        self.user_id_to_location[user_id] = location
                        users_assigned += 1

                if users_seen % 100000 == 0:
                    LOGGER.debug((("Saw %d/%d users, knew location of %d, " +
                                   "inferred the location of %d (total: %d)")
                                  % (users_seen, len(all_users),
                                     len(self.user_id_to_location) - users_assigned,
                                     users_assigned,
                                     len(self.user_id_to_location))))

        LOGGER.debug((("Ultimately saw %d/%d users, knew location of %d, " +
                       "inferred the location of %d (total: %d)")
                      % (users_seen, len(all_users),
                         len(self.user_id_to_location) - users_assigned,
                         users_assigned,
                         len(self.user_id_to_location))))
                        

        # Short circuit early if the caller has specified that the model is not
        # to be saved into a directory
        if model_dir is None:
            return Wheres_Wally_Model(self.user_id_to_location)

        if not os.path.exists(model_dir):
            os.mkdir(model_dir)         

        # Write the .tsv for human debuggability too
        fh = gzip.open(os.path.join(model_dir, 'user-to-lat-lon.tsv.gz'), 'w')
        for user_id, loc in self.user_id_to_location.iteritems():
            fh.write("%s\t%s\t%s\n" % (user_id, loc[0], loc[1]));
        fh.close()

        return Wheres_Wally_Model(self.user_id_to_location)
               

    def infer_location(self, user_id, mention_network, user_to_home_loc):
        """
        Infers and returns the location of the provided user based on their
        features in the network
        """
        
        # Ensure that the model has been trained; otherwise, report an
        # empty classification
        if self.location_vector_scaler is None or self.location_classifier is None:
            return None

        # Convert the user's network-based features into a numeric vector
        X = scipy.sparse.lil_matrix((1, self.total_num_features), dtype=numpy.float64)
        self.fill_user_vector(user_id, mention_network, user_to_home_loc, X, 0)
        X = X.tocsr()                

        # Rescale the vector according to the training data's scaling
        user_vector_scaled = self.location_vector_scaler.transform(X)
        
        # Classify the results
        location_id = self.location_classifier.predict(user_vector_scaled)[0]
                
        # Convert the index into a location
        return self.id_to_location[location_id]
        

    def fill_user_vector(self, user_id, mention_network, user_to_home_loc,
                         csr_matrix, row_to_fill):
        """         
        Creates a vector for the user and fills their data into the
        specified row in the provided matrix
        """
        feat_dict = self.create_user_vector(user_id, mention_network, 
                                            user_to_home_loc)
        nz = 0
        for col, val in feat_dict.iteritems():
            csr_matrix[row_to_fill, col] = val
            nz += 1
        return nz


    def create_user_vector(self, user_id, mention_network, user_to_home_loc):
        """
        Creates a vector to use with SciPy that represents this user's features
        """

        # The binned location features look at all the locations of this user's
        # neighbors and then provide a weight for each location according to how
        # many of the user's friends are in that location multiplied by how
        # large the city is, which is represented as one of five bins

        location_to_friends = defaultdict(list)
        location_to_followers = defaultdict(list)
        num_friends = mention_network.degree(user_id)

        # Record which friends appear in each city
        for neighbor_id in mention_network.neighbors_iter(user_id):
            if neighbor_id in user_to_home_loc:
                location_name = user_to_home_loc[neighbor_id]
                location_to_friends[location_name].append(neighbor_id)
                location_to_followers[location_name].append(neighbor_id)


        # Since the vector is expected to be very sparse, create it as a dict
        # for the indices with non-zero feature values.
        classifier_input_vector = {}
        num_non_zero_features = 0

        # Each city/location generates 7 unique features in the best performing
        # system
        for city, followers_in_city in location_to_followers.iteritems():
            n = len(followers_in_city)

            # Feature 1: the population bin for the number of this user's
            # neighbors in that city
            city_bin = self.get_city_bin(n)
            pop_bin_feature_index = self.pop_bin_feature_indices[city]
            classifier_input_vector[pop_bin_feature_index] = city_bin

        for city, friends_in_city in location_to_friends.iteritems():
            n = len(friends_in_city)

            # Feature 2: the percentage of friends with reciprocal edges at that
            # location
            num_reciprocal_friends = 0
            for n1 in friends_in_city:
                if mention_network.has_edge(n1, user_id):
                    num_reciprocal_friends += 1
                    num_non_zero_features += 1
            reciprocal_feature_index = self.reciprocal_feature_indices[city]
            classifier_input_vector[reciprocal_feature_index] = float(num_reciprocal_friends) / n
            if num_reciprocal_friends > 0:
                num_non_zero_features += 1
                    
            # Features 3-7: the number of triads in the city
            triad_counter = collections.Counter()
            for n1 in friends_in_city:
                num_triads = 0
                for n2 in friends_in_city:
                    if mention_network.has_edge(n1, n2):
                        num_triads += 1

                # Decide which bin this user is in
                triad_counter[self.get_triad_bin(num_triads)] += 1

            for bin_num, count in triad_counter.iteritems():
                feat = "%s,%s:%s" % (city[0], city[1], bin_num)
                triad_bin_feature_index = self.triad_feature_indices[feat]
                classifier_input_vector[triad_bin_feature_index] = float(count) / num_friends
                if count > 0:
                    num_non_zero_features += 1

        return classifier_input_vector
                

    def get_triad_bin(self, num_triads):
        """
        Returns which bin this count of the number of triads should be in
        """
        # Bins in the paper [0,5,10,20,40]
        if num_triads < 5:
            return 0
        elif num_triads < 10:
            return 1
        elif num_triads < 20:
            return 2
        elif num_triads < 40:
            return 3
        else:
            return 4

    def get_city_bin(self, city_size):
        """
        Returns which bin this city size (the number of users in the city) should be in
        """
        # Bins in the paper [1,2,4,12,57054] 
        if city_size <= 1:
            return 0
        elif city_size <= 2:
            return 1
        elif city_size <= 4:
            return 2
        elif city_size <= 12:
            return 3
        # This should be 57054, but we use any value larger than 12 to
        # avoid the edge case where a city has more than 57k users
        else: 
            return 4

    def load_model(self, model_dir, settings):
        """
        Reads in the Where's Wally model from a gzipped .tsv
        """      

        user_id_to_location = {}
        model_file = gzip.open(os.path.join(model_dir, "user-to-lat-lon.tsv.gz"), 'r')
        for line in model_file:
            cols = line.split("\t")
            user_id = cols[0]
            lat = float(cols[1])
            lon = float(cols[2])
            user_id_to_location[user_id] = (lat, lon)

        model_file.close()
        return Wheres_Wally_Model(user_id_to_location)
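Taken together, train_model and load_model form a simple round trip: training writes user-to-lat-lon.tsv.gz into model_dir, and load_model rebuilds a Wheres_Wally_Model from that file. A hedged driver sketch, assuming a framework dataset object that exposes bi_mention_network() and user_home_location_iter() as used above (the dataset variable, settings value and path are assumptions, not part of the original code):

# Hypothetical driver; only the Wheres_Wally calls themselves come from the class above.
settings = {'location_source': 'geonames'}   # assumed gazetteer name; use {} for the default Geocoder
model_dir = '/tmp/wheres_wally_model'        # assumed output directory

method = Wheres_Wally()
model = method.train_model(settings, dataset, model_dir)

# Later, the persisted user -> (lat, lon) table can be reloaded without retraining
reloaded_model = Wheres_Wally().load_model(model_dir, settings)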
Example #4
class MultiLocation(object):
    """
    MultiLocation implements the method from "Multiple Location Profiling for Users and
    Relationships from Social Network and Content" by Rui Li, Shengjie Wang and Kevin Chen-Chuan Chang.
    """

    def __init__(self, settings):
        """
        Initializing class variables.
        """
        # the mention network that will store inferred locations in node_data
        self.mention_network = MultiLocationMethod.dataset.bi_mention_network()
        self.nodes = set(self.mention_network.nodes())

        #self.u_n, self.u_star are the sets of users with unknown and known locations respectively.
        self.u_n = set()
        self.u_star = set()

        #the set of all known venues
        self.venues = set()

        #counter of all locations and their co-occurrences with users.
        self.psi = Counter()

        #alpha and beta are the coefficients for eq.1 as per the paper
        self.alpha = -0.55
        self.beta = 0.0045

        #K is the total number of tweeting relationships
        self.K = 0

        #N_squared is the total number of user pairs
        self.N_squared = 0

        #S is the number of following relationships
        self.S = 0

        #geocoder is a forward/reverse geocoder for location -> lat/long and lat/lon -> location.
        if 'location_source' in settings:
            self.geocoder = Geocoder(dataset=settings['location_source'])
        else:
            self.geocoder = Geocoder()


        #F_r is the random following model Bernoulli distribution parameter
        self.F_r = None

        #T_r is the random tweeting model Bernoulli distribution parameter
        self.T_r = Counter()

        #mu and nu are the model selectors according to a bernoulli distribution
        self.mu = defaultdict(bool)
        self.nu = defaultdict(bool)

        #the multi-location list generated by the MLP
        self.user_multi_locations = defaultdict(list)

        #runs the model, populates all the variables and generates user_multi_locations
        self.run_model()

    def store_location_data(self):
        """
        Sets the node_data field with the relevant gold-standard location data from
        the bidirectional dataset.
        """
        num_users_seen = 0
        for user_id, loc in MultiLocationMethod.dataset.user_home_location_iter():
            if loc[0] == 0 and loc[1] == 0:
                continue
            try:
                self.mention_network.set_node_data(user_id, loc)
                self.u_star.add(user_id)
                num_users_seen += 1
                if num_users_seen % 100000 == 0:
                    logger.debug('Multilocation saw %d users' % num_users_seen)
            except KeyError:
                pass

    def find_locations(self):
        users_seen = 1
        for possible_posts in MultiLocationMethod.dataset.user_iter():
            users_seen += 1
            if users_seen % 1000000 == 0:
                logger.debug("Seen %d users" %users_seen)

            user_id = possible_posts['user_id']
            posts = possible_posts['posts']
            if len(posts) > 600: posts = posts[-600:]
            for post in posts:

                #twokenizer may be too computationally expensive here...
                #text = tokenizer(post['text'])
                text = post['text'].split()
                lc_text = []
                is_upper = []
                for s in text:
                    isup = s[0].isupper()
                    is_upper.append(isup)
                    if isup:
                        lc_text.append(s.lower())
                    else:
                        lc_text.append(s)

                i = 0
                n = len(text)
                while True:
                    if i >= n:
                        break

                    if not is_upper[i]:
                        i += 1
                        continue

                    is_up1 = i + 1 < n and is_upper[i+1]
                    first_two_with_space = None
                    first_two_with_tab = None

                    if i + 2 < n and is_upper[i+2] and is_up1:
                        w1 = lc_text[i]
                        w2 = lc_text[i+1]
                        w3 = lc_text[i+2]

                        first_two_with_space = w1 + " " + w2
                        s2 = first_two_with_space + " " + w3
                        location = self.geocoder.geocode(s2)
                        if not location is None:
                            self.record_user_location(s2, location, user_id)
                            i += 3
                            continue

                        s3 = first_two_with_space + "\t" + w3
                        location = self.geocoder.geocode(s3)
                        if not location is None:
                            self.record_user_location(s3, location, user_id)
                            i += 3
                            continue

                        first_two_with_tab = w1 + "\t" + w2
                        s4 = first_two_with_tab + "\t"  + w3
                        location = self.geocoder.geocode(s4)
                        if not location is None:
                            self.record_user_location(s4, location, user_id)
                            i += 3
                            continue

                        s5 = first_two_with_tab + " " + w3
                        location = self.geocoder.geocode(s5)
                        if not location is None:
                            self.record_user_location(s5, location, user_id)
                            i += 3
                            continue

                    elif i + 1 < n and is_up1:
                        w1 = lc_text[i]
                        w2 = lc_text[i+1]

                        if first_two_with_tab is None:
                            first_two_with_tab = w1 + "\t" + w2

                        location = self.geocoder.geocode(first_two_with_tab)
                        if not location is None:
                            self.record_user_location(first_two_with_tab, location, user_id)
                            i += 2
                            continue

                        if first_two_with_space is None:
                            first_two_with_space = w1 + " " + w2
                        location = self.geocoder.geocode(first_two_with_space)
                        if not location is None:
                            self.record_user_location(first_two_with_space, location, user_id)
                            i += 2
                            continue

                    else:
                        w1 = lc_text[i]
                        location = self.geocoder.geocode(w1)
                        if not location is None:
                            self.record_user_location(w1, location, user_id)

                    i += 1

    def record_user_location(self, location_name, location, user_id):
        try:
            self.mention_network.add_edge(user_id,location_name)
            self.mention_network.set_node_data(location_name,location)
        except:
            return
        self.venues.add(location_name)
        self.psi[location] += 1
        self.T_r[user_id] += 1
        self.K += 1
        return


    def compute_coefficients(self):
        """
        Computes the coefficients for equation (1) from the paper,
        P(f<i,j>|alpha,beta,x_i,y_i) = beta*distance(x_i,y_i)^alpha
        """

        def func_to_fit(x, a, b):
                return b * x ** a

        mentions_per_distance = Counter()
        following_relationship = Counter()

        # our networks are too large to generate these coefficients on each call...
        # this is about the same number of combinations as shown in the paper...
        n = 10000000
        #random_sample = random.sample(list(self.u_star),n)
        random_sample = list(self.u_star)
        number_of_users = len(self.u_star)

        # processed_combinations = 0
        # start_time = time.time()
        #for node_u, node_v in combinations(random_sample,2):
        for i in range(0,n):
            node_u, node_v = (random_sample[random.randint(0,number_of_users-1)],random_sample[random.randint(0,number_of_users-1)])
            if node_u == node_v: continue
            # if processed_combinations % 1000000 == 0:
            #     logger.debug("Took %f to process %d combinations..." % ((time.time() - start_time), processed_combinations))
            # processed_combinations += 1
            l_u = self.mention_network.node_data(node_u)
            l_v = self.mention_network.node_data(node_v)
            distance = round(haversine(l_u,l_v,miles=True),0)
            if distance > 10000:
                continue
            mentions_per_distance[distance] += 1.0
            self.N_squared += 1.0
            if self.mention_network.has_edge(node_u,node_v):
                following_relationship[distance] += 1.0
                self.S += 1.0

        # Build x and y over the same sorted distance buckets so that each
        # ratio stays aligned with its distance
        x = sorted(mentions_per_distance)
        y = []
        for key in x:
            # "ratio of the number of pairs that have following relationship to the total number of pairs in the d_th bucket"
            mentions = mentions_per_distance[key]
            following = following_relationship[key]
            y.append(following / mentions)
        # avoid a zero distance, which the power law in func_to_fit cannot handle
        x[0] += 1e-8

        solutions = curve_fit(func_to_fit, x, y,p0=[-0.55,0.0045], maxfev=100000)[0]

        self.alpha = solutions[0]
        self.beta = solutions[1]
        return


    def generate_model_selector(self):
        for user in self.u_n:
            #generate the model selector mu according to a Bernoulli distribution
            if np.random.binomial(1, self.F_r) == 1:
                self.mu[user] = True
            else:
                self.mu[user] = False

            #normalizing by K on the fly; force float division since T_r and K are integer counts
            if np.random.binomial(1, (float(self.T_r[user]) / self.K)) == 1:
                self.nu[user] = True
            else:
                self.nu[user] = False


    def random_following_model(self, user):
        """
        If mu = 1, we choose the random following model using p(f<i,j> == 1 | F_r)
        to decide if the location of a neighbor of the user is a possible location.
        """
        for neighbor in self.mention_network.neighbors_iter(user):
            if neighbor not in self.u_star:
                continue
            elif np.random.binomial(1, self.F_r):
                self.user_multi_locations[user].append(self.mention_network.node_data(neighbor))
        return


    def following_model(self, user):
        """
        If mu = 0, we decide whether there is f<i,j> based on the location-based following model as shown
        in eq. 1
        """
        #(note: this is almost the same as the Backstrom paper, thus I'll ignore generating
        #the theta values and just calculate max probability)
        def calculate_probability(l_u, l_v):
            """
            Calculates the probability, P(f<i,j>|alpha,beta,location_1,location_2)
            """
            try:
                return self.beta * (abs(haversine(l_u, l_v))) ** (self.alpha)
            except:
                #this needs to be changed to a very small value....
                return self.beta * (0.00000001) ** self.alpha

        best_log_probability = float('-inf')
        best_location = None
        for neighbor_u in self.mention_network.neighbors_iter(user):
            log_probability = 0
            if neighbor_u not in self.u_star:
                continue
            for neighbor_v in self.mention_network.neighbors_iter(neighbor_u):
                if neighbor_v not in self.u_star:
                    continue
                else:
                    l_u = self.mention_network.node_data(neighbor_u)
                    l_v = self.mention_network.node_data(neighbor_v)
                    plu_lv = calculate_probability(l_u, l_v)
                    try:
                        log_gamma_lu = math.log((plu_lv / (1 - plu_lv)))
                    except ValueError:
                        #in the case where l_u == l_v, then plu_lv --> 0 and log(1) = 0,
                        #thus this exception should be valid.
                        log_gamma_lu = 0
                    log_probability += log_gamma_lu
            if log_probability > best_log_probability:
                best_log_probability = log_probability
                best_location = self.mention_network.node_data(neighbor_u)
        if best_location:
            self.user_multi_locations[user].append(best_location)
        return


    def random_tweeting_model(self, user):
        for venue in self.mention_network.neighbors_iter(user):
            if venue not in self.venues:
                continue
            #T_r[user] is a raw count, so normalize by K to keep the Bernoulli parameter in [0, 1]
            elif np.random.binomial(1, float(self.T_r[user]) / self.K):
                self.user_multi_locations[user].append(self.mention_network.node_data(venue))
        return


    def tweeting_model(self, user):
        best_probability = float("-inf")
        best_venue = None

        for venue in self.mention_network.neighbors_iter(user):
            if venue not in self.venues:
                continue
            probability = self.psi[venue]
            if best_probability < probability:
                best_probability = probability
                best_venue = venue

        if best_venue:
            self.user_multi_locations[user].append(self.mention_network.node_data(best_venue))

        return


    def run_model(self):
        """
        run_model generates the values for all the initialized class variables, and
        follows the MLP algorithm described in the paper to infer locations for
        users.
        """

        #NOTE: K is not normalized to save computations, and is normalized on the fly in "generate_model_selector"
        #self.populate_mention_network()

        logger.debug("Variables have been initialized. Starting the model.")
        logger.debug("Storing location data...")
        self.store_location_data()
        self.u_n = self.nodes.difference(self.u_star)
        logger.debug("Location data stored!")


        logger.debug("Starting to compute the coefficients for the model...")
        #calculates the coefficients to be used in eq.1, alpha and beta
        self.compute_coefficients()
        logger.debug("Coefficients have been calculated. Alpha: %f and beta: %f." %(self.alpha, self.beta))

        logger.debug("Finding venue data..")
        self.find_locations()

        #psi holds integer counts, so force float division when normalizing by K
        for venue in self.psi:
            self.psi[venue] /= float(self.K)

        logger.debug("Finished finding venue data! %d venues found!" % len(self.venues))

        #self.N_squared = len(self.mention_network.edges_())
        #p(f<i,j> = 1 | F_r) = S / N^2
        self.F_r = (self.S / self.N_squared)

        #Section 4.4, generate model selector based on bernoulli distributions using T_r and F_r
        logger.debug("Generating model selectors...")
        self.generate_model_selector()
        logger.debug("Model selectors have been generated!")

        logger.debug("Starting to find user locations...")

        for user in self.u_n:
            if self.mu[user]:
                self.random_following_model(user)
            else:
                self.following_model(user)

            if self.nu[user]:
                self.random_tweeting_model(user)
            else:
                self.tweeting_model(user)

        logger.debug("Finished finding user locations...")

        for user in self.user_multi_locations:
            location_list = self.user_multi_locations[user]
            location = self.get_geometric_mean(location_list)
            self.mention_network.set_node_data(user,location)


    def return_network(self):
        return self.mention_network

    
    def get_geometric_mean(self, locations):
        """
        Finds the geometric median of a list of locations (adapted from David Jurgens's
        implementation); with fewer than two locations the single location is returned directly.
        """

        n = len(locations)

        # With fewer than two points there is nothing to minimize, so just
        # return the single point we have
        if n < 2:
            return locations[np.random.randint(0, n)]

        min_distance_sum = 10000000
        median = None  # Point type

        # Loop through all the points, finding the point that minimizes the
        # geodetic distance to all other points.  By construction median will
        # always be assigned to some non-None value by the end of the loop.
        for i in range(0, n):
            p1 = locations[i]
            dist_sum = 0
            for j in range(0, n):
                p2 = locations[j]
                # Skip self-comparison
                if i == j:
                    continue
                dist = haversine(p1, p2)
                dist_sum += dist

                # Short-circuit early if it's clear that this point cannot be
                # the median since it does not minimize the distance sum
                if dist_sum > min_distance_sum:
                    break

            if dist_sum < min_distance_sum:
                min_distance_sum = dist_sum
                median = p1

        return median
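The compute_coefficients step above reduces to fitting the power law of eq. 1, P(f<i,j> = 1 | d) = beta * d^alpha, to the observed following-to-pair ratios per distance bucket. A standalone sketch of that fit on synthetic data (the distances and ratios below are made up; only the curve_fit call and the p0 starting values mirror the method above):

# Self-contained sketch of the eq. 1 fit; the sample points are synthetic.
import numpy as np
from scipy.optimize import curve_fit

def func_to_fit(x, a, b):
    return b * x ** a

distances = np.array([1.0, 5.0, 10.0, 50.0, 100.0, 500.0, 1000.0])
ratios = 0.0045 * distances ** -0.55   # fake "following / total pairs" ratios

(alpha, beta), _ = curve_fit(func_to_fit, distances, ratios,
                             p0=[-0.55, 0.0045], maxfev=100000)
# alpha is recovered as roughly -0.55 and beta as roughly 0.0045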
Example #5
    def train_model(self, settings, dataset, model_dir):

        # Initialize the geocoder, which we'll use to resolve location strings.
        # We use the default name-to-location mapping unless the user has
        # specified otherwise.
        if "location_source" in settings:
            self.geocoder = Geocoder(dataset=settings["location_source"])
        else:
            self.geocoder = Geocoder()

        # NOTE: The original paper used the directional friends/followers
        # network.  However, the paper was tested on a much smaller network
        # (9.8M edges), which doesn't scale when including the full network.  We
        # opt for using the bi-directional networks as these (1) provide a
        # stronger signal of social relationships and (2) significantly reduce
        # the memory requirement.
        LOGGER.debug("Loading mention network")
        mention_network = dataset.bi_mention_network()

        # This dict will contain a mapping from user ID to an associated home
        # location, which is derived either from the location field (as in the
        # original paper), from GPS-tagged tweets, or from both
        user_to_home_loc = {}

        # For each of the users that we have in the network, see if we can
        # associate that user with a home location.
        all_users = set(mention_network.nodes_iter())

        LOGGER.debug("Calculating users with recognizable home location")
        num_users_processed = 0

        # Keep track of how many times each location occurred.  We'll filter
        # this down to only the most common locations
        location_counts = collections.Counter()

        for user_id, home_loc in dataset.user_home_location_iter():

            if not user_id in all_users:
                continue

            # home_loc is a (lat,lon) tuple.  While this is accurate, we want to
            # coarsen the location data to decrease sparsity (i.e., more people
            # located in the same city location, despite slightly different
            # underlying lat/lon values).  Here, use the Geocoder to map the
            # lat/lon to a name and then back to a canonical lat/lon for that
            # name
            canonical_lat_lon = self.geocoder.canonicalize(home_loc[0], home_loc[1])

            location_counts[canonical_lat_lon] += 1

            user_to_home_loc[user_id] = canonical_lat_lon
            num_users_processed += 1
            if num_users_processed % 500000 == 0:
                LOGGER.debug(
                    "Processed %s of the %s users, associated %s a known location (%s)"
                    % (
                        num_users_processed,
                        len(all_users),
                        len(user_to_home_loc),
                        len(user_to_home_loc) / float(num_users_processed),
                    )
                )

        # Iterate through the locations pruning out those that do not occur more
        # than some threshold number of times
        num_locs_removed = 0
        for lat_lon, count in location_counts.iteritems():
            if count >= 20:
                self.unique_locations.add(lat_lon)
            else:
                num_locs_removed += 1
        LOGGER.debug(
            "Saw %d locations, %d with at least 5 users, %d to be pruned"
            % (len(location_counts), len(self.unique_locations), num_locs_removed)
        )

        # Remove the home locations of users whose locations aren't in the
        # pruned list of minimum-frequency locations
        num_user_home_locs_removed = 0
        for user_id, loc in user_to_home_loc.items():
            if not loc in self.unique_locations:
                del user_to_home_loc[user_id]
                num_user_home_locs_removed += 1
        LOGGER.debug(
            "After pruning removed home locations of %d users, %d still have homes"
            % (num_user_home_locs_removed, len(user_to_home_loc))
        )

        # Create a bi-directional mapping from locations to unique
        # numeric identifiers.  This mapping will be used when
        # representing locations in the classifier feature space and
        # when converting classifier output to specific locations
        location_to_id = {}
        for loc in self.unique_locations:
            id_ = len(location_to_id)
            location_to_id[loc] = id_
            self.id_to_location[id_] = loc

        # Associate each location with its set of features
        n = len(self.unique_locations)

        # Each location has 7 features associated with it for classifying a
        # user's location.  The seven features per location are arranged next to
        # each other in the feature space.
        feature_offset = 0
        for loc in self.unique_locations:
            # Feat1: its population bin (size approx.)
            self.pop_bin_feature_indices[loc] = feature_offset
            # Feat2: the number of reciprocal friends
            self.reciprocal_feature_indices[loc] = feature_offset + 1
            # Feat3-7: the bins indicating how many friends were in reciprocal
            # triads in that city
            for bin_num in range(0, 5):
                feat = "%s,%s:%s" % (loc[0], loc[1], bin_num)
                self.triad_feature_indices[feat] = feature_offset + bin_num + 2
            # Increment the feature offset so the next city's features don't
            # collide with this city's indices
            feature_offset += 7

        # Set the total number of features seen
        self.total_num_features = feature_offset
        LOGGER.debug("Saw %d unique locations, %d total featurs" % (len(self.unique_locations), feature_offset))

        LOGGER.debug(
            "Associated %s of the %s users with a known location (%s unique)"
            % (len(user_to_home_loc), len(all_users), len(self.unique_locations))
        )

        # The list of locations for each corresponding user in X
        B = []

        # Train the classifier based on users with known home locations
        LOGGER.debug("Generating feature vectors for training")
        X = scipy.sparse.lil_matrix((len(user_to_home_loc), self.total_num_features), dtype=numpy.float64)
        row = 0
        total_nz = 0
        for user_id, location in user_to_home_loc.iteritems():

            # Skip users whose locations were omitted due to frequency filtering
            # or who have home locations but are not in the mention network
            # if not location in self.unique_locations or not user_id in all_users:
            #    continue

            # Fill the row in the matrix corresponding to this user's features
            nz = self.fill_user_vector(user_id, mention_network, user_to_home_loc, X, row)
            total_nz += nz

            # Get the index of this user's location
            location_id = location_to_id[location]
            B.append(location_id)
            row += 1
        X = X.tocsr()
        # X = X.toarray()

        LOGGER.debug(
            "Generated training data for %d users, %d nz features, %f on average"
            % (row, total_nz, float(total_nz) / row)
        )

        # Convert the location list into a numpy array for use with scikit
        Y = numpy.asarray(B)

        # Initialize the assignment counters up front so the summary logging at
        # the end works even if training is skipped for lack of data
        users_assigned = 0
        users_seen = 0
        if len(X.nonzero()[0]) == 0:
            LOGGER.warning(
                "Too little training data seen and no user had non-zero feature "
                + "values.  Cowardly aborting classification"
            )
        else:
            # Use SVM classifier with a linear kernel.
            #
            # NOTE NOTE NOTE NOTE
            #
            # The original paper uses an RBF kernel with their SVM.  However,
            # this proved impossibly slow during testing, so a linear kernel was
            # used instead.
            #
            # NOTE NOTE NOTE NOTE
            #
            # slow: self.location_classifier = svm.SVC(kernel='rbf')
            # self.location_classifier = svm.LinearSVC(dual=False)
            # self.location_classifier = svm.NuSVC(kernel='rbf', verbose=True, max_iter=1000)
            # self.location_classifier = naive_bayes.BernoulliNB()
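            # Note: loss="l2" is the older scikit-learn spelling of the squared
            # hinge loss; recent releases expect loss="squared_hinge" instead.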
            self.location_classifier = svm.LinearSVC(dual=False, loss="l2", penalty="l2", tol=1e-2)

            # Note: we expect the vector representations to be sparse, so avoid mean
            # scaling since it would create dense vectors, which would blow up the
            # memory consumption of the model
            self.location_vector_scaler = preprocessing.StandardScaler(with_mean=False)

            # Learn the scaling parameters and then rescale the input
            LOGGER.debug("Scaling feature vectors for training")
            X_scaled = self.location_vector_scaler.fit_transform(X.astype(numpy.float64))

            LOGGER.debug("Training classifier")
            self.location_classifier.fit(X_scaled, Y)
            LOGGER.debug("Finished training classifier")

            # Assign all the users some location, if we can figure it out
            users_assigned = 0
            users_seen = 0
            for user_id in all_users:
                users_seen += 1
                # If we know where to place this user, assign it to their home location
                if user_id in user_to_home_loc:
                    self.user_id_to_location[user_id] = user_to_home_loc[user_id]
                # Otherwise try to infer the location
                else:
                    location = self.infer_location(user_id, mention_network, user_to_home_loc)
                    if not location is None:
                        self.user_id_to_location[user_id] = location
                        users_assigned += 1

                if users_seen % 100000 == 0:
                    LOGGER.debug(
                        (
                            ("Saw %d/%d users, knew location of %d, " + "inferred the location of %d (total: %d)")
                            % (
                                users_seen,
                                len(all_users),
                                len(self.user_id_to_location) - users_assigned,
                                users_assigned,
                                len(self.user_id_to_location),
                            )
                        )
                    )

        LOGGER.debug(
            (
                ("Ultimately saw %d/%d users, knew location of %d, " + "inferred the location of %d (total: %d)")
                % (
                    users_seen,
                    len(all_users),
                    len(self.user_id_to_location) - users_assigned,
                    users_assigned,
                    len(self.user_id_to_location),
                )
            )
        )

        # Short circuit early if the caller has specified that the model is not
        # to be saved into a directory
        if model_dir is None:
            return Wheres_Wally_Model(self.user_id_to_location)

        if not os.path.exists(model_dir):
            os.mkdir(model_dir)

        # Write the .tsv for human debugability too
        fh = gzip.open(os.path.join(model_dir, "user-to-lat-lon.tsv.gz"), "w")
        for user_id, loc in self.user_id_to_location.iteritems():
            fh.write("%s\t%s\t%s\n" % (user_id, loc[0], loc[1]))
        fh.close()

        return Wheres_Wally_Model(self.user_id_to_location)
Example #7
0
class Wheres_Wally(GIMethod):
    def __init__(self):
        # Location is represented as a lat/lon geopy Point
        self.user_id_to_location = {}
        self.geocoder = None
        self.unique_locations = set()
        self.id_to_location = {}

        # Mappings from feature names to their corresponding indices in a
        # feature vector
        self.pop_bin_feature_indices = {}
        self.reciprocal_feature_indices = {}
        self.triad_feature_indices = {}
        self.total_num_features = 0

        # The SVM classifier and feature vector scaler
        self.location_classifier = None
        self.location_vector_scaler = None

    def train_model(self, settings, dataset, model_dir):

        # Initialize the geocoder, which we'll use to resolve location strings.
        # We use the default name-to-location mapping unless the user has
        # specified otherwise.
        if "location_source" in settings:
            self.geocoder = Geocoder(dataset=settings["location_source"])
        else:
            self.geocoder = Geocoder()

        # NOTE: The original paper used the directional friends/followers
        # network.  However, the paper was tested on a much smaller network
        # (9.8M edges), which doesn't scale when including the full network.  We
        # opt for using the bi-directional networks as these (1) provide a
        # stronger signal of social relationships and (2) significantly reduce
        # the memory requirement.
        LOGGER.debug("Loading mention network")
        mention_network = dataset.bi_mention_network()

        # This dict will contain a mapping from user ID to an associated home
        # location, which is derived either from the location field (as in the
        # original paper), from GPS-tagged tweets, or from both
        user_to_home_loc = {}

        # For each of the users that we have in the network, see if we can
        # associate that user with a home location.
        all_users = set(mention_network.nodes_iter())

        LOGGER.debug("Calculating users with recognizable home location")
        num_users_processed = 0

        # Keep track of how many times each location occurred.  We'll filter
        # this down to only the most common locations
        location_counts = collections.Counter()

        for user_id, home_loc in dataset.user_home_location_iter():

            if not user_id in all_users:
                continue

            # home_loc is a (lat,lon) tuple.  While this is accurate, we want to
            # coarsen the location data to decrease sparsity (i.e., more people
            # located in the same city location, despite slightly different
            # underlying lat/lon values).  Here, use the Geocoder to map the
            # lat/lon to a name and then back to a canonical lat/lon for that
            # name
            canonical_lat_lon = self.geocoder.canonicalize(home_loc[0], home_loc[1])

            location_counts[canonical_lat_lon] += 1

            user_to_home_loc[user_id] = canonical_lat_lon
            num_users_processed += 1
            if num_users_processed % 500000 == 0:
                LOGGER.debug(
                    "Processed %s of the %s users, associated %s a known location (%s)"
                    % (
                        num_users_processed,
                        len(all_users),
                        len(user_to_home_loc),
                        len(user_to_home_loc) / float(num_users_processed),
                    )
                )

        # Iterate through the locations pruning out those that do not occur more
        # than some threshold number of times
        num_locs_removed = 0
        for lat_lon, count in location_counts.iteritems():
            if count >= 20:
                self.unique_locations.add(lat_lon)
            else:
                num_locs_removed += 1
        LOGGER.debug(
            "Saw %d locations, %d with at least 5 users, %d to be pruned"
            % (len(location_counts), len(self.unique_locations), num_locs_removed)
        )

        # Remove the home locations of users whose locations aren't in the
        # pruned list of minimum-frequency locations
        num_user_home_locs_removed = 0
        for user_id, loc in user_to_home_loc.items():
            if not loc in self.unique_locations:
                del user_to_home_loc[user_id]
                num_user_home_locs_removed += 1
        LOGGER.debug(
            "After pruning removed home locations of %d users, %d still have homes"
            % (num_user_home_locs_removed, len(user_to_home_loc))
        )

        # Create a bi-directional mapping from locations to unique
        # numeric identifiers.  This mapping will be used when
        # representing locations in the classifier feature space and
        # when converting classifier output to specific locations
        location_to_id = {}
        for loc in self.unique_locations:
            id_ = len(location_to_id)
            location_to_id[loc] = id_
            self.id_to_location[id_] = loc

        # Associate each location with its set of features
        n = len(self.unique_locations)

        # Each location has 7 features associated with it for classifying a
        # user's location.  The seven features per location are arranged next to
        # each other in the feature space.
        feature_offset = 0
        for loc in self.unique_locations:
            # Feat1: its population bin (approximate city size)
            self.pop_bin_feature_indices[loc] = feature_offset
            # Feat2: the number of reciprocal friends
            self.reciprocal_feature_indices[loc] = feature_offset + 1
            # Feat3-7: the bins indicating how many friends were in reciprocal
            # triads in that city
            for bin_num in range(0, 5):
                feat = "%s,%s:%s" % (loc[0], loc[1], bin_num)
                self.triad_feature_indices[feat] = feature_offset + bin_num + 2
            # Increment the feature offset so the next city's features don't
            # collide with this city's indices
            feature_offset += 7
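        # For example, if three unique locations A, B and C are visited in that
        # order, A occupies feature columns 0-6, B columns 7-13 and C columns
        # 14-20; in general the k-th location seen occupies columns
        # [7*k, 7*k + 6].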

        # Set the total number of features seen
        self.total_num_features = feature_offset
        LOGGER.debug("Saw %d unique locations, %d total featurs" % (len(self.unique_locations), feature_offset))

        LOGGER.debug(
            "Associated %s of the %s users with a known location (%s unique)"
            % (len(user_to_home_loc), len(all_users), len(self.unique_locations))
        )

        # The list of locations for each corresponding user in X
        B = []

        # Train the classifier based on users with known home locations
        LOGGER.debug("Generating feature vectors for training")
        X = scipy.sparse.lil_matrix((len(user_to_home_loc), self.total_num_features), dtype=numpy.float64)
        row = 0
        total_nz = 0
        for user_id, location in user_to_home_loc.iteritems():

            # Skip users whose locations were omitted due to frequency filtering
            # or who have home locations but are not in the mention network
            # if not location in self.unique_locations or not user_id in all_users:
            #    continue

            # Fill the row in the matrix corresponding to this user's features
            nz = self.fill_user_vector(user_id, mention_network, user_to_home_loc, X, row)
            total_nz += nz

            # Get the index of this user's location
            location_id = location_to_id[location]
            B.append(location_id)
            row += 1
        X = X.tocsr()
        # X = X.toarray()

        LOGGER.debug(
            "Generated training data for %d users, %d nz features, %f on average"
            % (row, total_nz, float(total_nz) / row)
        )

        # Convert the location list into a numpy array for use with scikit
        Y = numpy.asarray(B)

        # Initialize the assignment counters up front so the summary logging at
        # the end works even if training is skipped for lack of data
        users_assigned = 0
        users_seen = 0
        if len(X.nonzero()[0]) == 0:
            LOGGER.warning(
                "Too little training data seen and no user had non-zero feature "
                + "values.  Cowardly aborting classification"
            )
        else:
            # Use SVM classifier with a linear kernel.
            #
            # NOTE NOTE NOTE NOTE
            #
            # The original paper uses an RBF kernel with their SVM.  However,
            # this proved impossibly slow during testing, so a linear kernel was
            # used instead.
            #
            # NOTE NOTE NOTE NOTE
            #
            # slow: self.location_classifier = svm.SVC(kernel='rbf')
            # self.location_classifier = svm.LinearSVC(dual=False)
            # self.location_classifier = svm.NuSVC(kernel='rbf', verbose=True, max_iter=1000)
            # self.location_classifier = naive_bayes.BernoulliNB()
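            # Note: loss="l2" is the older scikit-learn spelling of the squared
            # hinge loss; recent releases expect loss="squared_hinge" instead.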
            self.location_classifier = svm.LinearSVC(dual=False, loss="l2", penalty="l2", tol=1e-2)

            # Note: we expect the vector representations to be sparse, so avoid mean
            # scaling since it would create dense vectors, which would blow up the
            # memory consumption of the model
            self.location_vector_scaler = preprocessing.StandardScaler(with_mean=False)

            # Learn the scaling parameters and then rescale the input
            LOGGER.debug("Scaling feature vectors for training")
            X_scaled = self.location_vector_scaler.fit_transform(X.astype(numpy.float64))

            LOGGER.debug("Training classifier")
            self.location_classifier.fit(X_scaled, Y)
            LOGGER.debug("Finished training classifier")

            # Assign all the users some location, if we can figure it out
            users_assigned = 0
            users_seen = 0
            for user_id in all_users:
                users_seen += 1
                # If we know where to place this user, assign it to their home location
                if user_id in user_to_home_loc:
                    self.user_id_to_location[user_id] = user_to_home_loc[user_id]
                # Otherwise try to infer the location
                else:
                    location = self.infer_location(user_id, mention_network, user_to_home_loc)
                    if not location is None:
                        self.user_id_to_location[user_id] = location
                        users_assigned += 1

                if users_seen % 100000 == 0:
                    LOGGER.debug(
                        (
                            ("Saw %d/%d users, knew location of %d, " + "inferred the location of %d (total: %d)")
                            % (
                                users_seen,
                                len(all_users),
                                len(self.user_id_to_location) - users_assigned,
                                users_assigned,
                                len(self.user_id_to_location),
                            )
                        )
                    )

        LOGGER.debug(
            (
                ("Ultimately saw %d/%d users, knew location of %d, " + "inferred the location of %d (total: %d)")
                % (
                    users_seen,
                    len(all_users),
                    len(self.user_id_to_location) - users_assigned,
                    users_assigned,
                    len(self.user_id_to_location),
                )
            )
        )

        # Short circuit early if the caller has specified that the model is not
        # to be saved into a directory
        if model_dir is None:
            return Wheres_Wally_Model(self.user_id_to_location)

        if not os.path.exists(model_dir):
            os.mkdir(model_dir)

        # Write the .tsv for human debugability too
        fh = gzip.open(os.path.join(model_dir, "user-to-lat-lon.tsv.gz"), "w")
        for user_id, loc in self.user_id_to_location.iteritems():
            fh.write("%s\t%s\t%s\n" % (user_id, loc[0], loc[1]))
        fh.close()

        return Wheres_Wally_Model(self.user_id_to_location)

    def infer_location(self, user_id, mention_network, user_to_home_loc):
        """
        Infers and returns the location of the provided users based on their
        features in the network
        """

        # Ensure that the model has been trained; otherwise, report an
        # empty classification
        if self.location_vector_scaler is None or self.location_classifier is None:
            return None

        # Convert the user's network-based features into a numeric vector
        X = scipy.sparse.lil_matrix((1, self.total_num_features), dtype=numpy.float64)
        self.fill_user_vector(user_id, mention_network, user_to_home_loc, X, 0)
        X = X.tocsr()

        # Rescale the vector according to the training data's scaling
        user_vector_scaled = self.location_vector_scaler.transform(X)

        # Classify the results
        location_id = self.location_classifier.predict(user_vector_scaled)[0]

        # Convert the index into a location
        return self.id_to_location[location_id]

    def fill_user_vector(self, user_id, mention_network, user_to_home_loc, csr_matrix, row_to_fill):
        """         
        Creates a vector for the user and fills their data into the
        specified row in the provided matrix
        """
        feat_dict = self.create_user_vector(user_id, mention_network, user_to_home_loc)
        nz = 0
        for col, val in feat_dict.iteritems():
            csr_matrix[row_to_fill, col] = val
            nz += 1
        return nz

    def create_user_vector(self, user_id, mention_network, user_to_home_loc):
        """
        Creates a vector to use with SciPy that represents this user's features
        """

        # The binned location features look at all the locations of this user's
        # neighbors and then provide a weight for each location according to how
        # many of the user's friends are in that location multiplied by how
        # large the city is, which is represented as one of five bins

        location_to_friends = defaultdict(list)
        location_to_followers = defaultdict(list)
        num_friends = mention_network.degree(user_id)

        # Record which friends appear in each city
        for neighbor_id in mention_network.neighbors_iter(user_id):
            if neighbor_id in user_to_home_loc:
                location_name = user_to_home_loc[neighbor_id]
                location_to_friends[location_name].append(neighbor_id)
                location_to_followers[location_name].append(neighbor_id)

        # Since the vector is expected to be very sparse, create it as a dict
        # for the indices with non-zero feature values.
        classifier_input_vector = {}
        num_non_zero_features = 0

        # Each city/location generates 7 unique features in the best performing
        # system
        for city, followers_in_city in location_to_followers.iteritems():
            n = len(followers_in_city)

            # Feature 1: the city's bin multiplied by the number of users in the
            # city
            city_bin = self.get_city_bin(n)
            pop_bin_feature_index = self.pop_bin_feature_indices[city]
            classifier_input_vector[pop_bin_feature_index] = city_bin

        for city, friends_in_city in location_to_friends.iteritems():
            n = len(friends_in_city)

            # Feature 2: the percentage of friends with reciprocal edges at that
            # location
            num_reciprocal_friends = 0
            for n1 in friends_in_city:
                if mention_network.has_edge(n1, user_id):
                    num_reciprocal_friends += 1
                    num_non_zero_features += 1
            reciprocal_feature_index = self.reciprocal_feature_indices[city]
            # Use true division; with plain ints this would floor to 0 under Python 2
            classifier_input_vector[reciprocal_feature_index] = float(num_reciprocal_friends) / n
            if num_reciprocal_friends > 0:
                num_non_zero_features += 1

            # Features 3-7: the number of triads in the city
            triad_counter = collections.Counter()
            for n1 in friends_in_city:
                num_triads = 0
                for n2 in friends_in_city:
                    if mention_network.has_edge(n1, n2):
                        num_triads += 1

                # Decide which bin this user is in
                triad_counter[self.get_triad_bin(num_triads)] += 1

            for bin_num, count in triad_counter.iteritems():
                feat = "%s,%s:%s" % (city[0], city[1], bin_num)
                triad_bin_feature_index = self.triad_feature_indices[feat]
                classifier_input_vector[triad_bin_feature_index] = float(count) / num_friends
                if count > 0:
                    num_non_zero_features += 1

        return classifier_input_vector

    def get_triad_bin(self, num_triads):
        """
        Returns which bin this count of the number of triads should be in
        """
        # Bins in the paper [0,5,10,20,40]
        if num_triads < 5:
            return 0
        elif num_triads < 10:
            return 1
        elif num_triads < 20:
            return 2
        elif num_triads < 40:
            return 3
        else:
            return 4

    def get_city_bin(self, city_size):
        """
        Returns which bin this city size should be in
        """
        # Bins in the paper [1,2,4,12,57054]
        if city_size <= 1:
            return 0
        elif city_size <= 2:
            return 1
        elif city_size <= 4:
            return 2
        elif city_size <= 12:
            return 3
        # This should be 57054, but we use any value larger than 12 to
        # avoid the edge case where a city has more than 57k users
        else:
            return 4

    def load_model(self, model_dir, settings):
        """
        Reads in the Where's Wally model from a gzipped .tsv
        """

        user_id_to_location = {}
        model_file = gzip.open(os.path.join(model_dir, "user-to-lat-lon.tsv.gz"), "r")
        for line in model_file:
            cols = line.split("\t")
            user_id = cols[0]
            lat = float(cols[1])
            lon = float(cols[2])
            user_id_to_location[user_id] = (lat, lon)

        model_file.close()
        return Wheres_Wally_Model(user_id_to_location)
Example #8
0
    def train_model(self, settings, dataset, model_dir=None):
        # settings in the form
        '''{ 'LCR_min_dist' : the cutoff distance that distinguishes local from non-local contacts (default = 40 km, ~25 miles)
             'qntl_num' : the number of quantiles (default = 10)
             'min_geotag' : the minimum number of geotags that makes a user a target (default = 3)
             'min_samples_leaf' : the minimum number of samples required at a leaf of the regression tree, i.e. the tree will not create leaves smaller than this (default = 1000)
           }'''
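        # A hypothetical settings dict matching the defaults above might look like:
        #   settings = {'LCR_min_dist': 40, 'qntl_num': 10,
        #               'min_geotag': 3, 'min_samples_leaf': 1000}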
        self.min_dist = settings.pop('LCR_min_dist', 40)
        self.m = settings.pop('qntl_num', 10)
        self.min_geotag = settings.pop('min_geotag', 3)
        min_samp_leaf = settings.pop('min_samples_leaf', 1000)
        LOGGER.debug('tree')
        self.tree = DecisionTreeRegressor(
            min_samples_leaf=min_samp_leaf)  # the classifier
        LOGGER.debug('geocoder')
        #LocRes = Geocoder()
        if 'location_source' in settings:
            LocRes = Geocoder(dataset=settings['location_source'])
        else:
            LocRes = Geocoder()

        LOGGER.debug('loading mention network')
        self.X = dataset.mention_network(bidirectional=True,
                                         directed=True,
                                         weighted=True)
        #print len(self.X)
        #counter = set_counter('has at least %d geotags'%self.min_geotag) ### counter
        # adding users
        self.user_to_home_loc = {
            user: loc
            for (user, loc) in dataset.user_home_location_iter()
        }
        user_loc_list = self.user_to_home_loc.items()
        random.shuffle(user_loc_list)
        #print len(user_loc_list)
        #Take a sample from user home locations to estimate stgrEdges and actEdges
        start = time.time()
        user_loc_list = user_loc_list[:50000]
        #print 'home loc time:'
        #print len(user_loc_list)
        #fstgr = open('stgr_edges.tsv', 'w')
        c = 0
        LOGGER.debug('sampling stranger edges and actual edges')
        for uid1, loc1 in user_loc_list:
            #if c % 100 == 0:
            #	print c
            c2 = 0
            for uid2, loc2 in user_loc_list:
                if not c2 == c:
                    if self.X.has_edge(uid1, uid2):
                        self.actEdgesTuples.append((uid1, uid2))
                    distance = round(utils.distance(loc1, loc2), 1)
                    self.stgrEdges[distance] += 1
                c2 += 1
            c += 1
        #for distance in self.stgrEdges:
        #	fstgr.write(str(distance) + '\t' + str(self.stgrEdges[distance]) + '\n')
        #fstgr.close()
        #print len(self.actEdgesTuples)
        LOGGER.debug('filling network')
        for _id, loc in dataset.user_home_location_iter():
            #_id = user['user_id']
            #loc = UserProfilingMethod.dataset.user_home_location_iter()
            #loc, pd = utils.get_post_data(user['posts'])
            #l_a = utils.is_geocoded(user, self.min_geotag)
            #counter.update(loc) ### counter
            #if not self.X.__contains__(_id):
            #self.X.add_node(_id)
            if loc[0] == 0 and loc[1] == 0:
                continue
            else:
                try:
                    self.X.add_node(_id)
                except:
                    pass
            l_a = loc
            #if not l_a:	continue
            self.add_user_data(_id, l_a, {})
            le = utils.location_error(l_a, loc, LocRes)
            self.set_loc_err(_id, le)

            # remove mentions of itself
            if self.X.has_edge(_id, _id):
                self.X.rm_edge(_id, _id)

        LOGGER.debug("%d users" % len(self.X))
        LOGGER.debug("%d edges" % self.X.size())

        self.set_d_a_for_all()

        tempx = []
        tempy = []
        for u, x in self.iter_contacts():
            tempx.append(self.get_contact_vector(u, x))
            tempy.append(self.get_d_a(u, x))
        X = np.array(tempx)
        Y = np.array(tempy)
        #X = np.array([self.get_contact_vector(u,x)
        #				for u, x in self.iter_contacts()])
        #Y = np.array([self.get_d_a(u,x)
        #				for u, x in self.iter_contacts()])

        LOGGER.debug('number of relationships: ' + str(len(X)))

        LOGGER.debug("fitting")
        start = timeit.default_timer()
        #try:
        self.fit(X, Y)
        #except:
        #	raise RuntimeError, 'No connections to train on.'

        LOGGER.debug('done fitting tree - ' +
                     str(timeit.default_timer() - start) + ' sec')

        start = timeit.default_timer()
        self.quantile_boundaries(X)
        LOGGER.debug('done setting quantile boundaries - ' +
                     str(timeit.default_timer() - start) + ' sec')

        start = timeit.default_timer()
        self.fit_curves(Y)
        LOGGER.debug('done fitting curves - ' +
                     str(timeit.default_timer() - start) + ' sec')

        #self.model.allActEdges = self.allActEdges
        #self.model.stgrEdges = self.stgrEdges

        self.user_to_loc = self.infer_locs()
        if model_dir is not None:
            LOGGER.debug('saving model')
            filename = os.path.join(model_dir, "user-to-lat-lon.tsv.gz")
            fh = gzip.open(filename, 'w')
            for user_id, loc in self.user_to_loc.iteritems():
                if not loc is None:
                    fh.write("%s\t%s\t%s\n" % (user_id, loc[0], loc[1]))
            fh.close()

        self.model = FriendlyLocation_Model(self.user_to_loc)
        return self.model
Example #9
0
class MultiLocation(object):
    """
    MultiLocation implements the method from "Multiple Location Profiling for Users and Relationships
    from Social Network and Content" by Rui Li, Shengjie Wang and Kevin Chen-Chuan Chang.
    """
    def __init__(self, settings):
        """
        Initializing class variables.
        """
        # the mention network that will store inferred locations in node_data
        self.mention_network = MultiLocationMethod.dataset.bi_mention_network()
        self.nodes = set(self.mention_network.nodes())

        #self.u_n, self.u_star are the sets of users with unknown and known locations respectively.
        self.u_n = set()
        self.u_star = set()

        #the set of all known venues
        self.venues = set()

        #list of all locations, and the co-occurrences with a user.
        self.psi = Counter()

        #alpha and beta are the coefficients for eq.1 as per the paper
        self.alpha = -0.55
        self.beta = 0.0045

        #K is the total number of tweeting relationships
        self.K = 0

        #N_squared is the total number of user pairs
        self.N_squared = 0

        #S is the number of following relationships
        self.S = 0

        #geocoder is a forward/reverse geocoder for location -> lat/long and lat/lon -> location.
        if 'location_source' in settings:
            self.geocoder = Geocoder(dataset=settings['location_source'])
        else:
            self.geocoder = Geocoder()

        #F_r is the random following model Bernoulli distribution parameter
        self.F_r = None

        #T_r is the random tweeting model Bernoulli distribution parameter
        self.T_r = Counter()

        #mu and nu are the model selectors according to a bernoulli distribution
        self.mu = defaultdict(bool)
        self.nu = defaultdict(bool)

        #the multi-location list generated by the MLP
        self.user_multi_locations = defaultdict(list)

        #runs the model, populates all the variables and generates user_multi_locations
        self.run_model()

    def store_location_data(self):
        """
        Sets the node_data field with the relevant gold-standard location data from
        the bidirectional dataset.
        """
        num_users_seen = 0
        for user_id, loc in MultiLocationMethod.dataset.user_home_location_iter():
            if loc[0] == 0 and loc[1] == 0:
                continue
            try:
                self.mention_network.set_node_data(user_id, loc)
                self.u_star.add(user_id)
                num_users_seen += 1
                if num_users_seen % 100000 == 0:
                    logger.debug('Multilocation saw %d users' % num_users_seen)
            except KeyError:
                pass

    def find_locations(self):
        users_seen = 1
        for possible_posts in MultiLocationMethod.dataset.user_iter():
            users_seen += 1
            if users_seen % 1000000 == 0:
                logger.debug("Seen %d users" % users_seen)

            user_id = possible_posts['user_id']
            posts = possible_posts['posts']
            if len(posts) > 600: posts = posts[-600:]
            for post in posts:

                #twokenizer may be too computationally expensive here...
                #text = tokenizer(post['text'])
                text = post['text'].split()
                lc_text = []
                is_upper = []
                for s in text:
                    isup = s[0].isupper()
                    is_upper.append(isup)
                    if isup:
                        lc_text.append(s.lower())
                    else:
                        lc_text.append(s)

                i = 0
                n = len(text)
                while True:
                    if i >= n:
                        break

                    if not is_upper[i]:
                        i += 1
                        continue

                    is_up1 = i + 1 < n and is_upper[i + 1]
                    first_two_with_space = None
                    first_two_with_tab = None

                    if i + 2 < n and is_upper[i + 2] and is_up1:
                        w1 = lc_text[i]
                        w2 = lc_text[i + 1]
                        w3 = lc_text[i + 2]

                        first_two_with_space = w1 + " " + w2
                        s2 = first_two_with_space + " " + w3
                        location = self.geocoder.geocode(s2)
                        if not location is None:
                            self.record_user_location(s2, location, user_id)
                            i += 3
                            continue

                        s3 = first_two_with_space + "\t" + w3
                        location = self.geocoder.geocode(s3)
                        if not location is None:
                            self.record_user_location(s3, location, user_id)
                            i += 3
                            continue

                        first_two_with_tab = w1 + "\t" + w2
                        s4 = first_two_with_tab + "\t" + w3
                        location = self.geocoder.geocode(s4)
                        if not location is None:
                            self.record_user_location(s4, location, user_id)
                            i += 3
                            continue

                        s5 = first_two_with_tab + " " + w3
                        location = self.geocoder.geocode(s5)
                        if not location is None:
                            self.record_user_location(s5, location, user_id)
                            i += 3
                            continue

                    elif i + 1 < n and is_up1:
                        w1 = lc_text[i]
                        w2 = lc_text[i + 1]

                        if first_two_with_tab is None:
                            first_two_with_tab = w1 + "\t" + w2

                        location = self.geocoder.geocode(first_two_with_tab)
                        if not location is None:
                            self.record_user_location(first_two_with_tab,
                                                      location, user_id)
                            i += 2
                            continue

                        if first_two_with_space is None:
                            first_two_with_space = w1 + " " + w2
                        location = self.geocoder.geocode(first_two_with_space)
                        if not location is None:
                            self.record_user_location(first_two_with_space,
                                                      location, user_id)
                            i += 2
                            continue

                    else:
                        w1 = lc_text[i]
                        location = self.geocoder.geocode(w1)
                        if not location is None:
                            self.record_user_location(w1, location, user_id)

                    i += 1

    def record_user_location(self, location_name, location, user_id):
        try:
            self.mention_network.add_edge(user_id, location_name)
            self.mention_network.set_node_data(location_name, location)
        except:
            return
        self.venues.add(location_name)
        self.psi[location] += 1
        self.T_r[user_id] += 1
        self.K += 1
        return

    def compute_coefficients(self):
        """
        Computes the coefficients for equation (1) from the paper,
        P(f<i,j>|alpha,beta,x_i,y_i) = beta*distance(x_i,y_i)^alpha
        """
        def func_to_fit(x, a, b):
            return b * x**a
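        # In other words, a power law y = b * x**a is fit to the observed
        # (distance, following-ratio) points collected below, and the fitted
        # a and b become the alpha and beta of eq. (1).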

        mentions_per_distance = Counter()
        following_relationship = Counter()

        # our networks are too large to generate these coefficients on each call...
        # this is about the same number of combinations as shown in the paper...
        n = 10000000
        #random_sample = random.sample(list(self.u_star),n)
        random_sample = list(self.u_star)
        number_of_users = len(self.u_star)

        # processed_combinations = 0
        # start_time = time.time()
        #for node_u, node_v in combinations(random_sample,2):
        for i in range(0, n):
            node_u = random_sample[random.randint(0, number_of_users - 1)]
            node_v = random_sample[random.randint(0, number_of_users - 1)]
            if node_u == node_v: continue
            # if processed_combinations % 1000000 == 0:
            #     logger.debug("Took %f to process %d combinations..." % ((time.time() - start_time), processed_combinations))
            # processed_combinations += 1
            l_u = self.mention_network.node_data(node_u)
            l_v = self.mention_network.node_data(node_v)
            distance = round(haversine(l_u, l_v, miles=True), 0)
            if distance > 10000:
                continue
            mentions_per_distance[distance] += 1.0
            self.N_squared += 1.0
            if self.mention_network.has_edge(node_u, node_v):
                following_relationship[distance] += 1.0
                self.S += 1.0

        # For each distance bucket (in sorted order so x and y stay aligned),
        # compute the "ratio of the number of pairs that have following
        # relationship to the total number of pairs in the d_th bucket"
        x = sorted(mentions_per_distance.keys())
        y = [following_relationship[d] / mentions_per_distance[d] for d in x]
        # Nudge the smallest distance away from zero so the power-law fit is defined
        x[0] += 1e-8

        solutions = curve_fit(func_to_fit,
                              x,
                              y,
                              p0=[-0.55, 0.0045],
                              maxfev=100000)[0]

        self.alpha = solutions[0]
        self.beta = solutions[1]
        return

    def generate_model_selector(self):
        for user in self.u_n:
            # generate the model selector mu according to a Bernoulli distribution
            if np.random.binomial(1, self.F_r) == 1:
                self.mu[user] = True
            else:
                self.mu[user] = False

            # T_r[user] is a raw count, so normalize it by K here to obtain a
            # Bernoulli parameter for nu
            if np.random.binomial(1, float(self.T_r[user]) / self.K) == 1:
                self.nu[user] = True
            else:
                self.nu[user] = False

    def random_following_model(self, user):
        """
        If mu = 1, we choose the random following model using p(f<i,j> == 1 | F_r)
        to decide if the location of a neighbor of the user is a possible location.
        """
        for neighbor in self.mention_network.neighbors_iter(user):
            if neighbor not in self.u_star:
                continue
            elif np.random.binomial(1, self.F_r):
                self.user_multi_locations[user].append(
                    self.mention_network.node_data(neighbor))
        return

    def following_model(self, user):
        """
        If mu = 0, we decide whether there is f<i,j> based on the location-based following model as shown
        in eq. 1
        """

        #(note: this is almost the same as the Backstrom paper, thus I'll ignore generating
        #the theta values and just calculate max probability)
        def calculate_probability(l_u, l_v):
            """
            Calculates the probability, P(f<i,j>|alpha,beta,location_1,location_2)
            """
            try:
                return self.beta * (abs(haversine(l_u, l_v)))**(self.alpha)
            except:
                # a zero distance would make the power law blow up, so fall
                # back to a very small distance instead
                return self.beta * (0.00000001)**self.alpha

        best_log_probability = float('-inf')
        best_location = None
        for neighbor_u in self.mention_network.neighbors_iter(user):
            log_probability = 0
            if neighbor_u not in self.u_star:
                continue
            for neighbor_v in self.mention_network.neighbors_iter(neighbor_u):
                if neighbor_v not in self.u_star:
                    continue
                else:
                    l_u = self.mention_network.node_data(neighbor_u)
                    l_v = self.mention_network.node_data(neighbor_v)
                    plu_lv = calculate_probability(l_u, l_v)
                    try:
                        log_gamma_lu = math.log((plu_lv / (1 - plu_lv)))
                    except ValueError:
                        # when the two locations are very close, plu_lv can reach
                        # 1 or more and the log argument becomes non-positive, so
                        # treat the contribution as neutral
                        log_gamma_lu = 0
                    log_probability += log_gamma_lu
            if log_probability > best_log_probability:
                best_log_probability = log_probability
                best_location = self.mention_network.node_data(neighbor_u)
        if best_location:
            self.user_multi_locations[user].append(best_location)
        return

    def random_tweeting_model(self, user):
        for venue in self.mention_network.neighbors_iter(user):
            if venue not in self.venues:
                continue
            # T_r[user] is a raw count; normalize by K (as in generate_model_selector)
            # so it is a valid Bernoulli parameter
            elif np.random.binomial(1, float(self.T_r[user]) / self.K):
                self.user_multi_locations[user].append(
                    self.mention_network.node_data(venue))
        return

    def tweeting_model(self, user):
        best_probability = float("-inf")
        best_venue = None

        for venue in self.mention_network.neighbors_iter(user):
            if venue not in self.venues:
                continue
            probability = self.psi[venue]
            if best_probability < probability:
                best_probability = probability
                best_venue = venue

        if best_venue:
            self.user_multi_locations[user].append(
                self.mention_network.node_data(best_venue))

        return

    def run_model(self):
        """
        run_model generates the values for all the initialized class variables, and
        follows the MLP algorithm described in the paper to infer locations for
        users.
        """

        #NOTE: K is not normalized to save computations, and is normalized on the fly in "generate_model_selector"
        #self.populate_mention_network()

        logger.debug("Variables have been initialized. Starting the model.")
        logger.debug("Storing location data...")
        self.store_location_data()
        self.u_n = self.nodes.difference(self.u_star)
        logger.debug("Location data stored!")

        logger.debug("Starting to compute the coefficients for the model...")
        #calculates the coefficients to be used in eq.1, alpha and beta
        self.compute_coefficients()
        logger.debug(
            "Coefficients have been calculated. Alpha: %f and beta: %f." %
            (self.alpha, self.beta))

        logger.debug("Finding venue data..")
        self.find_locations()

        for venue in self.psi:
            self.psi[venue] = float(self.psi[venue]) / self.K

        logger.debug("Finished finding venue data! %d venues found!" %
                     len(self.venues))

        #self.N_squared = len(self.mention_network.edges_())
        #p(f<i,j> = 1 | F_r) = S / N^2
        self.F_r = (self.S / self.N_squared)

        #Section 4.4, generate model selector based on bernoulli distributions using T_r and F_r
        logger.debug("Generating model selectors...")
        self.generate_model_selector()
        logger.debug("Model selectors have been generated!")

        logger.debug("Starting to find user locations...")

        for user in self.u_n:
            if self.mu[user]:
                self.random_following_model(user)
            else:
                self.following_model(user)

            if self.nu[user]:
                self.random_tweeting_model(user)
            else:
                self.tweeting_model(user)

        logger.debug("Finished finding user locations...")

        for user in self.user_multi_locations:
            location_list = self.user_multi_locations[user]
            location = self.get_geometric_mean(location_list)
            self.mention_network.set_node_data(user, location)

    def return_network(self):
        return self.mention_network

    def get_geometric_mean(self, locations):
        """
        Locates the geometric median of a list of locations (adapted from David Jurgens's implementation).
        With fewer than two locations the single location is returned; otherwise the point that minimizes
        the summed distance to all other points is selected.
        """

        n = len(locations)

        # With fewer than two points there is nothing to choose between, so
        # just return the single point directly
        if n < 2:
            return locations[np.random.randint(0, n)]

        min_distance_sum = float('inf')
        median = None  # Point type

        # Loop through all the points, finding the point that minimizes the
        # geodetic distance to all other points.  By construction median will
        # always be assigned to some non-None value by the end of the loop.
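        # For instance, given two points near New York and one near Los Angeles,
        # one of the two New York points is returned, since it minimizes the
        # summed haversine distance to the other points.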
        for i in range(0, n):
            p1 = locations[i]
            dist_sum = 0
            for j in range(0, n):
                p2 = locations[j]
                # Skip self-comparison
                if i == j:
                    continue
                dist = haversine(p1, p2)
                dist_sum += dist

                # Short-circuit early if it's clear that this point cannot be
                # the median since it does not minimize the distance sum
                if dist_sum > min_distance_sum:
                    break

            if dist_sum < min_distance_sum:
                min_distance_sum = dist_sum
                median = p1

        return median