Python Geocoder.canonicalize Beispiele

Programmiersprache: Python

Namespace / Paketname: geolocate.geocoder

Klasse / Typ: Geocoder

Methode / Funktion: canonicalize

Beispiele auf hotexamples.com: 2

Python Geocoder.canonicalize - 2 Beispiele gefunden. Dies sind die am besten bewerteten Python Beispiele für die geolocate.geocoder.Geocoder.canonicalize, die aus Open Source-Projekten extrahiert wurden. Sie können Beispiele bewerten, um die Qualität der Beispiele zu verbessern.

Häufig verwendete Methoden

Anzeigen Verbergen

Geocoder(3)

canonicalize(1)

geocode(1)

Häufig verwendete Methoden

Geocoder (3)

canonicalize (1)

geocode (1)

Beispiel #1

Datei anzeigen

Datei: method.py Projekt: ymt123/geoinference

class Wheres_Wally(GIMethod):
    def __init__(self):
        # Location is represented as a lat/lon geopy Point
        self.user_id_to_location = {}
        self.geocoder = None;
        self.unique_locations = set()
        self.id_to_location = {}

        # Mappings from feature names to their corresponding indices in a
        # feature vector
        self.pop_bin_feature_indices = {}
        self.reciprocal_feature_indices = {}
        self.triad_feature_indices = {}
        self.total_num_features = 0
        
        # The SVM classifier and feature vector scaler
        self.location_classifier = None
        self.location_vector_scaler = None



    def train_model(self, settings, dataset, model_dir):

        # Initialize the geocoder, which we'll use to resolve location strings.
        # We use the default name-to-location mapping unless the user has
        # specified otherwise.
        if 'location_source' in settings:
            self.geocoder = Geocoder(dataset=settings['location_source'])
        else:
            self.geocoder = Geocoder()


        # NOTE: The original paper used the directional friends/followers
        # network.  However, the paper was tested on a much smaller network
        # (9.8M edges), which doesn't scale when including the full network.  We
        # opt for using the bi-directional networks as these (1) provide a
        # stronger signal of social relationships and (2) significantly reduce
        # the memory requirement.
        LOGGER.debug('Loading mention network')        
        mention_network = dataset.bi_mention_network()

        # This dict will contain a mapping from user ID to an associated home
        # location, which is derived either from the location field (as in the
        # original paper), from GPS-tagged tweets, or from both
        user_to_home_loc = {}
        
        # For each of the users that we have in the network, see if we can
        # associate that user with a home location.
        all_users = set(mention_network.nodes_iter())
        
        LOGGER.debug('Calculating users with recognizable home location')
        num_users_processed = 0

        # Keep track of how many times each location occurred.  We'll filter
        # this down to only the most common locations
        location_counts = collections.Counter() 

        for user_id, home_loc in dataset.user_home_location_iter():
            
            if not user_id in all_users:
                continue
            
            # home_loc is a (lat,lon) tuple.  While this is accurate, we want to
            # coarsen the location data to decrease sparsity (i.e., more people
            # located in the same city location, despite slightly different
            # underlying lat/lon values).  Here, use the Geocoder to map the
            # lat/lon to a name and then back to a canonical lat/lon for that
            # name           
            canonical_lat_lon = self.geocoder.canonicalize(home_loc[0], home_loc[1])

            location_counts[canonical_lat_lon] += 1

            user_to_home_loc[user_id] = canonical_lat_lon
            num_users_processed += 1
            if num_users_processed % 500000 == 0:
                LOGGER.debug('Processed %s of the %s users, associated %s a known location (%s)'
                             % (num_users_processed, len(all_users), len(user_to_home_loc),
                                len(user_to_home_loc) / float(num_users_processed)))

        # Iterate through the locations pruning out those that do not occur more
        # than some threshold number of times
        num_locs_removed = 0
        for lat_lon, count in location_counts.iteritems():
            if count >= 20:
                self.unique_locations.add(lat_lon)
            else:
                num_locs_removed += 1
        LOGGER.debug('Saw %d locations, %d with at least 5 users, %d to be pruned'
                     % (len(location_counts), len(self.unique_locations), num_locs_removed))


        # Remove the home locations of users whose locations aren't in the
        # pruned list of minimum-frequency locations
        num_user_home_locs_removed = 0
        for user_id, loc in user_to_home_loc.items():
            if not loc in self.unique_locations:
                del user_to_home_loc[user_id]
                num_user_home_locs_removed += 1
        LOGGER.debug('After pruning removed home locations of %d users, %d still have homes'
                     % (num_user_home_locs_removed, len(user_to_home_loc)))
                

        # Create a bi-directional mapping from locations to unique
        # numeric identifiers.  This mapping will be used when
        # representing locations in the classifier feature space and
        # when converting classifier output to specific locations
        location_to_id = {}
        for loc in self.unique_locations:
            id_ = len(location_to_id)
            location_to_id[loc] = id_
            self.id_to_location[id_] = loc

        # Associate each location with its set of features
        n = len(self.unique_locations)

        # Each location has 7 features associated with it for classifying a
        # user's location.  The seven features per location are arranged next to
        # each other in the feature space.
        feature_offset = 0
        for loc in self.unique_locations:
            # Feat1: it's population bin (size approx.)
            self.pop_bin_feature_indices[loc] = feature_offset
            # Feat2: the number of reciprocal friends
            self.reciprocal_feature_indices[loc] = feature_offset + 1
            # Feat3-7: the bins indicating how many friends were in reciprocal
            # triads in that city
            for bin_num in range(0, 5):
                feat = "%s,%s:%s" % (loc[0], loc[1], bin_num)
                self.triad_feature_indices[feat] = feature_offset + bin_num + 2
            # Increment the feature offset so the next city's features don't
            # collide with this city's indices 
            feature_offset += 7
        
        # Set the total number of features seen 
        self.total_num_features = feature_offset
        LOGGER.debug('Saw %d unique locations, %d total featurs' 
                     % (len(self.unique_locations), feature_offset))

        LOGGER.debug('Associated %s of the %s users with a known location (%s unique)'
                     % (len(user_to_home_loc), len(all_users), len(self.unique_locations)))

        # The list of locations for each corresponding user in X
        B = []
        
        # Train the classifier based on users with known home locations
        LOGGER.debug("Generating feature vectors for training")
        X = scipy.sparse.lil_matrix((len(user_to_home_loc), 
                                     self.total_num_features), dtype=numpy.float64)
	print X
        row = 0
        total_nz = 0
        for user_id, location in user_to_home_loc.iteritems():

            # Skip users whose locations were omitted due to frequency filtering
            # or who have home locations but are not in the mention network
            #if not location in self.unique_locations or not user_id in all_users:
            #    continue

            # Fill the row in the matrix corresponding to this user's features
            nz = self.fill_user_vector(user_id, mention_network,
                                       user_to_home_loc, X, row)
            total_nz += nz
            
            # Get the index of this user's location
            location_id = location_to_id[location]
            B.append(location_id)
            row += 1
        X = X.tocsr()
        #X = X.toarray()

        LOGGER.debug("Generated training data for %d users, %d nz features, %f on average"
                     % (row, total_nz, float(total_nz) / row))
        

        # Convert the location list into a numpy array for use with scikit
        Y = numpy.asarray(B)

        if len(X.nonzero()[0]) == 0:
            LOGGER.warning("Too little training data seen and no user had non-zero feature "+
                           "values.  Cowardly aborting classification")
        else:
            # Use SVM classifier with a linear kernel.
            #
            # NOTE NOTE NOTE NOTE
            #
            # The original paper uses an RBF kernel with their SVM.  However,
            # this proved impossibly slow during testing, so a linear kernel was
            # used instead.  
            #
            # NOTE NOTE NOTE NOTE
            #
            # slow: self.location_classifier = svm.SVC(kernel='rbf')
            #self.location_classifier = svm.LinearSVC(dual=False)
            #self.location_classifier = svm.NuSVC(kernel='rbf', verbose=True, max_iter=1000)
            #self.location_classifier = naive_bayes.BernoulliNB()
            self.location_classifier = svm.LinearSVC(dual=False, loss='l2', penalty="l2",
                                                     tol=1e-2)

            # Note: we expect the vector representations to be sparse, so avoid mean
            # scaling since it would create dense vectors, which would blow up the
            # memory consumption of the model
            self.location_vector_scaler = preprocessing.StandardScaler(with_mean=False)
            
            # Learn the scaling parameters and then rescale the input            
            LOGGER.debug("Scaling feature vectors for training")
            X_scaled = self.location_vector_scaler.fit_transform(X.astype(numpy.float64))

            LOGGER.debug("Training classifier")
            self.location_classifier.fit(X_scaled, Y)
            LOGGER.debug("Finished training classifier")

            # Assign all the users some location, if we can figure it out
            users_assigned = 0
            users_seen = 0
            for user_id in all_users:
                users_seen += 1
                # If we know where to place this user, assign it to their home location
                if user_id in user_to_home_loc:
                    self.user_id_to_location[user_id] = user_to_home_loc[user_id]
                # Otherwise try to infer the location
                else:
                    location = self.infer_location(user_id, mention_network,
                                                   user_to_home_loc)
                    if not location is None:
                        self.user_id_to_location[user_id] = location
                        users_assigned += 1

                if users_seen % 100000 == 0:
                    LOGGER.debug((("Saw %d/%d users, knew location of %d, " +
                                   "inferred the location of %d (total: %d)")
                                  % (users_seen, len(all_users),
                                     len(self.user_id_to_location) - users_assigned,
                                     users_assigned,
                                     len(self.user_id_to_location))))

        LOGGER.debug((("Ultimately saw %d/%d users, knew location of %d, " +
                       "inferred the location of %d (total: %d)")
                      % (users_seen, len(all_users),
                         len(self.user_id_to_location) - users_assigned,
                         users_assigned,
                         len(self.user_id_to_location))))
                        

        # Short circuit early if the caller has specified that the model is not
        # to be saved into a directory
        if model_dir is None:
            return Wheres_Wally_Model(self.user_id_to_location)

        if not os.path.exists(model_dir):
            os.mkdir(model_dir)         

        # Write the .tsv for human debugability too
        fh = gzip.open(os.path.join(model_dir, 'user-to-lat-lon.tsv.gz'), 'w')
        for user_id, loc in self.user_id_to_location.iteritems():
            fh.write("%s\t%s\t%s\n" % (user_id, loc[0], loc[1]));
        fh.close()

        return Wheres_Wally_Model(self.user_id_to_location)
               

    def infer_location(self, user_id, mention_network, user_to_home_loc):
        """
        Infers and returns the location of the provided users based on their
        features in the network
        """
        
        # Ensure that the model has been trained; otherwise, report an
        # empty classification
        if self.location_vector_scaler is None or self.location_classifier is None:
            return None

        # Convert the user's network-based features into a numeric vector
        X = scipy.sparse.lil_matrix((1, self.total_num_features), dtype=numpy.float64)
        self.fill_user_vector(user_id, mention_network, user_to_home_loc, X, 0)
        X = X.tocsr()                

        # Rescale the vector according to the training data's scaling
        user_vector_scaled = self.location_vector_scaler.transform(X)
        
        # Classify the results
        location_id = self.location_classifier.predict(user_vector_scaled)[0]
                
        # Convert the index into a location
        return self.id_to_location[location_id]
        

    def fill_user_vector(self, user_id, mention_network, user_to_home_loc,
                         csr_matrix, row_to_fill):
        """         
        Creates a vector for the user and fills their data into the
        specified row in the provided matrix
        """
        feat_dict = self.create_user_vector(user_id, mention_network, 
                                            user_to_home_loc)
        nz = 0
        for col, val in feat_dict.iteritems():
            csr_matrix[row_to_fill, col] = val
            nz += 1
        return nz


    def create_user_vector(self, user_id, mention_network, user_to_home_loc):
        """
        Creates a vector to use with SciPy that represents this user's features
        """

        # The binned location features look at all the locations of this user's
        # neighbors and then provide a weight for each location according to how
        # many of the user's friends are in that location multiplied by how
        # large the city is, which is represented as one of five bins

        location_to_friends = defaultdict(list)
        location_to_followers = defaultdict(list)
        num_friends = mention_network.degree(user_id)

        # Record which friend appear in each city
        for neighbor_id in mention_network.neighbors_iter(user_id):
            if neighbor_id in user_to_home_loc:
                location_name = user_to_home_loc[neighbor_id]
                location_to_friends[location_name].append(neighbor_id)
		location_to_followers[location_name].append(neighbor_id)


        # Since the vector is expected to be very sparse, create it as a dict
        # for the indices with non-zero feature values.
        classifier_input_vector = {}
        num_non_zero_features = 0

        # Each city/location generates 7 unique features in the best performing
        # system
        for city, followers_in_city in location_to_followers.iteritems():
            n = len(followers_in_city)

            # Feature 1: the city's bin multiplied by the number of users in the
            # city
            city_bin = self.get_city_bin(n)
            pop_bin_feature_index = self.pop_bin_feature_indices[city]
            classifier_input_vector[pop_bin_feature_index] = city_bin

        for city, friends_in_city in location_to_friends.iteritems():
            n = len(friends_in_city)

            # Feature 2: the percentage of friends with reciprocal edges at that
            # location
            num_reciprocal_friends = 0
            for n1 in friends_in_city:
                if mention_network.has_edge(n1, user_id):
                    num_reciprocal_friends += 1
                    num_non_zero_features += 1
            reciprocal_feature_index = self.reciprocal_feature_indices[city]
            classifier_input_vector[reciprocal_feature_index] = num_reciprocal_friends / n
            if num_reciprocal_friends > 0:
                num_non_zero_features += 1
                    
            # Features 3-7: the number of triads in the city
            triad_counter = collections.Counter()
            for n1 in friends_in_city:
                num_triads = 0
                for n2 in friends_in_city:
                    if mention_network.has_edge(n1, n2):
                        num_triads += 1

                # Decide which bin this user is in
                triad_counter[self.get_triad_bin(num_triads)] += 1

            for bin_num, count in triad_counter.iteritems():
                feat = "%s,%s:%s" % (city[0], city[1], bin_num)
                triad_bin_feature_index = self.triad_feature_indices[feat]
                classifier_input_vector[triad_bin_feature_index] = count / num_friends
                if count > 0:
                    num_non_zero_features += 1

        return classifier_input_vector
                

    def get_triad_bin(self, num_triads):
        """
        Returns which bin this count of the number of triads should be in
        """
        # Bins in the paper [0,5,10,20,40]
        if num_triads < 5:
            return 0
        elif num_triads < 10:
            return 1
        elif num_triads < 20:
            return 2
        elif num_triads < 40:
            return 3
        else:
            return 4

    def get_city_bin(self, city_size):
        """
        Returns which bin this count of the number of triads should be in
        """
        # Bins in the paper [1,2,4,12,57054] 
        if city_size <= 1:
            return 0
        elif city_size <= 2:
            return 1
        elif city_size <= 4:
            return 2
        elif city_size <= 12:
            return 3
        # This sould be 57054, but we use any value larger than 12 to
        # avoid the edge case where a city has more than 57k users
        else: 
            return 4

    def load_model(self, model_dir, settings):
        """
        Reads in the Where's Wally model from a gzipped .tsv
        """      

        user_id_to_location = {}
        model_file = gzip.open(os.path.join(model_dir, "user-to-lat-lon.tsv.gz"), 'r')
        for line in model_file:
            cols = line.split("\t")
            user_id = cols[0]
            lat = float(cols[1])
            lon = float(cols[2])
            user_id_to_location[user_id] = (lat, lon)

        model_file.close()
        return Wheres_Wally_Model(user_id_to_location)

Beispiel #2

Datei anzeigen

Datei: method.py Projekt: ConnorMcMahon/geoinference

class Wheres_Wally(GIMethod):
    def __init__(self):
        # Location is represented as a lat/lon geopy Point
        self.user_id_to_location = {}
        self.geocoder = None
        self.unique_locations = set()
        self.id_to_location = {}

        # Mappings from feature names to their corresponding indices in a
        # feature vector
        self.pop_bin_feature_indices = {}
        self.reciprocal_feature_indices = {}
        self.triad_feature_indices = {}
        self.total_num_features = 0

        # The SVM classifier and feature vector scaler
        self.location_classifier = None
        self.location_vector_scaler = None

    def train_model(self, settings, dataset, model_dir):

        # Initialize the geocoder, which we'll use to resolve location strings.
        # We use the default name-to-location mapping unless the user has
        # specified otherwise.
        if "location_source" in settings:
            self.geocoder = Geocoder(dataset=settings["location_source"])
        else:
            self.geocoder = Geocoder()

        # NOTE: The original paper used the directional friends/followers
        # network.  However, the paper was tested on a much smaller network
        # (9.8M edges), which doesn't scale when including the full network.  We
        # opt for using the bi-directional networks as these (1) provide a
        # stronger signal of social relationships and (2) significantly reduce
        # the memory requirement.
        LOGGER.debug("Loading mention network")
        mention_network = dataset.bi_mention_network()

        # This dict will contain a mapping from user ID to an associated home
        # location, which is derived either from the location field (as in the
        # original paper), from GPS-tagged tweets, or from both
        user_to_home_loc = {}

        # For each of the users that we have in the network, see if we can
        # associate that user with a home location.
        all_users = set(mention_network.nodes_iter())

        LOGGER.debug("Calculating users with recognizable home location")
        num_users_processed = 0

        # Keep track of how many times each location occurred.  We'll filter
        # this down to only the most common locations
        location_counts = collections.Counter()

        for user_id, home_loc in dataset.user_home_location_iter():

            if not user_id in all_users:
                continue

            # home_loc is a (lat,lon) tuple.  While this is accurate, we want to
            # coarsen the location data to decrease sparsity (i.e., more people
            # located in the same city location, despite slightly different
            # underlying lat/lon values).  Here, use the Geocoder to map the
            # lat/lon to a name and then back to a canonical lat/lon for that
            # name
            canonical_lat_lon = self.geocoder.canonicalize(home_loc[0], home_loc[1])

            location_counts[canonical_lat_lon] += 1

            user_to_home_loc[user_id] = canonical_lat_lon
            num_users_processed += 1
            if num_users_processed % 500000 == 0:
                LOGGER.debug(
                    "Processed %s of the %s users, associated %s a known location (%s)"
                    % (
                        num_users_processed,
                        len(all_users),
                        len(user_to_home_loc),
                        len(user_to_home_loc) / float(num_users_processed),
                    )
                )

        # Iterate through the locations pruning out those that do not occur more
        # than some threshold number of times
        num_locs_removed = 0
        for lat_lon, count in location_counts.iteritems():
            if count >= 20:
                self.unique_locations.add(lat_lon)
            else:
                num_locs_removed += 1
        LOGGER.debug(
            "Saw %d locations, %d with at least 5 users, %d to be pruned"
            % (len(location_counts), len(self.unique_locations), num_locs_removed)
        )

        # Remove the home locations of users whose locations aren't in the
        # pruned list of minimum-frequency locations
        num_user_home_locs_removed = 0
        for user_id, loc in user_to_home_loc.items():
            if not loc in self.unique_locations:
                del user_to_home_loc[user_id]
                num_user_home_locs_removed += 1
        LOGGER.debug(
            "After pruning removed home locations of %d users, %d still have homes"
            % (num_user_home_locs_removed, len(user_to_home_loc))
        )

        # Create a bi-directional mapping from locations to unique
        # numeric identifiers.  This mapping will be used when
        # representing locations in the classifier feature space and
        # when converting classifier output to specific locations
        location_to_id = {}
        for loc in self.unique_locations:
            id_ = len(location_to_id)
            location_to_id[loc] = id_
            self.id_to_location[id_] = loc

        # Associate each location with its set of features
        n = len(self.unique_locations)

        # Each location has 7 features associated with it for classifying a
        # user's location.  The seven features per location are arranged next to
        # each other in the feature space.
        feature_offset = 0
        for loc in self.unique_locations:
            # Feat1: it's population bin (size approx.)
            self.pop_bin_feature_indices[loc] = feature_offset
            # Feat2: the number of reciprocal friends
            self.reciprocal_feature_indices[loc] = feature_offset + 1
            # Feat3-7: the bins indicating how many friends were in reciprocal
            # triads in that city
            for bin_num in range(0, 5):
                feat = "%s,%s:%s" % (loc[0], loc[1], bin_num)
                self.triad_feature_indices[feat] = feature_offset + bin_num + 2
            # Increment the feature offset so the next city's features don't
            # collide with this city's indices
            feature_offset += 7

        # Set the total number of features seen
        self.total_num_features = feature_offset
        LOGGER.debug("Saw %d unique locations, %d total featurs" % (len(self.unique_locations), feature_offset))

        LOGGER.debug(
            "Associated %s of the %s users with a known location (%s unique)"
            % (len(user_to_home_loc), len(all_users), len(self.unique_locations))
        )

        # The list of locations for each corresponding user in X
        B = []

        # Train the classifier based on users with known home locations
        LOGGER.debug("Generating feature vectors for training")
        X = scipy.sparse.lil_matrix((len(user_to_home_loc), self.total_num_features), dtype=numpy.float64)
        print X
        row = 0
        total_nz = 0
        for user_id, location in user_to_home_loc.iteritems():

            # Skip users whose locations were omitted due to frequency filtering
            # or who have home locations but are not in the mention network
            # if not location in self.unique_locations or not user_id in all_users:
            #    continue

            # Fill the row in the matrix corresponding to this user's features
            nz = self.fill_user_vector(user_id, mention_network, user_to_home_loc, X, row)
            total_nz += nz

            # Get the index of this user's location
            location_id = location_to_id[location]
            B.append(location_id)
            row += 1
        X = X.tocsr()
        # X = X.toarray()

        LOGGER.debug(
            "Generated training data for %d users, %d nz features, %f on average"
            % (row, total_nz, float(total_nz) / row)
        )

        # Convert the location list into a numpy array for use with scikit
        Y = numpy.asarray(B)

        if len(X.nonzero()[0]) == 0:
            LOGGER.warning(
                "Too little training data seen and no user had non-zero feature "
                + "values.  Cowardly aborting classification"
            )
        else:
            # Use SVM classifier with a linear kernel.
            #
            # NOTE NOTE NOTE NOTE
            #
            # The original paper uses an RBF kernel with their SVM.  However,
            # this proved impossibly slow during testing, so a linear kernel was
            # used instead.
            #
            # NOTE NOTE NOTE NOTE
            #
            # slow: self.location_classifier = svm.SVC(kernel='rbf')
            # self.location_classifier = svm.LinearSVC(dual=False)
            # self.location_classifier = svm.NuSVC(kernel='rbf', verbose=True, max_iter=1000)
            # self.location_classifier = naive_bayes.BernoulliNB()
            self.location_classifier = svm.LinearSVC(dual=False, loss="l2", penalty="l2", tol=1e-2)

            # Note: we expect the vector representations to be sparse, so avoid mean
            # scaling since it would create dense vectors, which would blow up the
            # memory consumption of the model
            self.location_vector_scaler = preprocessing.StandardScaler(with_mean=False)

            # Learn the scaling parameters and then rescale the input
            LOGGER.debug("Scaling feature vectors for training")
            X_scaled = self.location_vector_scaler.fit_transform(X.astype(numpy.float64))

            LOGGER.debug("Training classifier")
            self.location_classifier.fit(X_scaled, Y)
            LOGGER.debug("Finished training classifier")

            # Assign all the users some location, if we can figure it out
            users_assigned = 0
            users_seen = 0
            for user_id in all_users:
                users_seen += 1
                # If we know where to place this user, assign it to their home location
                if user_id in user_to_home_loc:
                    self.user_id_to_location[user_id] = user_to_home_loc[user_id]
                # Otherwise try to infer the location
                else:
                    location = self.infer_location(user_id, mention_network, user_to_home_loc)
                    if not location is None:
                        self.user_id_to_location[user_id] = location
                        users_assigned += 1

                if users_seen % 100000 == 0:
                    LOGGER.debug(
                        (
                            ("Saw %d/%d users, knew location of %d, " + "inferred the location of %d (total: %d)")
                            % (
                                users_seen,
                                len(all_users),
                                len(self.user_id_to_location) - users_assigned,
                                users_assigned,
                                len(self.user_id_to_location),
                            )
                        )
                    )

        LOGGER.debug(
            (
                ("Ultimately saw %d/%d users, knew location of %d, " + "inferred the location of %d (total: %d)")
                % (
                    users_seen,
                    len(all_users),
                    len(self.user_id_to_location) - users_assigned,
                    users_assigned,
                    len(self.user_id_to_location),
                )
            )
        )

        # Short circuit early if the caller has specified that the model is not
        # to be saved into a directory
        if model_dir is None:
            return Wheres_Wally_Model(self.user_id_to_location)

        if not os.path.exists(model_dir):
            os.mkdir(model_dir)

        # Write the .tsv for human debugability too
        fh = gzip.open(os.path.join(model_dir, "user-to-lat-lon.tsv.gz"), "w")
        for user_id, loc in self.user_id_to_location.iteritems():
            fh.write("%s\t%s\t%s\n" % (user_id, loc[0], loc[1]))
        fh.close()

        return Wheres_Wally_Model(self.user_id_to_location)

    def infer_location(self, user_id, mention_network, user_to_home_loc):
        """
        Infers and returns the location of the provided users based on their
        features in the network
        """

        # Ensure that the model has been trained; otherwise, report an
        # empty classification
        if self.location_vector_scaler is None or self.location_classifier is None:
            return None

        # Convert the user's network-based features into a numeric vector
        X = scipy.sparse.lil_matrix((1, self.total_num_features), dtype=numpy.float64)
        self.fill_user_vector(user_id, mention_network, user_to_home_loc, X, 0)
        X = X.tocsr()

        # Rescale the vector according to the training data's scaling
        user_vector_scaled = self.location_vector_scaler.transform(X)

        # Classify the results
        location_id = self.location_classifier.predict(user_vector_scaled)[0]

        # Convert the index into a location
        return self.id_to_location[location_id]

    def fill_user_vector(self, user_id, mention_network, user_to_home_loc, csr_matrix, row_to_fill):
        """         
        Creates a vector for the user and fills their data into the
        specified row in the provided matrix
        """
        feat_dict = self.create_user_vector(user_id, mention_network, user_to_home_loc)
        nz = 0
        for col, val in feat_dict.iteritems():
            csr_matrix[row_to_fill, col] = val
            nz += 1
        return nz

    def create_user_vector(self, user_id, mention_network, user_to_home_loc):
        """
        Creates a vector to use with SciPy that represents this user's features
        """

        # The binned location features look at all the locations of this user's
        # neighbors and then provide a weight for each location according to how
        # many of the user's friends are in that location multiplied by how
        # large the city is, which is represented as one of five bins

        location_to_friends = defaultdict(list)
        location_to_followers = defaultdict(list)
        num_friends = mention_network.degree(user_id)

        # Record which friend appear in each city
        for neighbor_id in mention_network.neighbors_iter(user_id):
            if neighbor_id in user_to_home_loc:
                location_name = user_to_home_loc[neighbor_id]
                location_to_friends[location_name].append(neighbor_id)
                location_to_followers[location_name].append(neighbor_id)

        # Since the vector is expected to be very sparse, create it as a dict
        # for the indices with non-zero feature values.
        classifier_input_vector = {}
        num_non_zero_features = 0

        # Each city/location generates 7 unique features in the best performing
        # system
        for city, followers_in_city in location_to_followers.iteritems():
            n = len(followers_in_city)

            # Feature 1: the city's bin multiplied by the number of users in the
            # city
            city_bin = self.get_city_bin(n)
            pop_bin_feature_index = self.pop_bin_feature_indices[city]
            classifier_input_vector[pop_bin_feature_index] = city_bin

        for city, friends_in_city in location_to_friends.iteritems():
            n = len(friends_in_city)

            # Feature 2: the percentage of friends with reciprocal edges at that
            # location
            num_reciprocal_friends = 0
            for n1 in friends_in_city:
                if mention_network.has_edge(n1, user_id):
                    num_reciprocal_friends += 1
                    num_non_zero_features += 1
            reciprocal_feature_index = self.reciprocal_feature_indices[city]
            classifier_input_vector[reciprocal_feature_index] = num_reciprocal_friends / n
            if num_reciprocal_friends > 0:
                num_non_zero_features += 1

            # Features 3-7: the number of triads in the city
            triad_counter = collections.Counter()
            for n1 in friends_in_city:
                num_triads = 0
                for n2 in friends_in_city:
                    if mention_network.has_edge(n1, n2):
                        num_triads += 1

                # Decide which bin this user is in
                triad_counter[self.get_triad_bin(num_triads)] += 1

            for bin_num, count in triad_counter.iteritems():
                feat = "%s,%s:%s" % (city[0], city[1], bin_num)
                triad_bin_feature_index = self.triad_feature_indices[feat]
                classifier_input_vector[triad_bin_feature_index] = count / num_friends
                if count > 0:
                    num_non_zero_features += 1

        return classifier_input_vector

    def get_triad_bin(self, num_triads):
        """
        Returns which bin this count of the number of triads should be in
        """
        # Bins in the paper [0,5,10,20,40]
        if num_triads < 5:
            return 0
        elif num_triads < 10:
            return 1
        elif num_triads < 20:
            return 2
        elif num_triads < 40:
            return 3
        else:
            return 4

    def get_city_bin(self, city_size):
        """
        Returns which bin this count of the number of triads should be in
        """
        # Bins in the paper [1,2,4,12,57054]
        if city_size <= 1:
            return 0
        elif city_size <= 2:
            return 1
        elif city_size <= 4:
            return 2
        elif city_size <= 12:
            return 3
        # This sould be 57054, but we use any value larger than 12 to
        # avoid the edge case where a city has more than 57k users
        else:
            return 4

    def load_model(self, model_dir, settings):
        """
        Reads in the Where's Wally model from a gzipped .tsv
        """

        user_id_to_location = {}
        model_file = gzip.open(os.path.join(model_dir, "user-to-lat-lon.tsv.gz"), "r")
        for line in model_file:
            cols = line.split("\t")
            user_id = cols[0]
            lat = float(cols[1])
            lon = float(cols[2])
            user_id_to_location[user_id] = (lat, lon)

        model_file.close()
        return Wheres_Wally_Model(user_id_to_location)