Example #1
    def __init__(self, settings):
        """
        Initializing class variables.
        """
        # the mention network that will store inferred locations in node_data
        self.mention_network = MultiLocationMethod.dataset.bi_mention_network()
        self.nodes = set(self.mention_network.nodes())

        #u_n and u_star are the sets of users with unknown and known locations, respectively.
        self.u_n = set()
        self.u_star = set()

        #the set of all known venues
        self.venues = set()

        #counter over all locations and their co-occurrences with users.
        self.psi = Counter()

        #alpha and beta are the coefficients for eq.1 as per the paper
        self.alpha = -0.55
        self.beta = 0.0045
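        #NOTE (assumed form, not stated in this code): eq. 1 is plausibly a
        #power-law distance decay, roughly p(follow | distance d) = beta * d**alpha;
        #see the paper for the exact formulation.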

        #K is the total number of tweeting relationships
        self.K = 0

        #N_squared is the total number of user pairs
        self.N_squared = 0

        #S is the number of following relationships
        self.S = 0

        #geocoder is a forward/reverse geocoder mapping location -> lat/lon and lat/lon -> location.
        if 'location_source' in settings:
            self.geocoder = Geocoder(dataset=settings['location_source'])
        else:
            self.geocoder = Geocoder()

        #F_r is the random following model Bernoulli distribution parameter
        self.F_r = None

        #T_r is the random tweeting model Bernoulli distribution parameter
        self.T_r = Counter()

        #mu and nu are the model selectors according to a Bernoulli distribution
        self.mu = defaultdict(bool)
        self.nu = defaultdict(bool)

        #the multi-location list generated by the MLP
        self.user_multi_locations = defaultdict(list)

        #runs the model, populates all the variables and generates user_multi_locations
        self.run_model()
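
For context, here is a minimal, self-contained sketch of how per-user Bernoulli model selectors such as mu and nu above are typically sampled. The helper name draw_model_selectors and the parameters p_mu and p_nu are hypothetical, not part of the original class:

import random
from collections import defaultdict

def draw_model_selectors(user_ids, p_mu=0.5, p_nu=0.5, seed=0):
    """Sample boolean model selectors for each user (hypothetical parameters)."""
    rng = random.Random(seed)
    mu = defaultdict(bool)
    nu = defaultdict(bool)
    for uid in user_ids:
        # Independent Bernoulli draws per user; which concrete sub-model a
        # True value selects is defined by the paper, not assumed here.
        mu[uid] = rng.random() < p_mu
        nu[uid] = rng.random() < p_nu
    return mu, nu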
Example #2
    def train_model(self, settings, dataset, model_dir):

        # Initialize the geocoder, which we'll use to resolve location strings.
        # We use the default name-to-location mapping unless the user has
        # specified otherwise.
        if 'location_source' in settings:
            self.geocoder = Geocoder(dataset=settings['location_source'])
        else:
            self.geocoder = Geocoder()


        # NOTE: The original paper used the directional friends/followers
        # network.  However, the paper was tested on a much smaller network
        # (9.8M edges), which doesn't scale when including the full network.  We
        # opt for using the bi-directional networks as these (1) provide a
        # stronger signal of social relationships and (2) significantly reduce
        # the memory requirement.
        LOGGER.debug('Loading mention network')        
        mention_network = dataset.bi_mention_network()

        # This dict will contain a mapping from user ID to an associated home
        # location, which is derived either from the location field (as in the
        # original paper), from GPS-tagged tweets, or from both
        user_to_home_loc = {}
        
        # For each of the users that we have in the network, see if we can
        # associate that user with a home location.
        all_users = set(mention_network.nodes_iter())
        
        LOGGER.debug('Calculating users with recognizable home location')
        num_users_processed = 0

        # Keep track of how many times each location occurred.  We'll filter
        # this down to only the most common locations
        location_counts = collections.Counter() 

        for user_id, home_loc in dataset.user_home_location_iter():
            
            if user_id not in all_users:
                continue
            
            # home_loc is a (lat,lon) tuple.  While this is accurate, we want to
            # coarsen the location data to decrease sparsity (i.e., more people
            # located at the same city-level location, despite slightly
            # different underlying lat/lon values).  Here, we use the Geocoder
            # to map the lat/lon to a name and then back to a canonical
            # lat/lon for that name.
            canonical_lat_lon = self.geocoder.canonicalize(home_loc[0], home_loc[1])
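            # e.g., two nearby points such as (40.7484, -73.9857) and
            # (40.7505, -73.9934) (illustrative values) would both map to the
            # one canonical lat/lon registered for "New York, NY"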

            location_counts[canonical_lat_lon] += 1

            user_to_home_loc[user_id] = canonical_lat_lon
            num_users_processed += 1
            if num_users_processed % 500000 == 0:
                LOGGER.debug('Processed %s of the %s users, associated %s with a known location (%s)'
                             % (num_users_processed, len(all_users), len(user_to_home_loc),
                                len(user_to_home_loc) / float(num_users_processed)))

        # Iterate through the locations pruning out those that do not occur more
        # than some threshold number of times
        num_locs_removed = 0
        for lat_lon, count in location_counts.iteritems():
            if count >= 20:
                self.unique_locations.add(lat_lon)
            else:
                num_locs_removed += 1
        LOGGER.debug('Saw %d locations, %d with at least 20 users, %d to be pruned'
                     % (len(location_counts), len(self.unique_locations), num_locs_removed))


        # Remove the home locations of users whose locations aren't in the
        # pruned list of minimum-frequency locations
        num_user_home_locs_removed = 0
        for user_id, loc in user_to_home_loc.items():
            if loc not in self.unique_locations:
                del user_to_home_loc[user_id]
                num_user_home_locs_removed += 1
        LOGGER.debug('After pruning removed home locations of %d users, %d still have homes'
                     % (num_user_home_locs_removed, len(user_to_home_loc)))
                

        # Create a bi-directional mapping from locations to unique
        # numeric identifiers.  This mapping will be used when
        # representing locations in the classifier feature space and
        # when converting classifier output to specific locations
        location_to_id = {}
        for loc in self.unique_locations:
            id_ = len(location_to_id)
            location_to_id[loc] = id_
            self.id_to_location[id_] = loc

        # Associate each location with its set of features
        n = len(self.unique_locations)

        # Each location has 7 features associated with it for classifying a
        # user's location.  The seven features per location are arranged next to
        # each other in the feature space.
        feature_offset = 0
        for loc in self.unique_locations:
            # Feat1: its population bin (size approx.)
            self.pop_bin_feature_indices[loc] = feature_offset
            # Feat2: the number of reciprocal friends
            self.reciprocal_feature_indices[loc] = feature_offset + 1
            # Feat3-7: the bins indicating how many friends were in reciprocal
            # triads in that city
            for bin_num in range(0, 5):
                feat = "%s,%s:%s" % (loc[0], loc[1], bin_num)
                self.triad_feature_indices[feat] = feature_offset + bin_num + 2
            # Increment the feature offset so the next city's features don't
            # collide with this city's indices 
            feature_offset += 7
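        # Illustration of the resulting layout: for the first two locations A
        # and B in iteration order,
        #   A -> pop_bin=0, reciprocal=1, triad bins=2-6
        #   B -> pop_bin=7, reciprocal=8, triad bins=9-13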
        
        # Set the total number of features seen 
        self.total_num_features = feature_offset
        LOGGER.debug('Saw %d unique locations, %d total features'
                     % (len(self.unique_locations), feature_offset))

        LOGGER.debug('Associated %s of the %s users with a known location (%s unique)'
                     % (len(user_to_home_loc), len(all_users), len(self.unique_locations)))

        # The list of locations for each corresponding user in X
        B = []
        
        # Train the classifier based on users with known home locations
        LOGGER.debug("Generating feature vectors for training")
        X = scipy.sparse.lil_matrix((len(user_to_home_loc), 
                                     self.total_num_features), dtype=numpy.float64)
        row = 0
        total_nz = 0
        for user_id, location in user_to_home_loc.iteritems():

            # Skip users whose locations were omitted due to frequency filtering
            # or who have home locations but are not in the mention network
            #if not location in self.unique_locations or not user_id in all_users:
            #    continue

            # Fill the row in the matrix corresponding to this user's features
            nz = self.fill_user_vector(user_id, mention_network,
                                       user_to_home_loc, X, row)
            total_nz += nz
            
            # Get the index of this user's location
            location_id = location_to_id[location]
            B.append(location_id)
            row += 1
        X = X.tocsr()
        #X = X.toarray()

        LOGGER.debug("Generated training data for %d users, %d nz features, %f on average"
                     % (row, total_nz, float(total_nz) / row))
        

        # Convert the location list into a numpy array for use with scikit
        Y = numpy.asarray(B)

        # Initialize the assignment counters up front so the summary logging
        # after this block is valid even when classification is aborted
        users_assigned = 0
        users_seen = 0

        if len(X.nonzero()[0]) == 0:
            LOGGER.warning("Too little training data seen and no user had non-zero feature "+
                           "values.  Cowardly aborting classification")
        else:
            # Use SVM classifier with a linear kernel.
            #
            # NOTE NOTE NOTE NOTE
            #
            # The original paper uses an RBF kernel with their SVM.  However,
            # this proved impossibly slow during testing, so a linear kernel was
            # used instead.  
            #
            # NOTE NOTE NOTE NOTE
            #
            # slow: self.location_classifier = svm.SVC(kernel='rbf')
            #self.location_classifier = svm.LinearSVC(dual=False)
            #self.location_classifier = svm.NuSVC(kernel='rbf', verbose=True, max_iter=1000)
            #self.location_classifier = naive_bayes.BernoulliNB()
            self.location_classifier = svm.LinearSVC(dual=False, loss='l2', penalty="l2",
                                                     tol=1e-2)

            # Note: we expect the vector representations to be sparse, so avoid mean
            # scaling since it would create dense vectors, which would blow up the
            # memory consumption of the model
            self.location_vector_scaler = preprocessing.StandardScaler(with_mean=False)
            
            # Learn the scaling parameters and then rescale the input            
            LOGGER.debug("Scaling feature vectors for training")
            X_scaled = self.location_vector_scaler.fit_transform(X.astype(numpy.float64))

            LOGGER.debug("Training classifier")
            self.location_classifier.fit(X_scaled, Y)
            LOGGER.debug("Finished training classifier")

            # Assign all the users some location, if we can figure it out
            for user_id in all_users:
                users_seen += 1
                # If we know where to place this user, assign it to their home location
                if user_id in user_to_home_loc:
                    self.user_id_to_location[user_id] = user_to_home_loc[user_id]
                # Otherwise try to infer the location
                else:
                    location = self.infer_location(user_id, mention_network,
                                                   user_to_home_loc)
                    if location is not None:
                        self.user_id_to_location[user_id] = location
                        users_assigned += 1

                if users_seen % 100000 == 0:
                    LOGGER.debug((("Saw %d/%d users, knew location of %d, " +
                                   "inferred the location of %d (total: %d)")
                                  % (users_seen, len(all_users),
                                     len(self.user_id_to_location) - users_assigned,
                                     users_assigned,
                                     len(self.user_id_to_location))))

        LOGGER.debug((("Ultimately saw %d/%d users, knew location of %d, " +
                       "inferred the location of %d (total: %d)")
                      % (users_seen, len(all_users),
                         len(self.user_id_to_location) - users_assigned,
                         users_assigned,
                         len(self.user_id_to_location))))
                        

        # Short circuit early if the caller has specified that the model is not
        # to be saved into a directory
        if model_dir is None:
            return Wheres_Wally_Model(self.user_id_to_location)

        if not os.path.exists(model_dir):
            os.mkdir(model_dir)         

        # Write the .tsv for human debuggability too
        fh = gzip.open(os.path.join(model_dir, 'user-to-lat-lon.tsv.gz'), 'w')
        for user_id, loc in self.user_id_to_location.iteritems():
            fh.write("%s\t%s\t%s\n" % (user_id, loc[0], loc[1]))
        fh.close()

        return Wheres_Wally_Model(self.user_id_to_location)
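
For reference, a minimal sketch of how the trained scaler/classifier pair might be applied to a single user at inference time. The helper name predict_home_location and the feature_vector argument are hypothetical; the attributes used (location_vector_scaler, location_classifier, id_to_location) are the ones trained above:

import numpy
import scipy.sparse

def predict_home_location(model, feature_vector):
    """Map one user's feature vector to a canonical (lat, lon)."""
    X = scipy.sparse.csr_matrix(feature_vector, dtype=numpy.float64)
    # Apply the same sparse-safe scaling used during training
    X_scaled = model.location_vector_scaler.transform(X)
    # The classifier predicts a numeric location id...
    location_id = model.location_classifier.predict(X_scaled)[0]
    # ...which maps back to a canonical lat/lon
    return model.id_to_location[location_id]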
Example #3
    def train_model(self, settings, dataset, model_dir=None):
        # settings in the form
        '''{ 'LCR_min_dist' : the cutoff distance to distinguish between local and non-local contacts (default = 40 km ~ 25 miles),
             'qntl_num' : the number of quantiles (default = 10),
             'min_geotag' : the minimum number of geotags that makes a user a target (default = 3),
             'min_samples_leaf' : the minimum number of samples required at a leaf of the regression tree, i.e. the regressor will not split a leaf with fewer samples (default = 1000)
           }'''
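        # e.g., settings = {'LCR_min_dist': 40, 'qntl_num': 10,
        #                   'min_geotag': 3, 'min_samples_leaf': 1000}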
        self.min_dist = settings.pop('LCR_min_dist', 40)
        self.m = settings.pop('qntl_num', 10)
        self.min_geotag = settings.pop('min_geotag', 3)
        min_samp_leaf = settings.pop('min_samples_leaf', 1000)
        LOGGER.debug('initializing regression tree')
        self.tree = DecisionTreeRegressor(
            min_samples_leaf=min_samp_leaf)  # the classifier
        LOGGER.debug('initializing geocoder')
        #LocRes = Geocoder()
        if 'location_source' in settings:
            LocRes = Geocoder(dataset=settings['location_source'])
        else:
            LocRes = Geocoder()

        LOGGER.debug('loading mention network')
        self.X = dataset.mention_network(bidirectional=True,
                                         directed=True,
                                         weighted=True)
        #print len(self.X)
        #counter = set_counter('has at least %d geotags'%self.min_geotag) ### counter
        # adding users
        self.user_to_home_loc = {
            user: loc
            for (user, loc) in dataset.user_home_location_iter()
        }
        user_loc_list = self.user_to_home_loc.items()
        random.shuffle(user_loc_list)
        #print len(user_loc_list)
        #Take a sample from user home locations to estimate stgrEdges and actEdges
        start = time.time()
        user_loc_list = user_loc_list[:50000]
        #print 'home loc time:'
        #print len(user_loc_list)
        #fstgr = open('stgr_edges.tsv', 'w')
        c = 0
        LOGGER.debug('sampling stranger edges and actual edges')
        for uid1, loc1 in user_loc_list:
            #if c % 100 == 0:
            #	print c
            c2 = 0
            for uid2, loc2 in user_loc_list:
                if c2 != c:
                    if self.X.has_edge(uid1, uid2):
                        self.actEdgesTuples.append((uid1, uid2))
                    distance = round(utils.distance(loc1, loc2), 1)
                    self.stgrEdges[distance] += 1
                c2 += 1
            c += 1
        #for distance in self.stgrEdges:
        #	fstgr.write(str(distance) + '\t' + str(self.stgrEdges[distance]) + '\n')
        #fstgr.close()
        #print len(self.actEdgesTuples)
        LOGGER.debug('filling network')
        for _id, loc in dataset.user_home_location_iter():
            #_id = user['user_id']
            #loc = UserProfilingMethod.dataset.user_home_location_iter()
            #loc, pd = utils.get_post_data(user['posts'])
            #l_a = utils.is_geocoded(user, self.min_geotag)
            #counter.update(loc) ### counter
            #if not self.X.__contains__(_id):
            #self.X.add_node(_id)
            if loc[0] == 0 and loc[1] == 0:
                continue
            else:
                try:
                    self.X.add_node(_id)
                except:
                    pass
            l_a = loc
            #if not l_a:	continue
            self.add_user_data(_id, l_a, {})
            le = utils.location_error(l_a, loc, LocRes)
            self.set_loc_err(_id, le)

            # remove mentions of itself
            if self.X.has_edge(_id, _id):
                self.X.rm_edge(_id, _id)

        LOGGER.debug('%d users' % len(self.X))
        LOGGER.debug('%d edges' % self.X.size())

        self.set_d_a_for_all()

        tempx = []
        tempy = []
        for u, x in self.iter_contacts():
            tempx.append(self.get_contact_vector(u, x))
            tempy.append(self.get_d_a(u, x))
        X = np.array(tempx)
        Y = np.array(tempy)
        #X = np.array([self.get_contact_vector(u,x)
        #				for u, x in self.iter_contacts()])
        #Y = np.array([self.get_d_a(u,x)
        #				for u, x in self.iter_contacts()])

        LOGGER.debug('number of relationships: %d' % len(X))

        LOGGER.debug("fitting")
        start = timeit.default_timer()
        #try:
        self.fit(X, Y)
        #except:
        #	raise RuntimeError, 'No connections to train on.'

        LOGGER.debug('done fitting tree - %f sec' %
                     (timeit.default_timer() - start))

        start = timeit.default_timer()
        self.quantile_boundaries(X)
        LOGGER.debug('done setting quantile boundaries - %f sec' %
                     (timeit.default_timer() - start))

        start = timeit.default_timer()
        self.fit_curves(Y)
        LOGGER.debug('done fitting curves - %f sec' %
                     (timeit.default_timer() - start))

        #self.model.allActEdges = self.allActEdges
        #self.model.stgrEdges = self.stgrEdges

        self.user_to_loc = self.infer_locs()
        if model_dir is not None:
            LOGGER.debug('saving model')
            filename = os.path.join(model_dir, "user-to-lat-lon.tsv.gz")
            fh = gzip.open(filename, 'w')
            for user_id, loc in self.user_to_loc.iteritems():
                if loc is not None:
                    fh.write("%s\t%s\t%s\n" % (user_id, loc[0], loc[1]))
            fh.close()

        self.model = FriendlyLocation_Model(self.user_to_loc)
        return self.model
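
For context, a minimal sketch of what the quantile_boundaries step might compute with self.m = 10 quantiles. Operating on a single column of X and the helper name below are assumptions, not taken from the original class:

import numpy as np

def quantile_boundaries(values, m=10):
    """Return the m+1 boundary values that split `values` into m quantiles."""
    probs = np.linspace(0.0, 100.0, m + 1)
    return np.percentile(np.asarray(values, dtype=float), probs)

# e.g. boundaries = quantile_boundaries(X[:, 0], m=10)  # first feature column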