Example #1
    def __init__(self, settings):
        """
        Initializing class variables.
        """
        # the mention network that will store inferred locations in node_data
        self.mention_network = MultiLocationMethod.dataset.bi_mention_network()
        self.nodes = set(self.mention_network.nodes())

        #u_n and u_star are the sets of users with unknown and known locations, respectively.
        self.u_n = set()
        self.u_star = set()

        #the set of all known venues
        self.venues = set()

        #counter over all locations and their co-occurrences with users.
        self.psi = Counter()

        #alpha and beta are the coefficients for eq.1 as per the paper
        self.alpha = -0.55
        self.beta = 0.0045
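        #NOTE (assumed form, not stated in this code): eq. 1 is plausibly a
        #power-law distance decay, roughly p(follow | distance d) = beta * d**alpha;
        #see the paper for the exact formulation.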

        #K is the total number of tweeting relationships
        self.K = 0

        #N_squared is the total number of user pairs
        self.N_squared = 0

        #S is the number of following relationships
        self.S = 0

        #geocoder is a forward/reverse geocoder mapping location -> lat/lon and lat/lon -> location.
        if 'location_source' in settings:
            self.geocoder = Geocoder(dataset=settings['location_source'])
        else:
            self.geocoder = Geocoder()

        #F_r is the random following model Bernoulli distribution parameter
        self.F_r = None

        #T_r is the random tweeting model Bernoulli distribution parameter
        self.T_r = Counter()

        #mu and nu are the model selectors according to a Bernoulli distribution
        self.mu = defaultdict(bool)
        self.nu = defaultdict(bool)

        #the multi-location list generated by the MLP
        self.user_multi_locations = defaultdict(list)

        #runs the model, populates all the variables and generates user_multi_locations
        self.run_model()
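
For context, here is a minimal, self-contained sketch of how per-user Bernoulli model selectors such as mu and nu above are typically sampled. The helper name draw_model_selectors and the parameters p_mu and p_nu are hypothetical, not part of the original class:

import random
from collections import defaultdict

def draw_model_selectors(user_ids, p_mu=0.5, p_nu=0.5, seed=0):
    """Sample boolean model selectors for each user (hypothetical parameters)."""
    rng = random.Random(seed)
    mu = defaultdict(bool)
    nu = defaultdict(bool)
    for uid in user_ids:
        # Independent Bernoulli draws per user; which concrete sub-model a
        # True value selects is defined by the paper, not assumed here.
        mu[uid] = rng.random() < p_mu
        nu[uid] = rng.random() < p_nu
    return mu, nu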
Example #2
    def train_model(self, settings, dataset, model_dir):

        # Initialize the geocoder, which we'll use to resolve location strings.
        # We use the default name-to-location mapping unless the user has
        # specified otherwise.
        if 'location_source' in settings:
            self.geocoder = Geocoder(dataset=settings['location_source'])
        else:
            self.geocoder = Geocoder()


        # NOTE: The original paper used the directional friends/followers
        # network.  However, the paper was tested on a much smaller network
        # (9.8M edges), which doesn't scale when including the full network.  We
        # opt for using the bi-directional networks as these (1) provide a
        # stronger signal of social relationships and (2) significantly reduce
        # the memory requirement.
        LOGGER.debug('Loading mention network')        
        mention_network = dataset.bi_mention_network()

        # This dict will contain a mapping from user ID to an associated home
        # location, which is derived either from the location field (as in the
        # original paper), from GPS-tagged tweets, or from both
        user_to_home_loc = {}
        
        # For each of the users that we have in the network, see if we can
        # associate that user with a home location.
        all_users = set(mention_network.nodes_iter())
        
        LOGGER.debug('Calculating users with recognizable home location')
        num_users_processed = 0

        # Keep track of how many times each location occurred.  We'll filter
        # this down to only the most common locations
        location_counts = collections.Counter() 

        for user_id, home_loc in dataset.user_home_location_iter():
            
            if user_id not in all_users:
                continue
            
            # home_loc is a (lat,lon) tuple.  While this is accurate, we want to
            # coarsen the location data to decrease sparsity (i.e., more people
            # located at the same city-level location, despite slightly
            # different underlying lat/lon values).  Here, we use the Geocoder
            # to map the lat/lon to a name and then back to a canonical
            # lat/lon for that name.
            canonical_lat_lon = self.geocoder.canonicalize(home_loc[0], home_loc[1])
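            # e.g., two nearby points such as (40.7484, -73.9857) and
            # (40.7505, -73.9934) (illustrative values) would both map to the
            # one canonical lat/lon registered for "New York, NY"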

            location_counts[canonical_lat_lon] += 1

            user_to_home_loc[user_id] = canonical_lat_lon
            num_users_processed += 1
            if num_users_processed % 500000 == 0:
                LOGGER.debug('Processed %s of the %s users, associated %s with a known location (%s)'
                             % (num_users_processed, len(all_users), len(user_to_home_loc),
                                len(user_to_home_loc) / float(num_users_processed)))

        # Iterate through the locations pruning out those that do not occur more
        # than some threshold number of times
        num_locs_removed = 0
        for lat_lon, count in location_counts.iteritems():
            if count >= 20:
                self.unique_locations.add(lat_lon)
            else:
                num_locs_removed += 1
        LOGGER.debug('Saw %d locations, %d with at least 20 users, %d to be pruned'
                     % (len(location_counts), len(self.unique_locations), num_locs_removed))


        # Remove the home locations of users whose locations aren't in the
        # pruned list of minimum-frequency locations
        num_user_home_locs_removed = 0
        for user_id, loc in user_to_home_loc.items():
            if loc not in self.unique_locations:
                del user_to_home_loc[user_id]
                num_user_home_locs_removed += 1
        LOGGER.debug('After pruning removed home locations of %d users, %d still have homes'
                     % (num_user_home_locs_removed, len(user_to_home_loc)))
                

        # Create a bi-directional mapping from locations to unique
        # numeric identifiers.  This mapping will be used when
        # representing locations in the classifier feature space and
        # when converting classifier output to specific locations
        location_to_id = {}
        for loc in self.unique_locations:
            id_ = len(location_to_id)
            location_to_id[loc] = id_
            self.id_to_location[id_] = loc

        # Associate each location with its set of features
        n = len(self.unique_locations)

        # Each location has 7 features associated with it for classifying a
        # user's location.  The seven features per location are arranged next to
        # each other in the feature space.
        feature_offset = 0
        for loc in self.unique_locations:
            # Feat1: its population bin (size approx.)
            self.pop_bin_feature_indices[loc] = feature_offset
            # Feat2: the number of reciprocal friends
            self.reciprocal_feature_indices[loc] = feature_offset + 1
            # Feat3-7: the bins indicating how many friends were in reciprocal
            # triads in that city
            for bin_num in range(0, 5):
                feat = "%s,%s:%s" % (loc[0], loc[1], bin_num)
                self.triad_feature_indices[feat] = feature_offset + bin_num + 2
            # Increment the feature offset so the next city's features don't
            # collide with this city's indices 
            feature_offset += 7
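        # Illustration of the resulting layout: for the first two locations A
        # and B in iteration order,
        #   A -> pop_bin=0, reciprocal=1, triad bins=2-6
        #   B -> pop_bin=7, reciprocal=8, triad bins=9-13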
        
        # Set the total number of features seen 
        self.total_num_features = feature_offset
        LOGGER.debug('Saw %d unique locations, %d total features'
                     % (len(self.unique_locations), feature_offset))

        LOGGER.debug('Associated %s of the %s users with a known location (%s unique)'
                     % (len(user_to_home_loc), len(all_users), len(self.unique_locations)))

        # The list of locations for each corresponding user in X
        B = []
        
        # Train the classifier based on users with known home locations
        LOGGER.debug("Generating feature vectors for training")
        X = scipy.sparse.lil_matrix((len(user_to_home_loc), 
                                     self.total_num_features), dtype=numpy.float64)
        row = 0
        total_nz = 0
        for user_id, location in user_to_home_loc.iteritems():

            # Skip users whose locations were omitted due to frequency filtering
            # or who have home locations but are not in the mention network
            #if not location in self.unique_locations or not user_id in all_users:
            #    continue

            # Fill the row in the matrix corresponding to this user's features
            nz = self.fill_user_vector(user_id, mention_network,
                                       user_to_home_loc, X, row)
            total_nz += nz
            
            # Get the index of this user's location
            location_id = location_to_id[location]
            B.append(location_id)
            row += 1
        X = X.tocsr()
        #X = X.toarray()

        LOGGER.debug("Generated training data for %d users, %d nz features, %f on average"
                     % (row, total_nz, float(total_nz) / row))
        

        # Convert the location list into a numpy array for use with scikit
        Y = numpy.asarray(B)

        # Initialize the assignment counters up front so the summary logging
        # after this block is valid even when classification is aborted
        users_assigned = 0
        users_seen = 0

        if len(X.nonzero()[0]) == 0:
            LOGGER.warning("Too little training data seen and no user had non-zero feature "+
                           "values.  Cowardly aborting classification")
        else:
            # Use SVM classifier with a linear kernel.
            #
            # NOTE NOTE NOTE NOTE
            #
            # The original paper uses an RBF kernel with their SVM.  However,
            # this proved impossibly slow during testing, so a linear kernel was
            # used instead.  
            #
            # NOTE NOTE NOTE NOTE
            #
            # slow: self.location_classifier = svm.SVC(kernel='rbf')
            #self.location_classifier = svm.LinearSVC(dual=False)
            #self.location_classifier = svm.NuSVC(kernel='rbf', verbose=True, max_iter=1000)
            #self.location_classifier = naive_bayes.BernoulliNB()
            self.location_classifier = svm.LinearSVC(dual=False, loss='l2', penalty="l2",
                                                     tol=1e-2)

            # Note: we expect the vector representations to be sparse, so avoid mean
            # scaling since it would create dense vectors, which would blow up the
            # memory consumption of the model
            self.location_vector_scaler = preprocessing.StandardScaler(with_mean=False)
            
            # Learn the scaling parameters and then rescale the input            
            LOGGER.debug("Scaling feature vectors for training")
            X_scaled = self.location_vector_scaler.fit_transform(X.astype(numpy.float64))

            LOGGER.debug("Training classifier")
            self.location_classifier.fit(X_scaled, Y)
            LOGGER.debug("Finished training classifier")

            # Assign all the users some location, if we can figure it out
            for user_id in all_users:
                users_seen += 1
                # If we know where to place this user, assign it to their home location
                if user_id in user_to_home_loc:
                    self.user_id_to_location[user_id] = user_to_home_loc[user_id]
                # Otherwise try to infer the location
                else:
                    location = self.infer_location(user_id, mention_network,
                                                   user_to_home_loc)
                    if location is not None:
                        self.user_id_to_location[user_id] = location
                        users_assigned += 1

                if users_seen % 100000 == 0:
                    LOGGER.debug((("Saw %d/%d users, knew location of %d, " +
                                   "inferred the location of %d (total: %d)")
                                  % (users_seen, len(all_users),
                                     len(self.user_id_to_location) - users_assigned,
                                     users_assigned,
                                     len(self.user_id_to_location))))

        LOGGER.debug((("Ultimately saw %d/%d users, knew location of %d, " +
                       "inferred the location of %d (total: %d)")
                      % (users_seen, len(all_users),
                         len(self.user_id_to_location) - users_assigned,
                         users_assigned,
                         len(self.user_id_to_location))))
                        

        # Short circuit early if the caller has specified that the model is not
        # to be saved into a directory
        if model_dir is None:
            return Wheres_Wally_Model(self.user_id_to_location)

        if not os.path.exists(model_dir):
            os.mkdir(model_dir)         

        # Write the .tsv for human debuggability too
        fh = gzip.open(os.path.join(model_dir, 'user-to-lat-lon.tsv.gz'), 'w')
        for user_id, loc in self.user_id_to_location.iteritems():
            fh.write("%s\t%s\t%s\n" % (user_id, loc[0], loc[1]))
        fh.close()

        return Wheres_Wally_Model(self.user_id_to_location)
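
For reference, a minimal sketch of how the trained scaler/classifier pair might be applied to a single user at inference time. The helper name predict_home_location and the feature_vector argument are hypothetical; the attributes used (location_vector_scaler, location_classifier, id_to_location) are the ones trained above:

import numpy
import scipy.sparse

def predict_home_location(model, feature_vector):
    """Map one user's feature vector to a canonical (lat, lon)."""
    X = scipy.sparse.csr_matrix(feature_vector, dtype=numpy.float64)
    # Apply the same sparse-safe scaling used during training
    X_scaled = model.location_vector_scaler.transform(X)
    # The classifier predicts a numeric location id...
    location_id = model.location_classifier.predict(X_scaled)[0]
    # ...which maps back to a canonical lat/lon
    return model.id_to_location[location_id]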
Example #3
    def train_model(self, settings, dataset, model_dir=None):
        # settings in the form
        '''{ 'LCR_min_dist' : the cutoff distance to distinguish between local and non-local contacts (default = 40 km ~ 25 miles),
             'qntl_num' : the number of quantiles (default = 10),
             'min_geotag' : the minimum number of geotags that makes a user a target (default = 3),
             'min_samples_leaf' : the minimum number of samples required at a leaf of the regression tree, i.e. the regressor will not split a leaf with fewer samples (default = 1000)
           }'''
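        # e.g., settings = {'LCR_min_dist': 40, 'qntl_num': 10,
        #                   'min_geotag': 3, 'min_samples_leaf': 1000}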
        self.min_dist = settings.pop('LCR_min_dist', 40)
        self.m = settings.pop('qntl_num', 10)
        self.min_geotag = settings.pop('min_geotag', 3)
        min_samp_leaf = settings.pop('min_samples_leaf', 1000)
        LOGGER.debug('initializing regression tree')
        self.tree = DecisionTreeRegressor(
            min_samples_leaf=min_samp_leaf)  # the classifier
        LOGGER.debug('initializing geocoder')
        #LocRes = Geocoder()
        if 'location_source' in settings:
            LocRes = Geocoder(dataset=settings['location_source'])
        else:
            LocRes = Geocoder()

        LOGGER.debug('loading mention network')
        self.X = dataset.mention_network(bidirectional=True,
                                         directed=True,
                                         weighted=True)
        #print len(self.X)
        #counter = set_counter('has at least %d geotags'%self.min_geotag) ### counter
        # adding users
        self.user_to_home_loc = {
            user: loc
            for (user, loc) in dataset.user_home_location_iter()
        }
        user_loc_list = self.user_to_home_loc.items()
        random.shuffle(user_loc_list)
        #print len(user_loc_list)
        #Take a sample from user home locations to estimate stgrEdges and actEdges
        start = time.time()
        user_loc_list = user_loc_list[:50000]
        #print 'home loc time:'
        #print len(user_loc_list)
        #fstgr = open('stgr_edges.tsv', 'w')
        c = 0
        LOGGER.debug('sampling stranger edges and actual edges')
        for uid1, loc1 in user_loc_list:
            #if c % 100 == 0:
            #	print c
            c2 = 0
            for uid2, loc2 in user_loc_list:
                if c2 != c:
                    if self.X.has_edge(uid1, uid2):
                        self.actEdgesTuples.append((uid1, uid2))
                    distance = round(utils.distance(loc1, loc2), 1)
                    self.stgrEdges[distance] += 1
                c2 += 1
            c += 1
        #for distance in self.stgrEdges:
        #	fstgr.write(str(distance) + '\t' + str(self.stgrEdges[distance]) + '\n')
        #fstgr.close()
        #print len(self.actEdgesTuples)
        LOGGER.debug('filling network')
        for _id, loc in dataset.user_home_location_iter():
            #_id = user['user_id']
            #loc = UserProfilingMethod.dataset.user_home_location_iter()
            #loc, pd = utils.get_post_data(user['posts'])
            #l_a = utils.is_geocoded(user, self.min_geotag)
            #counter.update(loc) ### counter
            #if not self.X.__contains__(_id):
            #self.X.add_node(_id)
            if loc[0] == 0 and loc[1] == 0:
                continue
            else:
                try:
                    self.X.add_node(_id)
                except:
                    pass
            l_a = loc
            #if not l_a:	continue
            self.add_user_data(_id, l_a, {})
            le = utils.location_error(l_a, loc, LocRes)
            self.set_loc_err(_id, le)

            # remove mentions of itself
            if self.X.has_edge(_id, _id):
                self.X.rm_edge(_id, _id)

        LOGGER.debug('%d users' % len(self.X))
        LOGGER.debug('%d edges' % self.X.size())

        self.set_d_a_for_all()

        tempx = []
        tempy = []
        for u, x in self.iter_contacts():
            tempx.append(self.get_contact_vector(u, x))
            tempy.append(self.get_d_a(u, x))
        X = np.array(tempx)
        Y = np.array(tempy)
        #X = np.array([self.get_contact_vector(u,x)
        #				for u, x in self.iter_contacts()])
        #Y = np.array([self.get_d_a(u,x)
        #				for u, x in self.iter_contacts()])

        LOGGER.debug('number of relationships: %d' % len(X))

        LOGGER.debug("fitting")
        start = timeit.default_timer()
        #try:
        self.fit(X, Y)
        #except:
        #	raise RuntimeError, 'No connections to train on.'

        LOGGER.debug('done fitting tree - %f sec' %
                     (timeit.default_timer() - start))

        start = timeit.default_timer()
        self.quantile_boundaries(X)
        LOGGER.debug('done setting quantile boundaries - %f sec' %
                     (timeit.default_timer() - start))

        start = timeit.default_timer()
        self.fit_curves(Y)
        LOGGER.debug('done fitting curves - %f sec' %
                     (timeit.default_timer() - start))

        #self.model.allActEdges = self.allActEdges
        #self.model.stgrEdges = self.stgrEdges

        self.user_to_loc = self.infer_locs()
        if model_dir is not None:
            LOGGER.debug('saving model')
            filename = os.path.join(model_dir, "user-to-lat-lon.tsv.gz")
            fh = gzip.open(filename, 'w')
            for user_id, loc in self.user_to_loc.iteritems():
                if loc is not None:
                    fh.write("%s\t%s\t%s\n" % (user_id, loc[0], loc[1]))
            fh.close()

        self.model = FriendlyLocation_Model(self.user_to_loc)
        return self.model
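
For context, a minimal sketch of what the quantile_boundaries step might compute with self.m = 10 quantiles. Operating on a single column of X and the helper name below are assumptions, not taken from the original class:

import numpy as np

def quantile_boundaries(values, m=10):
    """Return the m+1 boundary values that split `values` into m quantiles."""
    probs = np.linspace(0.0, 100.0, m + 1)
    return np.percentile(np.asarray(values, dtype=float), probs)

# e.g. boundaries = quantile_boundaries(X[:, 0], m=10)  # first feature column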