def __init__(self, settings):
    """ Initialize class variables. """
    # The mention network that will store inferred locations in node_data
    self.mention_network = MultiLocationMethod.dataset.bi_mention_network()
    self.nodes = set(self.mention_network.nodes())

    # self.u_n and self.u_star are the sets of users with unknown and
    # known locations, respectively.
    self.u_n = set()
    self.u_star = set()

    # The set of all known venues
    self.venues = set()

    # Counter over all locations and their co-occurrences with a user
    self.psi = Counter()

    # alpha and beta are the coefficients for Eq. 1, as per the paper
    self.alpha = -0.55
    self.beta = 0.0045

    # K is the total number of tweeting relationships
    self.K = 0

    # N_squared is the total number of user pairs
    self.N_squared = 0

    # S is the number of following relationships
    self.S = 0

    # The geocoder maps location names to lat/lon and lat/lon back to
    # location names.
    if 'location_source' in settings:
        self.geocoder = Geocoder(dataset=settings['location_source'])
    else:
        self.geocoder = Geocoder()

    # F_r is the Bernoulli-distribution parameter of the random following model
    self.F_r = None

    # T_r is the Bernoulli-distribution parameter of the random tweeting model
    self.T_r = Counter()

    # mu and nu select between the models according to a Bernoulli distribution
    self.mu = defaultdict(bool)
    self.nu = defaultdict(bool)

    # The multi-location list generated by the MLP
    self.user_multi_locations = defaultdict(list)

    # Run the model: populate all of the variables above and generate
    # user_multi_locations.
    self.run_model()
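# A minimal sketch of how the alpha and beta coefficients above might be
# applied.  This assumes Eq. 1 of the paper takes the common power-law
# form p(d) = beta * d ** alpha for the probability of a relationship
# between two users at distance d; the exact functional form should be
# checked against the paper before relying on this helper, and the name
# eq1_probability is hypothetical.
def eq1_probability(self, distance_km):
    """Probability of a relationship at the given distance, under the
    power-law form assumed above."""
    # Clamp tiny distances so the power law doesn't diverge at d = 0.
    d = max(distance_km, 0.01)
    return self.beta * d ** self.alpha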
def train_model(self, settings, dataset, model_dir):
    # Initialize the geocoder, which we'll use to resolve location
    # strings.  We use the default name-to-location mapping unless the
    # user has specified otherwise.
    if 'location_source' in settings:
        self.geocoder = Geocoder(dataset=settings['location_source'])
    else:
        self.geocoder = Geocoder()

    # NOTE: The original paper used the directional friends/followers
    # network.  However, the paper was tested on a much smaller network
    # (9.8M edges), which doesn't scale when including the full network.
    # We opt for using the bi-directional networks as these (1) provide a
    # stronger signal of social relationships and (2) significantly
    # reduce the memory requirement.
    LOGGER.debug('Loading mention network')
    mention_network = dataset.bi_mention_network()

    # This dict will contain a mapping from user ID to an associated home
    # location, which is derived either from the location field (as in
    # the original paper), from GPS-tagged tweets, or from both.
    user_to_home_loc = {}

    # For each of the users that we have in the network, see if we can
    # associate that user with a home location.
    all_users = set(mention_network.nodes_iter())
    LOGGER.debug('Calculating users with recognizable home location')
    num_users_processed = 0

    # Keep track of how many times each location occurred.  We'll filter
    # this down to only the most common locations.
    location_counts = collections.Counter()
    for user_id, home_loc in dataset.user_home_location_iter():
        if not user_id in all_users:
            continue

        # home_loc is a (lat,lon) tuple.  While this is accurate, we want
        # to coarsen the location data to decrease sparsity (i.e., more
        # people located at the same city-level location, despite
        # slightly different underlying lat/lon values).  Here, we use
        # the Geocoder to map the lat/lon to a name and then back to a
        # canonical lat/lon for that name.
        canonical_lat_lon = self.geocoder.canonicalize(home_loc[0], home_loc[1])
        location_counts[canonical_lat_lon] += 1
        user_to_home_loc[user_id] = canonical_lat_lon

        num_users_processed += 1
        if num_users_processed % 500000 == 0:
            LOGGER.debug('Processed %s of the %s users, associated %s with a known location (%s)'
                         % (num_users_processed, len(all_users),
                            len(user_to_home_loc),
                            len(user_to_home_loc) / float(num_users_processed)))

    # Iterate through the locations, pruning out those that do not occur
    # at least a threshold number of times (20).
    num_locs_removed = 0
    for lat_lon, count in location_counts.iteritems():
        if count >= 20:
            self.unique_locations.add(lat_lon)
        else:
            num_locs_removed += 1
    LOGGER.debug('Saw %d locations, %d with at least 20 users, %d to be pruned'
                 % (len(location_counts), len(self.unique_locations),
                    num_locs_removed))

    # Remove the home locations of users whose locations aren't in the
    # pruned list of minimum-frequency locations.
    num_user_home_locs_removed = 0
    for user_id, loc in user_to_home_loc.items():
        if not loc in self.unique_locations:
            del user_to_home_loc[user_id]
            num_user_home_locs_removed += 1
    LOGGER.debug('After pruning, removed home locations of %d users; %d still have homes'
                 % (num_user_home_locs_removed, len(user_to_home_loc)))

    # Create a bi-directional mapping from locations to unique numeric
    # identifiers.
    # This mapping will be used when representing locations in the
    # classifier feature space and when converting classifier output to
    # specific locations.
    location_to_id = {}
    for loc in self.unique_locations:
        id_ = len(location_to_id)
        location_to_id[loc] = id_
        self.id_to_location[id_] = loc

    # Associate each location with its set of features.  Each location
    # has 7 features associated with it for classifying a user's
    # location.  The seven features per location are arranged next to
    # each other in the feature space.
    feature_offset = 0
    for loc in self.unique_locations:
        # Feature 1: the location's population bin (an approximation of
        # its size)
        self.pop_bin_feature_indices[loc] = feature_offset
        # Feature 2: the number of reciprocal friends
        self.reciprocal_feature_indices[loc] = feature_offset + 1
        # Features 3-7: the bins indicating how many friends were in
        # reciprocal triads in that city
        for bin_num in range(0, 5):
            feat = "%s,%s:%s" % (loc[0], loc[1], bin_num)
            self.triad_feature_indices[feat] = feature_offset + bin_num + 2
        # Increment the feature offset so the next city's features don't
        # collide with this city's indices.
        feature_offset += 7

    # Set the total number of features seen
    self.total_num_features = feature_offset
    LOGGER.debug('Saw %d unique locations, %d total features'
                 % (len(self.unique_locations), feature_offset))

    LOGGER.debug('Associated %s of the %s users with a known location (%s unique)'
                 % (len(user_to_home_loc), len(all_users),
                    len(self.unique_locations)))

    # The list of locations for each corresponding user in X
    B = []

    # Train the classifier based on users with known home locations.
    LOGGER.debug("Generating feature vectors for training")
    X = scipy.sparse.lil_matrix((len(user_to_home_loc),
                                 self.total_num_features),
                                dtype=numpy.float64)
    row = 0
    total_nz = 0
    for user_id, location in user_to_home_loc.iteritems():
        # Fill the row in the matrix corresponding to this user's
        # features.  (Users whose locations were omitted by the frequency
        # filtering were already dropped from user_to_home_loc above.)
        nz = self.fill_user_vector(user_id, mention_network,
                                   user_to_home_loc, X, row)
        total_nz += nz

        # Get the index of this user's location
        location_id = location_to_id[location]
        B.append(location_id)
        row += 1

    X = X.tocsr()
    LOGGER.debug("Generated training data for %d users, %d nz features, %f on average"
                 % (row, total_nz, float(total_nz) / row))

    # Convert the location list into a numpy array for use with scikit
    Y = numpy.asarray(B)

    if len(X.nonzero()[0]) == 0:
        LOGGER.warning("Too little training data seen and no user had "
                       "non-zero feature values.  Cowardly aborting "
                       "classification")
    else:
        # Use an SVM classifier with a linear kernel.
        #
        # NOTE: the original paper uses an RBF kernel with its SVM, but
        # that proved impossibly slow during testing, so a linear kernel
        # is used instead.
        # Alternatives tried and rejected:
        #   svm.SVC(kernel='rbf')  (too slow)
        #   svm.LinearSVC(dual=False)
        #   svm.NuSVC(kernel='rbf', verbose=True, max_iter=1000)
        #   naive_bayes.BernoulliNB()
        self.location_classifier = svm.LinearSVC(dual=False, loss='l2',
                                                 penalty="l2", tol=1e-2)

        # Note: we expect the vector representations to be sparse, so
        # avoid mean scaling, since it would create dense vectors, which
        # would blow up the memory consumption of the model.
        self.location_vector_scaler = preprocessing.StandardScaler(with_mean=False)

        # Learn the scaling parameters and then rescale the input.
        LOGGER.debug("Scaling feature vectors for training")
        X_scaled = self.location_vector_scaler.fit_transform(X.astype(numpy.float64))

        LOGGER.debug("Training classifier")
        self.location_classifier.fit(X_scaled, Y)
        LOGGER.debug("Finished training classifier")

    # Assign each user a location, if we can figure one out.
    users_assigned = 0
    users_seen = 0
    for user_id in all_users:
        users_seen += 1
        # If we know where to place this user, assign them their home
        # location.
        if user_id in user_to_home_loc:
            self.user_id_to_location[user_id] = user_to_home_loc[user_id]
        # Otherwise, try to infer the location.
        else:
            location = self.infer_location(user_id, mention_network,
                                           user_to_home_loc)
            if location is not None:
                self.user_id_to_location[user_id] = location
                users_assigned += 1

        if users_seen % 100000 == 0:
            LOGGER.debug("Saw %d/%d users, knew location of %d, inferred the location of %d (total: %d)"
                         % (users_seen, len(all_users),
                            len(self.user_id_to_location) - users_assigned,
                            users_assigned,
                            len(self.user_id_to_location)))

    LOGGER.debug("Ultimately saw %d/%d users, knew location of %d, inferred the location of %d (total: %d)"
                 % (users_seen, len(all_users),
                    len(self.user_id_to_location) - users_assigned,
                    users_assigned,
                    len(self.user_id_to_location)))

    # Short-circuit early if the caller has specified that the model is
    # not to be saved to a directory.
    if model_dir is None:
        return Wheres_Wally_Model(self.user_id_to_location)

    if not os.path.exists(model_dir):
        os.mkdir(model_dir)

    # Write the .tsv for human debuggability too.
    fh = gzip.open(os.path.join(model_dir, 'user-to-lat-lon.tsv.gz'), 'w')
    for user_id, loc in self.user_id_to_location.iteritems():
        fh.write("%s\t%s\t%s\n" % (user_id, loc[0], loc[1]))
    fh.close()

    return Wheres_Wally_Model(self.user_id_to_location)
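# For reference, a minimal sketch of the inference step invoked above.
# The real infer_location is defined elsewhere in this class; this
# sketch assumes it builds a single-row feature vector the same way the
# training loop does, rescales it with the fitted scaler, and maps the
# predicted class id back to a lat/lon.  The name infer_location_sketch
# and its body are illustrative, not the canonical implementation.
def infer_location_sketch(self, user_id, mention_network, user_to_home_loc):
    X = scipy.sparse.lil_matrix((1, self.total_num_features),
                                dtype=numpy.float64)
    nz = self.fill_user_vector(user_id, mention_network,
                               user_to_home_loc, X, 0)
    if nz == 0:
        # No non-zero features for this user, so nothing to predict from
        return None
    X_scaled = self.location_vector_scaler.transform(X.tocsr())
    location_id = self.location_classifier.predict(X_scaled)[0]
    return self.id_to_location[location_id]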
def train_model(self, settings, dataset, model_dir=None):
    '''Expected settings:
        'LCR_min_dist'     : the cutoff distance (km) that distinguishes
                             local from non-local contacts
                             (default = 40 km, ~25 miles)
        'qntl_num'         : the number of quantiles (default = 10)
        'min_geotag'       : the minimum number of geotags that makes a
                             user a target (default = 3)
        'min_samples_leaf' : the minimum number of samples a leaf of the
                             regression tree must hold; a split that
                             would leave fewer samples in a leaf is not
                             made (default = 1000)
    '''
    self.min_dist = settings.pop('LCR_min_dist', 40)
    self.m = settings.pop('qntl_num', 10)
    self.min_geotag = settings.pop('min_geotag', 3)
    min_samp_leaf = settings.pop('min_samples_leaf', 1000)

    # The regression tree used as the classifier
    LOGGER.debug('tree')
    self.tree = DecisionTreeRegressor(min_samples_leaf=min_samp_leaf)

    # The forward/reverse geocoder used to resolve locations
    LOGGER.debug('geocoder')
    if 'location_source' in settings:
        LocRes = Geocoder(dataset=settings['location_source'])
    else:
        LocRes = Geocoder()

    LOGGER.debug('loading mention network')
    self.X = dataset.mention_network(bidirectional=True, directed=True,
                                     weighted=True)

    # Adding users
    self.user_to_home_loc = {user: loc for (user, loc)
                             in dataset.user_home_location_iter()}

    # Take a random sample of user home locations to estimate the
    # stranger edges (stgrEdges) and the actual edges (actEdges).
    user_loc_list = self.user_to_home_loc.items()
    random.shuffle(user_loc_list)
    user_loc_list = user_loc_list[:50000]

    LOGGER.debug('sampling stranger edges and actual edges')
    c = 0
    for uid1, loc1 in user_loc_list:
        c2 = 0
        for uid2, loc2 in user_loc_list:
            # Skip pairing a user with itself
            if c2 != c:
                if self.X.has_edge(uid1, uid2):
                    self.actEdgesTuples.append((uid1, uid2))
                distance = round(utils.distance(loc1, loc2), 1)
                self.stgrEdges[distance] += 1
            c2 += 1
        c += 1

    LOGGER.debug('filling network')
    for _id, loc in dataset.user_home_location_iter():
        # Skip users with the (0, 0) placeholder location
        if loc[0] == 0 and loc[1] == 0:
            continue
        try:
            self.X.add_node(_id)
        except Exception:
            pass
        l_a = loc
        self.add_user_data(_id, l_a, {})
        le = utils.location_error(l_a, loc, LocRes)
        self.set_loc_err(_id, le)
        # Remove mentions of the user by itself
        if self.X.has_edge(_id, _id):
            self.X.rm_edge(_id, _id)

    LOGGER.debug(str(len(self.X)) + ' users')
    LOGGER.debug(str(self.X.size()) + ' edges')

    self.set_d_a_for_all()

    # Build the training data: one feature vector and one distance
    # target per contact pair.
    tempx = []
    tempy = []
    for u, x in self.iter_contacts():
        tempx.append(self.get_contact_vector(u, x))
        tempy.append(self.get_d_a(u, x))
    X = np.array(tempx)
    Y = np.array(tempy)

    LOGGER.debug('number of relationships ' + str(len(X)))

    LOGGER.debug("fitting")
    start = timeit.default_timer()
    if len(X) == 0:
        raise RuntimeError('No connections to train on.')
    self.fit(X, Y)
    LOGGER.debug('done fitting tree - ' +
                 str(timeit.default_timer() - start) + ' sec')

    start = timeit.default_timer()
    self.quantile_boundaries(X)
    LOGGER.debug('done setting quantile boundaries - ' +
                 str(timeit.default_timer() - start) + ' sec')

    start = timeit.default_timer()
    self.fit_curves(Y)
    LOGGER.debug('done fitting curves - ' +
                 str(timeit.default_timer() - start) + ' sec')

    self.user_to_loc = self.infer_locs()

    if model_dir is not None:
        LOGGER.debug('saving model')
        filename = os.path.join(model_dir, "user-to-lat-lon.tsv.gz")
        fh = gzip.open(filename, 'w')
        for user_id, loc in self.user_to_loc.iteritems():
            if loc is not None:
                fh.write("%s\t%s\t%s\n" % (user_id, loc[0], loc[1]))
        fh.close()

    self.model = FriendlyLocation_Model(self.user_to_loc)
    return self.model
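# A self-contained sketch of what quantile_boundaries above might
# compute: the m - 1 cut points that split the tree's predicted contact
# distances into m equally populated bins.  The real method is defined
# elsewhere in this class; this standalone function (and its name) is
# illustrative only, and it reuses the numpy-as-np import from this
# module.
def quantile_boundaries_sketch(tree, X, m=10):
    """Return the m-quantile boundaries of the tree's predictions on X."""
    preds = tree.predict(X)
    # Evenly spaced percentiles of the predictions yield the interior
    # bin edges.
    return [np.percentile(preds, 100.0 * i / m) for i in range(1, m)]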