def lookup_contacts(contact_uids, mdists, env):
    """ lookup user profiles for contacts or leafs """
    twit = twitter.TwitterResource()
    gis = gisgraphy.GisgraphyResource()
    gis.set_mdists(mdists)

    # FIXME: we need a better way to know which file we are on.
    # FIXME: use the new input_paths thing
    first, contact_uids = utils.peek(contact_uids)
    group = User.mod_id(first)
    logging.info('lookup old uids for %s', group)

    # skip uids we already have, either from a saved set or from the database
    save_name = 'saved_users.%s' % group
    if env.name_exists(save_name):
        stored = set(env.load(save_name))
    else:
        stored = User.mod_id_set(int(group))
    logging.info('loaded mod_group %s of %d users', group, len(stored))
    missing = (uid for uid in contact_uids if uid not in stored)

    # fetch the remaining profiles in batches of 100, geocode their free-text
    # locations, and save them
    chunks = utils.grouper(100, missing, dontfill=True)
    for chunk in chunks:
        users = twit.user_lookup(user_ids=list(chunk))
        for amigo in filter(None, users):
            assert User.mod_id(amigo._id) == group
            amigo.geonames_place = gis.twitter_loc(amigo.location)
            amigo.merge()
        yield len(users)
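
# A minimal sketch (an assumption, not the project's actual utils module) of
# the two helpers lookup_contacts relies on: utils.peek is assumed to return
# the first item plus an iterator that still yields every item, and
# utils.grouper is assumed to yield chunks of at most `size` items, leaving
# the last chunk short when dontfill=True.
import itertools

def _peek_sketch(iterable):
    # return the first element and an equivalent iterator that still includes it
    it = iter(iterable)
    first = next(it)
    return first, itertools.chain([first], it)

def _grouper_sketch(size, iterable, dontfill=True):
    # yield lists of up to `size` items; the final list may be shorter
    it = iter(iterable)
    while True:
        chunk = list(itertools.islice(it, size))
        if not chunk:
            return
        yield chunk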

def trash_extra_mloc(mloc_uids):
    "remove the mloc_users that mloc_uids skipped over"
    # This scares me a bit, but it's too late to go back and fix find_contacts.
    # I really wish I had limited find_contacts to stop after 2500 good users.
    db = User.database
    mloc_uids = set(mloc_uids)

    # every uid passed in should belong to the same mod-100 group
    group_ = set(uid % 100 for uid in mloc_uids)
    assert len(group_) == 1
    group = next(iter(group_))

    # anything stored for this group that mloc_uids skipped gets deleted
    stored = User.mod_id_set(group)
    trash = list(stored - mloc_uids)
    logging.info("trashing %d users", len(trash))
    logging.debug("full list: %r", trash)
    db.Edges.remove({'_id': {'$in': trash}})
    db.Tweets.remove({'_id': {'$in': trash}})
    db.User.remove({'_id': {'$in': trash}})
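
# Hedged usage sketch: the assert in trash_extra_mloc requires every uid in a
# single call to fall in the same uid % 100 group, so a caller would bucket
# the kept uids by group first. The driver below is illustrative only and is
# not part of the original pipeline.
from collections import defaultdict

def _trash_by_group_sketch(kept_mloc_uids):
    # bucket the uids we are keeping by mod-100 group, then clean each group
    by_group = defaultdict(set)
    for uid in kept_mloc_uids:
        by_group[uid % 100].add(uid)
    for group in sorted(by_group):
        trash_extra_mloc(by_group[group])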