def create_folds(args):
    """Split the gold-standard users of a dataset into cross-validation folds.

    Every user that has at least one post carrying a "geo" entry is assigned
    to a fold in round-robin order.  For each fold ``fold_<i>`` the following
    files are written into ``fold_dir``:

    * ``fold_<i>.post-ids.txt``       -- one gold post id per line
    * ``fold_<i>.user-ids.txt``       -- one gold user id per line
    * ``fold_<i>.gold-locations.tsv`` -- post id, user id and the two
      coordinates of the post's ``geo`` entry
    * ``fold_<i>.users.json.gz``      -- the user object, restricted to its
      gold posts, one JSON document per line

    ``folds.info.tsv`` lists the per-fold file names.

    Args:
        args: list of command-line tokens: ``dataset_dir num_folds fold_dir``
            (plus the optional ``-f/--force`` value).

    Raises:
        ValueError: if ``num_folds`` is not an integer literal.
        Exception: if ``num_folds`` is smaller than two.
    """
    parser = argparse.ArgumentParser(
        prog='geoinf create_folds',
        description='creates a set of data partitions for evaluating with cross-fold validation')
    # NOTE(review): this option has no action='store_true', so it consumes a
    # value, and nothing below reads it.  Left untouched to keep the command
    # line backward-compatible.
    parser.add_argument('-f', '--force',
                        help='overwrite the output model directory if it already exists')
    parser.add_argument('dataset_dir',
                        help='a directory containing a geoinference dataset')
    parser.add_argument('num_folds',
                        help='the number of folds into which the dataset should be divided')
    parser.add_argument('fold_dir',
                        help='a (non-existent) directory that will contain the information on the cross-validation folds')
    args = parser.parse_args(args)

    # Validate the fold count before touching the filesystem so that invalid
    # input does not leave an empty output directory behind.
    num_folds = int(args.num_folds)
    if num_folds <= 1:
        # Call-style raise parses on both Python 2 and Python 3.
        raise Exception('The number of folds must be at least two')

    # An already existing output directory is reused as-is.
    if not os.path.exists(args.fold_dir):
        os.mkdir(args.fold_dir)

    # Rather than keeping things in memory, the gold posts are batched per
    # user and streamed straight to the per-fold output files.
    post_id_handles = []
    user_id_handles = []
    gold_loc_handles = []
    user_json_handles = []

    num_users = 0
    num_posts = 0
    num_gold_users = 0
    num_gold_posts = 0

    try:
        with open(os.path.join(args.fold_dir, "folds.info.tsv"), 'w') as cf_info_fh:
            for i in range(num_folds):
                fold_name = "fold_%d" % i
                prefix = os.path.join(args.fold_dir, fold_name)

                # IDs of the gold posts in this fold
                post_id_handles.append(open(prefix + ".post-ids.txt", 'w'))
                # IDs of the users with gold posts in this fold
                user_id_handles.append(open(prefix + ".user-ids.txt", 'w'))
                # lat/lon and IDs of the gold posts in this fold
                gold_loc_handles.append(open(prefix + ".gold-locations.tsv", 'w'))
                # users.json.gz file with the gold data (used for testing)
                user_json_handles.append(gzip.open(prefix + ".users.json.gz", 'w'))

                cf_info_fh.write(
                    "%s\t%s.post-ids.txt\t%s.user-ids.txt\t%s.users.json.gz\n"
                    % (fold_name, fold_name, fold_name, fold_name))

        ds = SparseDataset(args.dataset_dir)
        logger.debug('Extracting gold-standard posts')

        for user in ds.user_iter():
            user_id = user['user_id']
            posts = user['posts']
            num_posts += len(posts)

            # Posts with a "geo" entry are usable as gold standard.
            gold_posts = [post for post in posts if "geo" in post]
            gold_post_id_to_loc = dict(
                (post['id'], post['geo']['coordinates']) for post in gold_posts)

            if gold_posts:
                num_gold_posts += len(gold_posts)
                # Round-robin assignment of gold users to folds.
                fold_to_use = num_gold_users % num_folds
                num_gold_users += 1

                user_id_handles[fold_to_use].write("%s\n" % user_id)
                for post_id, loc in gold_post_id_to_loc.items():
                    post_id_handles[fold_to_use].write("%d\n" % post_id)
                    gold_loc_handles[fold_to_use].write(
                        "%d\t%s\t%f\t%f\n" % (post_id, user_id, loc[0], loc[1]))

                # Lazily mutate the existing user object and then dump that
                # object to the fold's users.json.gz
                user['posts'] = gold_posts
                user_json_handles[fold_to_use].write(
                    "%s\n" % simplejson.dumps(user))

            num_users += 1
            if num_users % 100000 == 0:
                # Guard the ratio: every user seen so far may have had no posts.
                gold_ratio = float(num_gold_posts) / num_posts if num_posts else 0.0
                logger.debug(
                    'Processed %d users, saw %d gold so far (%d posts of %d (%f))'
                    % (num_users, num_gold_users, num_gold_posts, num_posts,
                       gold_ratio))
    finally:
        # Close whatever was opened, even when the run fails half-way.
        for handles in (user_json_handles, post_id_handles,
                        user_id_handles, gold_loc_handles):
            for fh in handles:
                fh.close()

    logger.debug('Saw %d gold standard users in %d total'
                 % (num_gold_users, num_users))
def infer(args,by_user=False):
    """Run a trained inference method over a dataset and write the results.

    The first line of the output file is a JSON object recording the method
    name, the settings, the dataset and the ``by_user`` flag.  Every following
    line holds one located post as ``post id<TAB>loc[0]<TAB>loc[1]``; posts
    for which the model returns ``None`` are skipped.

    Args:
        args: list of command-line tokens:
            ``[-f] [-s SETTINGS] method_name model_dir dataset infer_file``.
        by_user: when true, posts are handed to the model grouped by user
            (``infer_posts_locations_by_user``); otherwise one post at a time
            (``infer_post_location``).

    Raises:
        Exception: if ``infer_file`` already exists and ``--force`` was not
            given.
    """
    prog_name = 'geoinf'
    if by_user:
        description = 'infer the location of posts in a dataset using a specific inference method. Posts will be provided to the method grouped by user.'
        prog_name += ' infer_by_user'
    else:
        description = 'infer the location of posts in a dataset using a specific inference method. Posts will be provided to the method one at a time.'
        prog_name += ' infer_by_post'

    parser = argparse.ArgumentParser(prog=prog_name, description=description)
    parser.add_argument('-f', '--force', action='store_true',
                        help='overwrite the output file if it already exists')
    parser.add_argument('-s', '--settings', nargs=1,
                        help='a json file of settings to be passed to the model')
    parser.add_argument('method_name',
                        help='the type of method to use for inference')
    parser.add_argument('model_dir',
                        help='the directory of a model that was constructed using the train procedure')
    parser.add_argument('dataset',
                        help='a json specification for the dataset to infer locations on')
    parser.add_argument('infer_file',
                        help='the file that the inferences will be written to')

    logger.debug('infer args = %s' % str(args))
    args = parser.parse_args(args)

    # load the infer settings if necessary
    settings = {}
    if args.settings:
        # nargs=1 makes argparse store a one-element list, so the path has to
        # be unwrapped before it can be opened.
        with open(args.settings[0], 'r') as fh:
            settings = json.load(fh)

    if os.path.exists(args.infer_file) and not args.force:
        # Call-style raise parses on both Python 2 and Python 3.
        raise Exception('output infer_file cannot exist')

    # load the method
    method = get_method_by_name(args.method_name)
    method_inst = method()
    model = method_inst.load_model(args.model_dir, settings)

    # load the dataset
    ds = SparseDataset(args.dataset)

    # get the output file ready; the try/finally guarantees it is closed even
    # when the model or the dataset iterator raises
    outfh = open(args.infer_file, 'w')
    try:
        # write settings to the first line
        outfh.write('%s\n' % json.dumps({'method': args.method_name,
                                         'settings': settings,
                                         'dataset': args.dataset,
                                         'by_user': by_user}))

        # locate all the posts
        logger.info('inferring locations for posts')
        num_posts_seen = 0
        num_posts_located = 0
        if by_user:
            num_users_seen = 0
            for user in ds.user_iter():
                user_id = user['user_id']
                posts = user['posts']

                locs = model.infer_posts_locations_by_user(user_id, posts)
                # The model must answer with exactly one entry per post.
                assert len(locs) == len(posts)
                num_users_seen += 1

                for loc, post in zip(locs, posts):
                    num_posts_seen += 1
                    if loc is not None:
                        num_posts_located += 1
                        outfh.write('%s\t%f\t%f\n' % (post['id'], loc[0], loc[1]))

                    if num_posts_seen % 10000 == 0:
                        logger.debug("Saw %d users, %d posts, %d of which were located"
                                     % (num_users_seen, num_posts_seen, num_posts_located))
        else:
            for post in ds.post_iter():
                loc = model.infer_post_location(post)
                num_posts_seen += 1
                if loc is not None:
                    outfh.write('%s\t%f\t%f\n' % (post['id'], loc[0], loc[1]))
                    num_posts_located += 1

                if num_posts_seen % 10000 == 0:
                    logger.debug("Saw %d posts, %d of which were located"
                                 % (num_posts_seen, num_posts_located))
    finally:
        outfh.close()
def infer(args, by_user=False):
    """Infer post locations with a trained method and write them to a file.

    The output file starts with one JSON line describing the run (method
    name, settings, dataset, ``by_user``).  Each subsequent line is
    ``post id<TAB>loc[0]<TAB>loc[1]`` for a post the model could locate;
    posts for which the model returns ``None`` produce no line.

    Args:
        args: list of command-line tokens:
            ``[-f] [-s SETTINGS] method_name model_dir dataset infer_file``.
        by_user: if true the model is called once per user through
            ``infer_posts_locations_by_user``; otherwise once per post through
            ``infer_post_location``.

    Raises:
        Exception: if ``infer_file`` already exists and ``--force`` was not
            passed.
    """
    prog_name = 'geoinf'
    if by_user:
        description = 'infer the location of posts in a dataset using a specific inference method. Posts will be provided to the method grouped by user.'
        prog_name += ' infer_by_user'
    else:
        description = 'infer the location of posts in a dataset using a specific inference method. Posts will be provided to the method one at a time.'
        prog_name += ' infer_by_post'

    parser = argparse.ArgumentParser(prog=prog_name, description=description)
    parser.add_argument(
        '-f', '--force', action='store_true',
        help='overwrite the output file if it already exists')
    parser.add_argument(
        '-s', '--settings', nargs=1,
        help='a json file of settings to be passed to the model')
    parser.add_argument(
        'method_name',
        help='the type of method to use for inference')
    parser.add_argument(
        'model_dir',
        help='the directory of a model that was constructed using the train procedure')
    parser.add_argument(
        'dataset',
        help='a json specification for the dataset to infer locations on')
    parser.add_argument(
        'infer_file',
        help='the file that the inferences will be written to')

    logger.debug('infer args = %s' % str(args))
    args = parser.parse_args(args)

    # load the infer settings if necessary
    settings = {}
    if args.settings:
        # Because of nargs=1 the parsed value is a one-element list; opening
        # the list itself would raise a TypeError.
        settings_path = args.settings[0]
        with open(settings_path, 'r') as fh:
            settings = json.load(fh)

    if os.path.exists(args.infer_file) and not args.force:
        # Call-style raise parses on both Python 2 and Python 3.
        raise Exception('output infer_file cannot exist')

    # load the method
    method = get_method_by_name(args.method_name)
    method_inst = method()
    model = method_inst.load_model(args.model_dir, settings)

    # load the dataset
    ds = SparseDataset(args.dataset)

    # get the output file ready; it is closed in the finally clause so a
    # failure during inference cannot leak the handle
    outfh = open(args.infer_file, 'w')
    try:
        # write settings to the first line
        header = {
            'method': args.method_name,
            'settings': settings,
            'dataset': args.dataset,
            'by_user': by_user,
        }
        outfh.write('%s\n' % json.dumps(header))

        # locate all the posts
        logger.info('inferring locations for posts')
        num_posts_seen = 0
        num_posts_located = 0
        if by_user:
            num_users_seen = 0
            for user in ds.user_iter():
                user_id = user['user_id']
                posts = user['posts']

                locs = model.infer_posts_locations_by_user(user_id, posts)
                # One answer per post is expected from the model.
                assert len(locs) == len(posts)
                num_users_seen += 1

                for loc, post in zip(locs, posts):
                    num_posts_seen += 1
                    if loc is not None:
                        num_posts_located += 1
                        outfh.write(
                            '%s\t%f\t%f\n' % (post['id'], loc[0], loc[1]))

                    if num_posts_seen % 10000 == 0:
                        logger.debug(
                            "Saw %d users, %d posts, %d of which were located"
                            % (num_users_seen, num_posts_seen,
                               num_posts_located))
        else:
            for post in ds.post_iter():
                loc = model.infer_post_location(post)
                num_posts_seen += 1
                if loc is not None:
                    outfh.write(
                        '%s\t%f\t%f\n' % (post['id'], loc[0], loc[1]))
                    num_posts_located += 1

                if num_posts_seen % 10000 == 0:
                    logger.debug("Saw %d posts, %d of which were located"
                                 % (num_posts_seen, num_posts_located))
    finally:
        outfh.close()
def create_folds(args):
    """Write cross-validation folds built from a dataset's gold-standard posts.

    A user counts as "gold" when at least one of its posts has a "geo" entry.
    Gold users are dealt to the folds in round-robin order and, per fold
    ``fold_<i>``, these files are produced inside ``fold_dir``:

    * ``fold_<i>.post-ids.txt``       -- gold post ids, one per line
    * ``fold_<i>.user-ids.txt``       -- gold user ids, one per line
    * ``fold_<i>.gold-locations.tsv`` -- post id, user id and the two
      coordinates taken from the post's ``geo`` entry
    * ``fold_<i>.users.json.gz``      -- one JSON document per gold user,
      containing only that user's gold posts

    An index of the per-fold file names is written to ``folds.info.tsv``.

    Args:
        args: list of command-line tokens: ``dataset_dir num_folds fold_dir``
            (plus the optional ``-f/--force`` value).

    Raises:
        ValueError: if ``num_folds`` cannot be converted to an integer.
        Exception: if ``num_folds`` is smaller than two.
    """
    parser = argparse.ArgumentParser(
        prog='geoinf create_folds',
        description=
        'creates a set of data partitions for evaluating with cross-fold validation'
    )
    # NOTE(review): without action='store_true' this option expects a value,
    # and the parsed value is never used below.  Kept unchanged so existing
    # command lines keep parsing the same way.
    parser.add_argument(
        '-f',
        '--force',
        help='overwrite the output model directory if it already exists')
    parser.add_argument(
        'dataset_dir',
        help='a directory containing a geoinference dataset')
    parser.add_argument(
        'num_folds',
        help='the number of folds into which the dataset should be divided')
    parser.add_argument(
        'fold_dir',
        help=
        'a (non-existent) directory that will contain the information on the cross-validation folds'
    )
    args = parser.parse_args(args)

    # Reject a bad fold count first, so nothing is created on disk for
    # invalid input.
    num_folds = int(args.num_folds)
    if num_folds <= 1:
        # Call-style raise parses on both Python 2 and Python 3.
        raise Exception('The number of folds must be at least two')

    # An output directory that already exists is reused.
    if not os.path.exists(args.fold_dir):
        os.mkdir(args.fold_dir)

    # Output streams, indexed by fold.  Gold posts are batched per user and
    # streamed out immediately instead of being held in memory.
    output_held_out_post_ids_file_handles = []
    output_held_out_user_ids_file_handles = []
    output_gold_loc_file_handles = []
    output_posts_file_handles = []

    num_users = 0
    num_posts = 0
    num_gold_users = 0
    num_gold_posts = 0

    try:
        info_path = os.path.join(args.fold_dir, "folds.info.tsv")
        with open(info_path, 'w') as cf_info_fh:
            for i in range(num_folds):
                fold_name = "fold_%d" % i

                # All the IDs of the gold posts in this fold
                output_held_out_post_ids_file_handles.append(open(
                    os.path.join(args.fold_dir, fold_name + ".post-ids.txt"),
                    'w'))

                # All the IDs of the users with gold posts
                output_held_out_user_ids_file_handles.append(open(
                    os.path.join(args.fold_dir, fold_name + ".user-ids.txt"),
                    'w'))

                # All the lat/lon and IDs of the gold posts
                output_gold_loc_file_handles.append(open(
                    os.path.join(args.fold_dir,
                                 fold_name + ".gold-locations.tsv"),
                    'w'))

                # The users.json.gz file with the gold data (used for testing)
                output_posts_file_handles.append(gzip.open(
                    os.path.join(args.fold_dir, fold_name + ".users.json.gz"),
                    'w'))

                cf_info_fh.write(
                    "%s\t%s.post-ids.txt\t%s.user-ids.txt\t%s.users.json.gz\n"
                    % (fold_name, fold_name, fold_name, fold_name))

        # Load the dataset
        ds = SparseDataset(args.dataset_dir)
        logger.debug('Extracting gold-standard posts')

        # Iterate over the dataset looking for posts with geo IDs that can be
        # used as a gold standard
        for user in ds.user_iter():
            user_id = user['user_id']
            num_posts += len(user['posts'])

            gold_posts = []
            gold_post_id_to_loc = {}
            for post in user['posts']:
                if "geo" in post:
                    gold_post_id_to_loc[post['id']] = post['geo']['coordinates']
                    gold_posts.append(post)

            # If this user had any gold locations, add them to a fold
            if gold_posts:
                num_gold_posts += len(gold_posts)
                fold_to_use = num_gold_users % num_folds
                num_gold_users += 1

                output_held_out_user_ids_file_handles[fold_to_use].write(
                    "%s\n" % user_id)

                post_ids_fh = output_held_out_post_ids_file_handles[fold_to_use]
                gold_loc_fh = output_gold_loc_file_handles[fold_to_use]
                for post_id, loc in gold_post_id_to_loc.items():
                    post_ids_fh.write("%d\n" % post_id)
                    gold_loc_fh.write(
                        "%d\t%s\t%f\t%f\n" % (post_id, user_id, loc[0], loc[1]))

                # Lazily mutate the existing user object and then dump that
                # object to the fold's users.json.gz
                user['posts'] = gold_posts
                output_posts_file_handles[fold_to_use].write(
                    "%s\n" % simplejson.dumps(user))

            num_users += 1
            if num_users % 100000 == 0:
                # No posts seen yet would otherwise divide by zero.
                if num_posts:
                    gold_ratio = float(num_gold_posts) / num_posts
                else:
                    gold_ratio = 0.0
                logger.debug(
                    'Processed %d users, saw %d gold so far (%d posts of %d (%f))'
                    % (num_users, num_gold_users, num_gold_posts, num_posts,
                       gold_ratio))
    finally:
        # Release every handle that was opened, including after a failure.
        all_handles = (output_posts_file_handles +
                       output_held_out_post_ids_file_handles +
                       output_held_out_user_ids_file_handles +
                       output_gold_loc_file_handles)
        for fh in all_handles:
            fh.close()

    logger.debug('Saw %d gold standard users in %d total' %
                 (num_gold_users, num_users))
def create_folds(args):
    """Create per-fold user-id files for a chosen kind of evaluation.

    Users are taken from the dataset in ``dataset_dir`` and matched against
    the ground-truth file ``filtered_user_groundtruth_locfield.tsv`` (user id,
    gender, urban level) found in the same directory.  Depending on
    ``test_case`` the matched users are grouped as follows before being split
    into folds:

    * ``"gender"`` -- three groups: gender ``"m"``, gender ``"f"`` and
      everything else, split into ``NUM_MALE_FOLDS``, ``NUM_FEMALE_FOLDS``
      and ``NUM_UNKNOWN_FOLDS`` folds respectively.
    * ``"county"`` -- one group per urban level 1..6, each split into
      ``NUM_URBAN_FOLDS`` folds.
    * anything else -- a single group with every user that has a
      ground-truth entry, split into ``NUM_RANDOM_FOLDS`` folds.

    The splitting itself is delegated to ``generate_folds`` and the writing
    of each fold to ``write_fold``; both are defined outside this function.
    ``folds.info.tsv`` in ``fold_dir`` lists the per-fold file names.

    Args:
        args: list of command-line tokens: ``dataset_dir fold_dir test_case``
            (plus the optional ``-f/--force`` value).
    """
    parser = argparse.ArgumentParser(
        prog='geoinf create_folds',
        description='creates a set of data partitions for evaluating with cross-fold validation')
    # NOTE(review): without action='store_true' this option expects a value,
    # and the parsed value is never used below.  Kept unchanged so existing
    # command lines keep parsing the same way.
    parser.add_argument('-f', '--force',
                        help='overwrite the output model directory if it already exists')
    parser.add_argument('dataset_dir',
                        help='a directory containing a geoinference dataset')
    parser.add_argument('fold_dir',
                        help='a (non-existent) directory that will contain the information on the cross-validation folds')
    parser.add_argument('test_case',
                        help="What type of test wanted to run i.e. rural vs urban (county), gender (gender), or random (any other string)")
    args = parser.parse_args(args)

    # An output directory that already exists is reused.
    if not os.path.exists(args.fold_dir):
        os.mkdir(args.fold_dir)

    ground_truth_file = "filtered_user_groundtruth_locfield.tsv"
    ground_truth_locs = "users.home-locations.loc-field.tsv.gz"
    urban_levels = list(range(1, 7))

    # Decide on the number of folds.  The count must be known before it can
    # be checked, so the sanity check follows the computation.
    if args.test_case == "gender":
        num_folds = NUM_MALE_FOLDS + NUM_FEMALE_FOLDS + NUM_UNKNOWN_FOLDS
    elif args.test_case == "county":
        num_folds = NUM_URBAN_FOLDS * len(urban_levels)
    else:
        num_folds = NUM_RANDOM_FOLDS
    if num_folds <= 1:
        # Deliberately only a warning: the original code had the raise
        # commented out in favour of a printed message.
        print("the number of folds must be at least two")

    # Ground truth: user id -> gender and user id -> urban level.
    idToGender = {}
    idToUrbanLevel = {}
    with open(os.path.join(args.dataset_dir, ground_truth_file), "r") as gt_file:
        next(gt_file, None)  # skip the first line (presumably a header)
        for line in gt_file:
            try:
                uid, gender, urbanLevel = line.split('\t')
                idToGender[uid] = gender
                # The last field still carries the line terminator; an entry
                # holding nothing but whitespace means "no urban level".
                if urbanLevel.strip():
                    idToUrbanLevel[uid] = int(urbanLevel)
            except ValueError:
                # Wrong number of fields or a non-numeric urban level: report
                # the offending line and carry on with the next one.
                print(line)

    # Ground truth: user id -> (lat, lon), handed to write_fold unchanged.
    # NOTE(review): lon keeps the trailing line terminator from the file;
    # confirm whether write_fold relies on that before stripping it.
    idToLoc = {}
    with gzip.open(os.path.join(args.dataset_dir, ground_truth_locs), "r") as gt_file:
        next(gt_file, None)  # skip the first line (presumably a header)
        for line in gt_file:
            uid, lat, lon = line.split('\t')
            idToLoc[uid] = (lat, lon)

    # One user-id output stream per fold.
    output_user_ids_file_handles = []
    try:
        with open(os.path.join(args.fold_dir, "folds.info.tsv"), 'w') as cf_info_fh:
            for i in range(num_folds):
                fold_name = "fold_%d" % i
                # All the IDs of the users in this fold are written here
                fold_users_ids_fh = open(
                    os.path.join(args.fold_dir, fold_name + ".user-ids.txt"), 'w')
                output_user_ids_file_handles.append(fold_users_ids_fh)
                # One row per fold; the newline keeps the rows apart.
                cf_info_fh.write("%s\t%s.user-ids.txt\n" % (fold_name, fold_name))

        # Load the dataset
        ds = SparseDataset(args.dataset_dir)

        # Build the list of (users, number of folds) groups for the test case.
        if args.test_case == "gender":
            male_users = []
            female_users = []
            unknown_users = []
            for user in ds.user_iter():
                user_id = user['user_id']
                user_gender = idToGender.get(str(user_id), -1)
                # Only users with a ground-truth entry take part.
                if user_gender == -1:
                    continue
                if user_gender == "m":
                    male_users.append(user_id)
                elif user_gender == "f":
                    female_users.append(user_id)
                else:
                    unknown_users.append(user_id)
            groups = [(male_users, NUM_MALE_FOLDS),
                      (female_users, NUM_FEMALE_FOLDS),
                      (unknown_users, NUM_UNKNOWN_FOLDS)]
        elif args.test_case == "county":
            usersAtLevel = dict((level, []) for level in urban_levels)
            for user in ds.user_iter():
                user_id = user['user_id']
                urbanRuralLevel = idToUrbanLevel.get(str(user_id), -1)
                # Skips users without an urban level as well as levels
                # outside the expected 1..6 range.
                if urbanRuralLevel in usersAtLevel:
                    usersAtLevel[urbanRuralLevel].append(user_id)
            groups = [(usersAtLevel[level], NUM_URBAN_FOLDS)
                      for level in urban_levels]
        else:
            # Every user that has a ground-truth entry goes into one pool.
            gold_users = [user['user_id'] for user in ds.user_iter()
                          if str(user['user_id']) in idToGender]
            groups = [(gold_users, NUM_RANDOM_FOLDS)]

        # Folds are numbered consecutively across all groups.
        currentFoldIndex = 0
        for group_users, group_fold_count in groups:
            for fold in generate_folds(group_users, group_fold_count):
                write_fold(fold, currentFoldIndex, idToLoc,
                           output_user_ids_file_handles)
                currentFoldIndex += 1
    finally:
        # Release every handle that was opened, including after a failure.
        for fh in output_user_ids_file_handles:
            fh.close()