def pre_processing(self):
    print 'preprocessing...'

    # Uniformization: concatenate the nonzero values of all three splits so
    # they are transformed with a single, consistent mapping.
    raw = numpy.concatenate([self.train_set.sparse_matrix.data,
                             self.valid_set.sparse_matrix.data,
                             self.test_set.sparse_matrix.data])
    len_train = len(self.train_set.sparse_matrix.data)
    len_valid = len(self.valid_set.sparse_matrix.data)
    len_test = len(self.test_set.sparse_matrix.data)
    out = data_processing.uniformization(raw, False)
    # Split the transformed values back into the original three segments.
    self.train_set.sparse_matrix.data = out[0:len_train]
    self.valid_set.sparse_matrix.data = out[len_train:(len_train + len_valid)]
    self.test_set.sparse_matrix.data = out[-len_test:]

    self.full_train = scipy.sparse.vstack(
        [self.train_set.sparse_matrix, self.valid_set.sparse_matrix], 'csr')

    # Shuffle the training set.
    self.full_train = self.full_train[
        numpy.random.permutation(self.full_train.shape[0]), :]

    # Feature subset selection.  Note that valid_set and test_set are
    # rebound here from dataset wrappers to raw sparse matrices.
    self.full_train = self.full_train[:, self.features_selected]
    self.valid_set = self.valid_set.sparse_matrix[:, self.features_selected]
    self.test_set = self.test_set.sparse_matrix[:, self.features_selected]

    # Scale all splits by the standard deviation of the training values.
    std = numpy.std(self.full_train.data)
    self.full_train /= std
    self.valid_set /= std
    self.test_set /= std

    # Finally, wrap the preprocessed matrices as datasets.
    self.trainset = SparseDataset(from_scipy_sparse_dataset=self.full_train)
    self.validset = SparseDataset(from_scipy_sparse_dataset=self.valid_set)
    self.testset = SparseDataset(from_scipy_sparse_dataset=self.test_set)
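# The data_processing.uniformization helper is not shown in this section.
# A minimal sketch of one plausible rank-based implementation, mapping each
# value to its empirical quantile in (0, 1]; the meaning of the second
# positional argument (here `center`) is an assumption.
import numpy

def uniformization(values, center=False):
    # Hypothetical stand-in: replace each value by its empirical quantile
    # so the output values are uniformly distributed on (0, 1].
    order = numpy.argsort(values, kind='mergesort')
    ranks = numpy.empty(len(values))
    ranks[order] = numpy.arange(1, len(values) + 1)
    out = ranks / float(len(values))
    if center:
        out -= 0.5  # optionally center around zero (assumed semantics)
    return out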
def torch_loader(dataset, data_path, batch_size, shuffle=True,
                 cuda_device=None, num_workers=1):
    (train_data, val_data), (train_labels, val_labels), label_names = \
        load_data_func(dataset, data_path)

    # Use pinned memory and worker processes only when a CUDA device is set.
    kwargs = {'num_workers': num_workers,
              'pin_memory': True} if cuda_device is not None else {}
    kwargs['drop_last'] = True

    if isinstance(train_data, numpy.ndarray):
        # Dense arrays wrap directly into tensor datasets.
        train_dataset = TensorDataset(torch.from_numpy(train_data),
                                      torch.from_numpy(train_labels))
        val_dataset = TensorDataset(torch.from_numpy(val_data),
                                    torch.from_numpy(val_labels))
    elif isinstance(train_data, scipy.sparse.csr_matrix):
        # Sparse text features: fit IDF weights on the training split only,
        # then reuse them for validation.
        from sklearn.feature_extraction.text import TfidfTransformer
        tfidf_trans = TfidfTransformer(norm=None)
        tfidf_trans.fit(train_data)
        train_dataset = SparseDataset(train_data, tfidf_trans.idf_)
        val_dataset = SparseDataset(val_data, tfidf_trans.idf_)
    else:
        # Otherwise assume directories of images.
        train_dataset = torchvision.datasets.ImageFolder(train_data)
        val_dataset = torchvision.datasets.ImageFolder(val_data)

    train_loader = DataLoader(train_dataset, batch_size=batch_size,
                              shuffle=shuffle, **kwargs)
    val_loader = DataLoader(val_dataset, batch_size=batch_size,
                            shuffle=False, **kwargs)
    return train_loader, val_loader, label_names
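# A hypothetical usage sketch; the dataset name, data path, and the exact
# return shape of load_data_func are assumptions about the surrounding code.
train_loader, val_loader, label_names = torch_loader(
    dataset='20news', data_path='./data', batch_size=64,
    shuffle=True, cuda_device=0, num_workers=4)

for batch, labels in train_loader:
    pass  # feed batches to the model here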
def __init__(self):
    self.trainset_path = '/data/lisa/data/UTLC/sparse/terry_train.npy.gz'
    self.validset_path = '/data/lisa/data/UTLC/sparse/terry_valid.npy.gz'
    self.testset_path = '/data/lisa/data/UTLC/sparse/terry_test.npy.gz'
    self.use_features_path = \
        '/data/lisa/data/UTLC/sparse/terry_testvalid_activefeat.npy'
    self.features_selected = numpy.load(open(self.use_features_path, 'rb'))

    # These are the sets before preprocessing.
    self.train_set = SparseDataset(load_path=self.trainset_path)
    self.valid_set = SparseDataset(load_path=self.validset_path)
    self.test_set = SparseDataset(load_path=self.testset_path)

    # These are the sets after preprocessing.
    self.trainset = None
    self.validset = None
    self.testset = None

    # fullset = scipy.sparse.vstack((scipy.sparse.vstack(
    #     (self.train_set.data, self.valid_set.data)), self.test_set.data))
    # self.full_set = SparseDataset(from_sparse_dataset=fullset)

    self.pre_processing()
def create_folds(args):
    parser = argparse.ArgumentParser(
        prog='geoinf create_folds',
        description='creates a set of data partitions for evaluating with '
                    'cross-fold validation')
    parser.add_argument('-f', '--force', action='store_true',
                        help='overwrite the output directory if it already exists')
    parser.add_argument('dataset_dir',
                        help='a directory containing a geoinference dataset')
    parser.add_argument('num_folds',
                        help='the number of folds into which the dataset '
                             'should be divided')
    parser.add_argument('fold_dir',
                        help='a (non-existent) directory that will contain '
                             'the information on the cross-validation folds')
    args = parser.parse_args(args)

    # Create the output directory if it doesn't exist yet
    if not os.path.exists(args.fold_dir):
        os.mkdir(args.fold_dir)

    # Decide on the number of folds
    num_folds = int(args.num_folds)
    if num_folds <= 1:
        raise Exception, 'The number of folds must be at least two'

    # Initialize the output streams.  Rather than keeping things in memory,
    # we batch the gold standard posts by user (one at a time) and then
    # stream the user's gold standard posts (if any) to the output streams.
    output_held_out_post_ids_file_handles = []
    output_held_out_user_ids_file_handles = []
    output_gold_loc_file_handles = []
    output_posts_file_handles = []

    cf_info_fh = open(os.path.join(args.fold_dir, 'folds.info.tsv'), 'w')

    for i in range(0, num_folds):
        fold_name = 'fold_%d' % i

        # All the IDs of the gold posts in this fold are written here
        fold_posts_ids_fh = open(
            os.path.join(args.fold_dir, fold_name + '.post-ids.txt'), 'w')
        output_held_out_post_ids_file_handles.append(fold_posts_ids_fh)

        # All the IDs of the users with gold posts are written here
        fold_users_ids_fh = open(
            os.path.join(args.fold_dir, fold_name + '.user-ids.txt'), 'w')
        output_held_out_user_ids_file_handles.append(fold_users_ids_fh)

        # All the lat/lon and IDs of the gold posts are written here
        gold_loc_fh = open(
            os.path.join(args.fold_dir, fold_name + '.gold-locations.tsv'), 'w')
        output_gold_loc_file_handles.append(gold_loc_fh)

        # The users.json.gz file with the gold data (used for testing)
        gold_users_fh = gzip.open(
            os.path.join(args.fold_dir, fold_name + '.users.json.gz'), 'w')
        output_posts_file_handles.append(gold_users_fh)

        cf_info_fh.write('%s\t%s.post-ids.txt\t%s.user-ids.txt\t%s.users.json.gz\n'
                         % (fold_name, fold_name, fold_name, fold_name))
    cf_info_fh.close()

    # Load the dataset
    ds = SparseDataset(args.dataset_dir)

    logger.debug('Extracting gold-standard posts')

    num_users = 0
    num_posts = 0
    num_gold_users = 0
    num_gold_posts = 0

    # Iterate over the dataset looking for posts with geo IDs that we can
    # use as a gold standard
    for user in ds.user_iter():
        gold_posts = []
        gold_post_id_to_loc = {}
        user_id = user['user_id']
        num_posts += len(user['posts'])

        for post in user['posts']:
            if 'geo' in post:
                post_id = post['id']
                loc = post['geo']['coordinates']
                gold_post_id_to_loc[post_id] = loc
                gold_posts.append(post)

        # If this user had any gold locations, assign the user to a fold
        if len(gold_posts) > 0:
            num_gold_posts += len(gold_posts)
            fold_to_use = num_gold_users % num_folds
            num_gold_users += 1

            output_held_out_user_ids_file_handles[fold_to_use].write(
                '%s\n' % user_id)
            for post_id, loc in gold_post_id_to_loc.iteritems():
                output_held_out_post_ids_file_handles[fold_to_use].write(
                    '%d\n' % post_id)
                output_gold_loc_file_handles[fold_to_use].write(
                    '%d\t%s\t%f\t%f\n' % (post_id, user_id, loc[0], loc[1]))

            # Lazily mutate the existing user object and then dump it to
            # the fold's users.json.gz
            user['posts'] = gold_posts
            output_posts_file_handles[fold_to_use].write(
                '%s\n' % simplejson.dumps(user))

        num_users += 1
        if num_users % 100000 == 0:
            logger.debug('Processed %d users, saw %d gold so far '
                         '(%d posts of %d (%f))'
                         % (num_users, num_gold_users, num_gold_posts,
                            num_posts, float(num_gold_posts) / num_posts))

    for fh in output_posts_file_handles:
        fh.close()
    for fh in output_held_out_post_ids_file_handles:
        fh.close()
    for fh in output_held_out_user_ids_file_handles:
        fh.close()
    for fh in output_gold_loc_file_handles:
        fh.close()

    logger.debug('Saw %d gold standard users in %d total'
                 % (num_gold_users, num_users))
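# A minimal read-back sketch (the path is a placeholder following the
# naming conventions above): iterate the held-out testing users for fold 0
# from the gzipped JSON-lines file written by create_folds.
import gzip
import simplejson

with gzip.open('folds/fold_0.users.json.gz') as fh:
    for line in fh:
        user = simplejson.loads(line)
        # user['posts'] holds only this user's gold-standard posts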
def infer(args, by_user=False):
    prog_name = 'geoinf'
    if by_user:
        description = 'infer the location of posts in a dataset using a ' \
                      'specific inference method. Posts will be provided ' \
                      'to the method grouped by user.'
        prog_name += ' infer_by_user'
    else:
        description = 'infer the location of posts in a dataset using a ' \
                      'specific inference method. Posts will be provided ' \
                      'to the method one at a time.'
        prog_name += ' infer_by_post'

    parser = argparse.ArgumentParser(prog=prog_name, description=description)
    parser.add_argument('-f', '--force', action='store_true',
                        help='overwrite the output file if it already exists')
    parser.add_argument('-s', '--settings', nargs=1,
                        help='a json file of settings to be passed to the model')
    parser.add_argument('method_name',
                        help='the type of method to use for inference')
    parser.add_argument('model_dir',
                        help='the directory of a model that was constructed '
                             'using the train procedure')
    parser.add_argument('dataset',
                        help='a json specification for the dataset to infer '
                             'locations on')
    parser.add_argument('infer_file',
                        help='the file that the inferences will be written to')

    logger.debug('infer args = %s' % str(args))
    args = parser.parse_args(args)

    # Load the infer settings, if provided.  With nargs=1, argparse stores
    # the filename in a one-element list.
    settings = {}
    if args.settings:
        with open(args.settings[0], 'r') as fh:
            settings = json.load(fh)

    if os.path.exists(args.infer_file) and not args.force:
        raise Exception, 'output infer_file cannot exist'

    # Load the method and the trained model
    method = get_method_by_name(args.method_name)
    method_inst = method()
    model = method_inst.load_model(args.model_dir, settings)

    # Load the dataset
    ds = SparseDataset(args.dataset)

    # Get the output file ready and write the settings to its first line
    outfh = open(args.infer_file, 'w')
    outfh.write('%s\n' % json.dumps({'method': args.method_name,
                                     'settings': settings,
                                     'dataset': args.dataset,
                                     'by_user': by_user}))

    # Locate all the posts
    logger.info('inferring locations for posts')

    if by_user:
        num_posts_seen = 0
        num_posts_located = 0
        num_users_seen = 0
        for user in ds.user_iter():
            user_id = user['user_id']
            posts = user['posts']
            locs = model.infer_posts_locations_by_user(user_id, posts)
            assert len(locs) == len(posts)
            num_users_seen += 1
            for loc, post in zip(locs, posts):
                num_posts_seen += 1
                if loc is not None:
                    num_posts_located += 1
                    outfh.write('%s\t%f\t%f\n'
                                % (post['id'], loc[0], loc[1]))
                if num_posts_seen % 10000 == 0:
                    logger.debug('Saw %d users, %d posts, %d of which were located'
                                 % (num_users_seen, num_posts_seen,
                                    num_posts_located))
    else:
        num_posts_seen = 0
        num_posts_located = 0
        for post in ds.post_iter():
            user_id = post['user']['id_str']
            loc = model.infer_post_location(post)
            num_posts_seen += 1
            if loc is not None:
                outfh.write('%s\t%f\t%f\n' % (post['id'], loc[0], loc[1]))
                num_posts_located += 1
            if num_posts_seen % 10000 == 0:
                logger.debug('Saw %d posts, %d of which were located'
                             % (num_posts_seen, num_posts_located))
    outfh.close()
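# A hypothetical invocation sketch; the method name, model directory,
# dataset spec, and output path are all placeholders.
infer(['-s', 'settings.json', 'my_method', 'models/my_method',
       'dataset.json', 'out/inferred.tsv'], by_user=False)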
def train(args):
    parser = argparse.ArgumentParser(
        prog='geoinf train',
        description='train a geoinference method on a specific dataset')
    parser.add_argument('-f', '--force', action='store_true',
                        help='overwrite the output model directory if it '
                             'already exists')
    parser.add_argument('method_name', help='the method to use')
    parser.add_argument('method_settings',
                        help='a json file containing method-specific '
                             'configurations')
    parser.add_argument('dataset_dir',
                        help='a directory containing a geoinference dataset')
    parser.add_argument('model_dir',
                        help='a (non-existing) directory where the trained '
                             'model will be stored')
    parser.add_argument('--location-source', nargs=1,
                        help='specifies the source of ground-truth locations')
    args = parser.parse_args(args)

    # Confirm that the output directory doesn't exist
    if os.path.exists(args.model_dir) and not args.force:
        raise Exception, 'output model_dir cannot exist'

    # Load the method-specific settings
    with open(args.method_settings, 'r') as fh:
        settings = json.load(fh)

    location_source = args.location_source
    if location_source:
        location_source = location_source[0]
        logger.debug('Using %s as the source of ground truth location'
                     % location_source)
        settings['location_source'] = location_source

    # Load the dataset
    if location_source is not None:
        ds = SparseDataset(args.dataset_dir,
                           default_location_source=location_source)
    else:
        ds = SparseDataset(args.dataset_dir)

    # Load the method and train the model
    method = get_method_by_name(args.method_name)
    method_inst = method()

    start_time = time.time()
    method_inst.train_model(settings, ds, args.model_dir)
    end_time = time.time()

    logger.info('Trained model %s on dataset %s in %f seconds'
                % (args.method_name, args.dataset_dir, end_time - start_time))
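# A hypothetical invocation sketch; the method name and paths are
# placeholders.
train(['my_method', 'settings.json', 'datasets/twitter', 'models/my_method'])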
def cross_validate(args):
    parser = argparse.ArgumentParser(
        prog='geoinf cross_validate',
        description='evaluate a geoinference method using cross-validation')
    parser.add_argument('-f', '--force', action='store_true',
                        help='overwrite the output directory if it already exists')
    parser.add_argument('method_name', help='the method to use')
    parser.add_argument('method_settings',
                        help='a json file containing method-specific '
                             'configurations')
    parser.add_argument('dataset_dir',
                        help='a directory containing a geoinference dataset')
    parser.add_argument('fold_dir',
                        help='the name of the directory containing information '
                             'on the cross-validation folds')
    parser.add_argument('results_dir',
                        help='a (non-existent) directory where the evaluation '
                             'results will be stored')
    parser.add_argument('--fold', nargs=1,
                        help='runs just that fold from the cross-fold dataset')
    parser.add_argument('--location-source', nargs=1,
                        help='specifies the source of ground-truth locations')
    args = parser.parse_args(args)

    # Create the results directory if it doesn't exist yet
    if not os.path.exists(args.results_dir):
        os.mkdir(args.results_dir)

    # Load the method-specific settings
    with open(args.method_settings, 'r') as fh:
        settings = json.load(fh)

    specific_fold_to_run = args.fold
    if specific_fold_to_run:
        specific_fold_to_run = specific_fold_to_run[0]

    location_source = args.location_source
    if location_source:
        location_source = location_source[0]
        logger.debug('Using %s as the source of ground truth location'
                     % location_source)
        settings['location_source'] = location_source

    print 'running fold %s' % specific_fold_to_run

    # Load the folds to be used in the dataset.  Each line of folds.info.tsv
    # names a fold and the files specifying the post IDs and user IDs to be
    # held out from the full dataset, plus the file containing the testing
    # data for that fold.
    cfv_fh = open(os.path.join(args.fold_dir, 'folds.info.tsv'))

    for line in cfv_fh:
        line = line.strip()
        (fold_name, testing_post_ids_file,
         testing_user_ids_file, testing_users_file) = line.split('\t')

        # Skip this fold if the user has asked to run only one fold by name
        if specific_fold_to_run is not None and fold_name != specific_fold_to_run:
            continue

        logger.debug('starting processing of fold %s' % fold_name)

        # Read in the post IDs to exclude
        testing_post_ids = set()
        tpi_fh = open(os.path.join(
            args.fold_dir, testing_post_ids_file.replace('held-out-', '')))
        for id_str in tpi_fh:
            testing_post_ids.add(id_str.strip())
        tpi_fh.close()

        # Read in the user IDs to exclude
        testing_user_ids = set()
        tui_fh = open(os.path.join(
            args.fold_dir, testing_user_ids_file.replace('held-out-', '')))
        for id_str in tui_fh:
            testing_user_ids.add(id_str.strip())
        tui_fh.close()

        logger.debug('Loaded %d users whose location data will be held out'
                     % len(testing_user_ids))

        # Load the dataset with the held-out users excluded
        if location_source is not None:
            training_data = SparseDataset(
                args.dataset_dir, excluded_users=testing_user_ids,
                default_location_source=location_source)
        else:
            training_data = SparseDataset(args.dataset_dir,
                                          excluded_users=testing_user_ids)

        # Load the method
        method = get_method_by_name(args.method_name)
        method_inst = method()

        # Create the directory that will hold the model for this fold
        model_dir = os.path.join(args.results_dir, fold_name)
        if not os.path.exists(model_dir):
            os.mkdir(model_dir)

        # Train on the dataset, holding out the testing post IDs
        model = method_inst.train_model(settings, training_data, None)
        logger.debug('Finished training during fold %s; beginning testing'
                     % fold_name)

        logger.debug('Reading testing data from %s'
                     % os.path.join(args.fold_dir, testing_users_file))
        testing_data = Dataset(args.fold_dir,
                               users_file=os.path.join(args.fold_dir,
                                                       testing_users_file))

        results_path = os.path.join(args.results_dir,
                                    fold_name + '.results.tsv.gz')
        logger.debug('Writing results to %s' % results_path)
        out_fh = gzip.open(results_path, 'w')

        num_tested_users = 0
        num_tested_posts = 0
        num_located_posts = 0
        seen_ids = set()
        for user in testing_data.user_iter():
            user_id = user['user_id']
            posts = user['posts']
            locs = model.infer_posts_locations_by_user(user_id, posts)
            if len(locs) != len(posts):
                logger.warn('Mismatched lengths: %d locations for %d posts'
                            % (len(locs), len(posts)))
            num_tested_posts += len(posts)
            for loc, post in zip(locs, posts):
                pid = post['id']
                # Guard against posts appearing under more than one user
                if pid in seen_ids:
                    continue
                seen_ids.add(pid)
                if loc is not None:
                    out_fh.write('%s\t%f\t%f\n' % (pid, loc[0], loc[1]))
                    num_located_posts += 1
            num_tested_users += 1
            if num_tested_users % 10000 == 0:
                logger.debug('During testing of fold %s, processed %d users, '
                             '%d posts, %d located'
                             % (fold_name, num_tested_users,
                                num_tested_posts, num_located_posts))
        out_fh.close()
        logger.debug('Finished testing of fold %s' % fold_name)
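# A hedged evaluation sketch: join one fold's inferred locations against the
# gold locations written by create_folds and report the median great-circle
# error.  The paths are placeholders following the naming conventions above,
# and the haversine helper is not part of the original code.
import gzip
import math

def haversine_km(lat1, lon1, lat2, lon2):
    # Great-circle distance in kilometers on a sphere of radius 6371 km.
    lat1, lon1, lat2, lon2 = map(math.radians, (lat1, lon1, lat2, lon2))
    a = (math.sin((lat2 - lat1) / 2) ** 2
         + math.cos(lat1) * math.cos(lat2) * math.sin((lon2 - lon1) / 2) ** 2)
    return 2 * 6371.0 * math.asin(math.sqrt(a))

# Gold locations for the fold: post_id \t user_id \t lat \t lon
gold = {}
with open('folds/fold_0.gold-locations.tsv') as fh:
    for line in fh:
        post_id, user_id, lat, lon = line.rstrip('\n').split('\t')
        gold[post_id] = (float(lat), float(lon))

# Inferred locations for the fold: post_id \t lat \t lon
errors = []
with gzip.open('results/fold_0.results.tsv.gz') as fh:
    for line in fh:
        post_id, lat, lon = line.rstrip('\n').split('\t')
        if post_id in gold:
            glat, glon = gold[post_id]
            errors.append(haversine_km(float(lat), float(lon), glat, glon))

if errors:
    print 'median error: %.1f km' % sorted(errors)[len(errors) // 2]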
def create_folds(args):
    parser = argparse.ArgumentParser(
        prog='geoinf create_folds',
        description='creates a set of data partitions for evaluating with '
                    'cross-fold validation')
    parser.add_argument('-f', '--force', action='store_true',
                        help='overwrite the output directory if it already exists')
    parser.add_argument('dataset_dir',
                        help='a directory containing a geoinference dataset')
    parser.add_argument('fold_dir',
                        help='a (non-existent) directory that will contain '
                             'the information on the cross-validation folds')
    parser.add_argument('test_case',
                        help='the type of test to run: rural vs. urban '
                             '("county"), gender ("gender"), or random '
                             '(any other string)')
    args = parser.parse_args(args)

    # Create the output directory if it doesn't exist yet
    if not os.path.exists(args.fold_dir):
        os.mkdir(args.fold_dir)

    ground_truth_file = 'filtered_user_groundtruth_locfield.tsv'
    ground_truth_locs = 'users.home-locations.loc-field.tsv.gz'

    # Decide on the number of folds based on the test case
    if args.test_case == 'gender':
        num_folds = NUM_MALE_FOLDS + NUM_FEMALE_FOLDS + NUM_UNKNOWN_FOLDS
    elif args.test_case == 'county':
        num_folds = NUM_URBAN_FOLDS * 6
    else:
        num_folds = NUM_RANDOM_FOLDS
    if num_folds <= 1:
        raise Exception, 'The number of folds must be at least two'

    # Load the per-user gender and urban/rural level ground truth
    idToGender = {}
    idToUrbanLevel = {}
    with open(os.path.join(args.dataset_dir, ground_truth_file), 'r') as gt_file:
        gt_file.next()  # skip the header line
        for line in gt_file:
            try:
                uid, gender, urbanLevel = line.split('\t')
                idToGender[uid] = gender
                if urbanLevel != '\r\n':  # skip users with no recorded level
                    idToUrbanLevel[uid] = int(urbanLevel)
            except:
                print line  # report malformed lines

    # Load the per-user home locations
    idToLoc = {}
    with gzip.open(os.path.join(args.dataset_dir, ground_truth_locs),
                   'r') as gt_file:
        gt_file.next()  # skip the header line
        for line in gt_file:
            uid, lat, lon = line.split('\t')
            idToLoc[uid] = (lat, lon)

    # Initialize the output streams.  Rather than keeping things in memory,
    # we batch the gold standard users (one at a time) and then stream each
    # user's data (if any) to the output streams.
    output_user_ids_file_handles = []

    cf_info_fh = open(os.path.join(args.fold_dir, 'folds.info.tsv'), 'w')
    for i in range(0, num_folds):
        fold_name = 'fold_%d' % i
        # All the IDs of the users with gold data are written here
        fold_users_ids_fh = open(
            os.path.join(args.fold_dir, fold_name + '.user-ids.txt'), 'w')
        output_user_ids_file_handles.append(fold_users_ids_fh)
        cf_info_fh.write('%s\t%s.user-ids.txt\n' % (fold_name, fold_name))
    cf_info_fh.close()

    # Load the dataset
    ds = SparseDataset(args.dataset_dir)

    if args.test_case == 'gender':
        female_users = []
        male_users = []
        unknown_users = []
        for user in ds.user_iter():
            user_id = user['user_id']
            user_gender = idToGender.get(str(user_id), -1)
            # If this user had gold data, assign the user to a gender group
            if user_gender != -1:
                if user_gender == 'm':
                    male_users.append(user_id)
                elif user_gender == 'f':
                    female_users.append(user_id)
                else:
                    unknown_users.append(user_id)

        currentFold = 0
        for fold in generate_folds(male_users, NUM_MALE_FOLDS):
            write_fold(fold, currentFold, idToLoc, output_user_ids_file_handles)
            currentFold += 1
        for fold in generate_folds(female_users, NUM_FEMALE_FOLDS):
            write_fold(fold, currentFold, idToLoc, output_user_ids_file_handles)
            currentFold += 1
        for fold in generate_folds(unknown_users, NUM_UNKNOWN_FOLDS):
            write_fold(fold, currentFold, idToLoc, output_user_ids_file_handles)
            currentFold += 1

    elif args.test_case == 'county':
        # One bucket of users per urban/rural level (levels 1-6)
        usersAtLevel = dict((i, []) for i in range(1, 7))
        for user in ds.user_iter():
            user_id = user['user_id']
            urbanRuralLevel = idToUrbanLevel.get(str(user_id), -1)
            # If this user had gold data, bucket the user by level
            if urbanRuralLevel != -1:
                usersAtLevel[urbanRuralLevel].append(user_id)

        currentFoldIndex = 0
        for i in range(1, 7):
            for fold in generate_folds(usersAtLevel[i], NUM_URBAN_FOLDS):
                write_fold(fold, currentFoldIndex, idToLoc,
                           output_user_ids_file_handles)
                currentFoldIndex += 1

    else:
        # Random assignment: collect every user with gold data and split
        # them into NUM_RANDOM_FOLDS folds
        gold_users = []
        for user in ds.user_iter():
            user_id = user['user_id']
            gender = idToGender.get(str(user_id), -1)
            if gender != -1:
                gold_users.append(user_id)

        currentFoldIndex = 0
        for fold in generate_folds(gold_users, NUM_RANDOM_FOLDS):
            write_fold(fold, currentFoldIndex, idToLoc,
                       output_user_ids_file_handles)
            currentFoldIndex += 1

    for fh in output_user_ids_file_handles:
        fh.close()
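# generate_folds and write_fold are not shown in this section.  A minimal
# sketch of what they might look like, under the assumptions that folds are
# roughly equal-size chunks of a shuffled user list and that write_fold
# records one user ID per line in the fold's user-ids file.
import random

def generate_folds(user_ids, num_folds):
    # Hypothetical helper: shuffle the users and split them into
    # num_folds roughly equal chunks.
    ids = list(user_ids)
    random.shuffle(ids)
    return [ids[i::num_folds] for i in range(num_folds)]

def write_fold(fold, fold_index, id_to_loc, user_ids_file_handles):
    # Hypothetical helper: record each user ID in the fold's user-ids file.
    # id_to_loc is accepted to mirror the call sites above; a fuller
    # implementation would presumably also write each user's home location.
    for user_id in fold:
        user_ids_file_handles[fold_index].write('%s\n' % user_id)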