Code Example #1
    def pre_processing(self):
        print('preprocessing...')

        # uniformization
        raw = numpy.concatenate([
            self.train_set.sparse_matrix.data,
            self.valid_set.sparse_matrix.data, self.test_set.sparse_matrix.data
        ])

        len_train = len(self.train_set.sparse_matrix.data)
        len_valid = len(self.valid_set.sparse_matrix.data)
        len_test = len(self.test_set.sparse_matrix.data)

        # uniformize all values at once, then split the result back into
        # the three sets by their original lengths
        out = data_processing.uniformization(raw, False)

        self.train_set.sparse_matrix.data = out[0:len_train]
        self.valid_set.sparse_matrix.data = out[len_train:(len_train +
                                                           len_valid)]
        self.test_set.sparse_matrix.data = out[-len_test:]

        self.full_train = scipy.sparse.vstack(
            [self.train_set.sparse_matrix, self.valid_set.sparse_matrix],
            'csr')

        # shuffling train set
        self.full_train = self.full_train[
            numpy.random.permutation(self.full_train.shape[0]), :]

        # feature subset selection
        self.full_train = self.full_train[:, self.features_selected]
        self.valid_set = self.valid_set.sparse_matrix[:,
                                                      self.features_selected]
        self.test_set = self.test_set.sparse_matrix[:, self.features_selected]

        # whitening
        std = numpy.std(self.full_train.data)
        self.full_train /= std
        self.valid_set /= std
        self.test_set /= std

        # finally
        self.trainset = SparseDataset(
            from_scipy_sparse_dataset=self.full_train)
        self.validset = SparseDataset(from_scipy_sparse_dataset=self.valid_set)
        self.testset = SparseDataset(from_scipy_sparse_dataset=self.test_set)
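
The uniformization step above illustrates a reusable pattern: concatenate the `.data` arrays of several sparse matrices so a single transform sees every nonzero value, then split the result back by the original lengths. A minimal sketch with a stand-in transform (plain standardization, since `data_processing.uniformization` is not shown on this page):

import numpy
import scipy.sparse

train = scipy.sparse.random(4, 6, density=0.3, format='csr')
valid = scipy.sparse.random(2, 6, density=0.3, format='csr')

# one transform over the nonzero values of both splits at once
raw = numpy.concatenate([train.data, valid.data])
out = (raw - raw.mean()) / raw.std()  # stand-in for uniformization

# split the transformed values back by the original lengths
n_train = len(train.data)
train.data = out[:n_train]
valid.data = out[n_train:]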
Code Example #2
    def torch_loader(dataset,
                     data_path,
                     batch_size,
                     shuffle=True,
                     cuda_device=None,
                     num_workers=1):
        (train_data, val_data), (train_labels,
                                 val_labels), label_names = load_data_func(
                                     dataset, data_path)

        kwargs = {
            'num_workers': num_workers,
            'pin_memory': True
        } if cuda_device is not None else {}
        kwargs['drop_last'] = True

        if isinstance(train_data, numpy.ndarray):
            train_dataset = TensorDataset(torch.from_numpy(train_data),
                                          torch.from_numpy(train_labels))
            val_dataset = TensorDataset(torch.from_numpy(val_data),
                                        torch.from_numpy(val_labels))
        elif isinstance(train_data, scipy.sparse.csr_matrix):
            from sklearn.feature_extraction.text import TfidfTransformer
            tfidf_trans = TfidfTransformer(norm=None)
            tfidf_trans.fit(train_data)
            train_dataset = SparseDataset(train_data, tfidf_trans.idf_)
            val_dataset = SparseDataset(val_data, tfidf_trans.idf_)
        else:
            train_dataset = torchvision.datasets.ImageFolder(train_data)
            val_dataset = torchvision.datasets.ImageFolder(val_data)

        train_loader = DataLoader(train_dataset,
                                  batch_size=batch_size,
                                  shuffle=shuffle,
                                  **kwargs)
        val_loader = DataLoader(val_dataset,
                                batch_size=batch_size,
                                shuffle=False,
                                **kwargs)

        return train_loader, val_loader, label_names
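
The `SparseDataset` built from the CSR matrix and `tfidf_trans.idf_` above is not defined on this page. A plausible minimal version, assuming it densifies one row per item and weights it by the idf vector (note the labels are not passed in the sparse branch above):

import numpy
import torch
from torch.utils.data import Dataset

class SparseDataset(Dataset):
    # hypothetical: wraps a scipy CSR matrix, yielding idf-weighted dense rows
    def __init__(self, csr_matrix, idf):
        self.matrix = csr_matrix
        self.idf = numpy.asarray(idf, dtype=numpy.float32)

    def __len__(self):
        return self.matrix.shape[0]

    def __getitem__(self, index):
        # densify a single row and scale each feature by its idf weight
        row = numpy.asarray(self.matrix[index].todense(),
                            dtype=numpy.float32).ravel()
        return torch.from_numpy(row * self.idf)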
Code Example #3
    def __init__(self):

        self.trainset_path = '/data/lisa/data/UTLC/sparse/terry_train.npy.gz'
        self.validset_path = '/data/lisa/data/UTLC/sparse/terry_valid.npy.gz'
        self.testset_path = '/data/lisa/data/UTLC/sparse/terry_test.npy.gz'
        self.use_features_path = '/data/lisa/data/UTLC/sparse/terry_testvalid_activefeat.npy'

        self.features_selected = numpy.load(self.use_features_path)
        # these are sets before preprocessing
        self.train_set = SparseDataset(load_path=self.trainset_path)
        self.valid_set = SparseDataset(load_path=self.validset_path)
        self.test_set = SparseDataset(load_path=self.testset_path)
        # these are sets after preprocessing
        self.trainset = None
        self.validset = None
        self.testset = None
        #fullset = scipy.sparse.vstack((scipy.sparse.vstack((self.train_set.data, self.valid_set.data)),
        #                              self.test_set.data))
        #self.full_set = SparseDataset(from_sparse_dataset=fullset)

        self.pre_processing()
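
The `SparseDataset` wrapper used in Code Examples #1 and #3 (with its `load_path` and `from_scipy_sparse_dataset` keywords) is likewise not shown on this page. A minimal stand-in, assuming the `.npy.gz` files hold a scipy sparse matrix pickled via `numpy.save`:

import gzip
import numpy

class SparseDataset(object):
    # hypothetical stand-in mirroring the two constructor keywords used above
    def __init__(self, load_path=None, from_scipy_sparse_dataset=None):
        if load_path is not None:
            # assumption: a sparse matrix pickled inside a gzipped .npy file
            with gzip.open(load_path, 'rb') as fh:
                self.sparse_matrix = numpy.load(fh, allow_pickle=True).item()
        else:
            self.sparse_matrix = from_scipy_sparse_dataset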
Code Example #4
File: app.py Project: brentwalther/geoinference
def create_folds(args):
    parser = argparse.ArgumentParser(prog='geoinf create_folds', description='creates a set of data partitions for evaluating with cross-fold validation')
    parser.add_argument('-f', '--force', help='overwrite the output model directory if it already exists')
    parser.add_argument('dataset_dir', help='a directory containing a geoinference dataset')
    parser.add_argument('num_folds', help='the number of folds into which the dataset should be divided')
    parser.add_argument('fold_dir', help='a (non-existent) directory that will contain the information on the cross-validation folds')

    args = parser.parse_args(args)

    # Create the output fold directory if it doesn't already exist
    if not os.path.exists(args.fold_dir):
        os.mkdir(args.fold_dir)

    # Decide on the number of folds
    num_folds = int(args.num_folds)
    if num_folds <= 1:
        raise Exception('The number of folds must be at least two')

    # Initialize the output streams.  Rather than keeping things in memory,
    # we batch the gold standard posts by users (one at a time) and then
    # stream the user's gold standard posts (if any) to the output streams
    output_held_out_post_ids_file_handles = []
    output_held_out_user_ids_file_handles = []
    output_gold_loc_file_handles = []
    output_posts_file_handles = []
    cf_info_fh = open(os.path.join(args.fold_dir, "folds.info.tsv"), 'w')

    for i in range(0, num_folds):
        fold_name = "fold_%d" % i

        # All the IDs of the gold posts in this fold are written here
        fold_posts_ids_fh = open(os.path.join(args.fold_dir, fold_name + ".post-ids.txt"), 'w')
        output_held_out_post_ids_file_handles.append(fold_posts_ids_fh)

        # All the IDs of the users with gold posts are written here
        fold_users_ids_fh = open(os.path.join(args.fold_dir, fold_name + ".user-ids.txt"), 'w')
        output_held_out_user_ids_file_handles.append(fold_users_ids_fh)

        # All the lat/lon and IDs of the gold posts are written here
        gold_loc_fh = open(os.path.join(args.fold_dir, fold_name + ".gold-locations.tsv"), 'w')
        output_gold_loc_file_handles.append(gold_loc_fh)

        # The users.json.gz file with the gold data (used for testing)
        users_json_fh = gzip.open(os.path.join(args.fold_dir, fold_name + ".users.json.gz"), 'w')
        output_posts_file_handles.append(users_json_fh)

        cf_info_fh.write("%s\t%s.post-ids.txt\t%s.user-ids.txt\t%s.users.json.gz\n"
                         % (fold_name, fold_name, fold_name, fold_name))
    cf_info_fh.close()

    # Load the dataset
    ds = SparseDataset(args.dataset_dir)

    logger.debug('Extracting gold-standard posts')
    num_users = 0
    num_posts = 0
    num_gold_users = 0
    num_gold_posts = 0

    # Iterate over the dataset looking for posts with geo IDs that we can
    # use as a gold standard
    for user in ds.user_iter():
        gold_posts = []
        gold_post_id_to_loc = {}
        user_id = user['user_id']
        num_posts += len(user['posts'])
        for post in user['posts']:
            if "geo" in post:
                post_id = post['id']
                loc = post['geo']['coordinates']
                gold_post_id_to_loc[post_id] = loc
                gold_posts.append(post)
        # If this user had any gold posts, assign the user to a fold
        # round-robin
        if len(gold_posts) > 0:
            num_gold_posts += len(gold_posts)
            fold_to_use = num_gold_users % num_folds
            num_gold_users += 1

            output_held_out_user_ids_file_handles[fold_to_use].write("%s\n" % user_id)

            for post_id, loc in gold_post_id_to_loc.items():
                output_held_out_post_ids_file_handles[fold_to_use].write("%d\n" % post_id)
                output_gold_loc_file_handles[fold_to_use].write("%d\t%s\t%f\t%f\n" % (post_id, user_id, loc[0], loc[1]))

            # Lazily mutate the existing user object and then dump
            # that object to the fold's users.json.gz
            user['posts'] = gold_posts
            output_posts_file_handles[fold_to_use].write("%s\n" % simplejson.dumps(user))

        num_users += 1
        if num_users % 100000 == 0:
            logger.debug('Processed %d users, saw %d gold so far (%d posts of %d (%f))'
                         % (num_users, num_gold_users, num_gold_posts, num_posts,
                            float(num_gold_posts) / num_posts))

    for fh in output_posts_file_handles:
        fh.close()
    for fh in output_held_out_post_ids_file_handles:
        fh.close()
    for fh in output_held_out_user_ids_file_handles:
        fh.close()
    for fh in output_gold_loc_file_handles:
        fh.close()

    logger.debug('Saw %d gold standard users in %d total' % (num_gold_users, num_users))
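
Since `create_folds` parses its own argument list with argparse, it can be driven programmatically as well as from a CLI wrapper; a hypothetical invocation with placeholder paths:

# hypothetical invocation: 5 folds from a dataset directory
create_folds(['path/to/dataset', '5', 'path/to/folds'])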
Code Example #5
File: app.py Project: brentwalther/geoinference
def infer(args, by_user=False):
    prog_name = 'geoinf'
    if by_user:
        description = 'infer the location of posts in a dataset using a specific inference method. Posts will be provided to the method grouped by user.'
        prog_name += ' infer_by_user'
    else:
        description = 'infer the location of posts in a dataset using a specific inference method. Posts will be provided to the method one at a time.'
        prog_name += ' infer_by_post'

    parser = argparse.ArgumentParser(prog=prog_name, description=description)
    parser.add_argument('-f', '--force', action='store_true', help='overwrite the output file if it already exists')
    parser.add_argument('-s', '--settings', help='a json file of settings to be passed to the model', nargs=1)
    parser.add_argument('method_name', help='the type of method to use for inference')
    parser.add_argument('model_dir', help='the directory of a model that was constructed using the train procedure')
    parser.add_argument('dataset', help='a json specification for the dataset to infer locations on')
    parser.add_argument('infer_file', help='the file that the inferences will be written to')

    logger.debug('infer args = %s' % str(args))
    args = parser.parse_args(args)

    # load the infer settings if necessary
    settings = {}
    if args.settings:
        # nargs=1 yields a one-element list
        with open(args.settings[0], 'r') as fh:
            settings = json.load(fh)

    if os.path.exists(args.infer_file) and not args.force:
        raise Exception('output infer_file cannot exist')

    # load the method
    method = get_method_by_name(args.method_name)
    method_inst = method()
    model = method_inst.load_model(args.model_dir, settings)

    # load the dataset
    ds = SparseDataset(args.dataset)

    # get the output file ready
    outfh = open(args.infer_file, 'w')

    # write settings to the first line
    outfh.write('%s\n' % json.dumps({'method': args.method_name,
                                     'settings': settings,
                                     'dataset': args.dataset,
                                     'by_user': by_user}))

    # locate all the posts
    logger.info('inferring locations for posts')
    if by_user:
        num_posts_seen = 0
        num_posts_located = 0
        num_users_seen = 0
        for user in ds.user_iter():
            user_id = user['user_id']
            posts = user['posts']

            locs = model.infer_posts_locations_by_user(user_id, posts)

            assert len(locs) == len(posts)
            num_users_seen += 1

            for loc, post in zip(locs, posts):
                num_posts_seen += 1
                if loc is not None:
                    num_posts_located += 1
                    outfh.write('%s\t%f\t%f\n' % (post['id'], loc[0], loc[1]))

                if num_posts_seen % 10000 == 0:
                    logger.debug("Saw %d users, %d posts, %d of which were located" % (num_users_seen, num_posts_seen, num_posts_located))
    else:
        num_posts_seen = 0
        num_posts_located = 0
        for post in ds.post_iter():
            user_id = post['user']['id_str']
            loc = model.infer_post_location(post)
            num_posts_seen += 1
            if loc is not None:
                outfh.write('%s\t%f\t%f\n' % (post['id'], loc[0], loc[1]))
                num_posts_located += 1
            if num_posts_seen % 10000 == 0:
                logger.debug("Saw %d posts, %d of which were located" % (num_posts_seen, num_posts_located))

    outfh.close()
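
The inference file written above has a fixed shape: a JSON header line carrying the run settings, then one tab-separated (post_id, lat, lon) row per located post. A minimal reader for that format (the helper name is ours, not from this page):

import json

def read_inferences(path):
    # parse the file written by infer(): a JSON header, then TSV rows
    with open(path) as fh:
        header = json.loads(fh.readline())
        rows = []
        for line in fh:
            post_id, lat, lon = line.rstrip('\n').split('\t')
            rows.append((post_id, float(lat), float(lon)))
    return header, rows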
Code Example #6
def create_folds(args):
    parser = argparse.ArgumentParser(
        prog='geoinf create_folds',
        description=
        'creates a set of data partitions for evaluating with cross-fold validation'
    )
    parser.add_argument(
        '-f',
        '--force',
        help='overwrite the output model directory if it already exists')
    parser.add_argument('dataset_dir',
                        help='a directory containing a geoinference dataset')
    parser.add_argument(
        'num_folds',
        help='the number of folds into which the dataset should be divided')
    parser.add_argument(
        'fold_dir',
        help=
        'a (non-existent) directory that will contain the information on the cross-validation folds'
    )

    args = parser.parse_args(args)

    # Create the output fold directory if it doesn't already exist
    if not os.path.exists(args.fold_dir):
        os.mkdir(args.fold_dir)

    # Decide on the number of folds
    num_folds = int(args.num_folds)
    if num_folds <= 1:
        raise Exception('The number of folds must be at least two')

    # Initialize the output streams.  Rather than keeping things in memory,
    # we batch the gold standard posts by users (one at a time) and then
    # stream the user's gold standard posts (if any) to the output streams
    output_held_out_post_ids_file_handles = []
    output_held_out_user_ids_file_handles = []
    output_gold_loc_file_handles = []
    output_posts_file_handles = []
    cf_info_fh = open(os.path.join(args.fold_dir, "folds.info.tsv"), 'w')

    for i in range(0, num_folds):
        fold_name = "fold_%d" % i
        # All the IDs of the gold posts in this fold are written here
        fold_posts_ids_fh = open(
            os.path.join(args.fold_dir, fold_name + ".post-ids.txt"), 'w')
        output_held_out_post_ids_file_handles.append(fold_posts_ids_fh)

        # All the IDs of the users with gold posts are written here
        fold_users_ids_fh = open(
            os.path.join(args.fold_dir, fold_name + ".user-ids.txt"), 'w')
        output_held_out_user_ids_file_handles.append(fold_users_ids_fh)

        # All the lat/lon and IDs of the gold posts are written here
        gold_loc_fh = open(
            os.path.join(args.fold_dir, fold_name + ".gold-locations.tsv"),
            'w')
        output_gold_loc_file_handles.append(gold_loc_fh)

        # The users.json.gz file with the gold data (used for testing)
        users_json_fh = gzip.open(
            os.path.join(args.fold_dir, fold_name + ".users.json.gz"), 'w')
        output_posts_file_handles.append(users_json_fh)
        cf_info_fh.write(
            "%s\t%s.post-ids.txt\t%s.user-ids.txt\t%s.users.json.gz\n" %
            (fold_name, fold_name, fold_name, fold_name))
    cf_info_fh.close()

    # Load the dataset
    ds = SparseDataset(args.dataset_dir)

    logger.debug('Extracting gold-standard posts')
    num_users = 0
    num_posts = 0
    num_gold_users = 0
    num_gold_posts = 0

    # Iterate over the dataset looking for posts with geo IDs that we can
    # use as a gold standard
    for user in ds.user_iter():
        gold_posts = []
        gold_post_id_to_loc = {}
        user_id = user['user_id']
        num_posts += len(user['posts'])
        for post in user['posts']:
            if "geo" in post:
                post_id = post['id']
                loc = post['geo']['coordinates']
                gold_post_id_to_loc[post_id] = loc
                gold_posts.append(post)
        # If this user had any gold posts, assign the user to a fold round-robin
        if len(gold_posts) > 0:
            num_gold_posts += len(gold_posts)
            fold_to_use = num_gold_users % num_folds
            num_gold_users += 1

            output_held_out_user_ids_file_handles[fold_to_use].write(
                "%s\n" % user['user_id'])

            for post_id, loc in gold_post_id_to_loc.items():
                output_held_out_post_ids_file_handles[fold_to_use].write(
                    "%d\n" % post_id)
                output_gold_loc_file_handles[fold_to_use].write(
                    "%d\t%s\t%f\t%f\n" % (post_id, user_id, loc[0], loc[1]))
            # Lazily mutate the existing user object and then dump
            # that object to the fold's users.json.gz
            user['posts'] = gold_posts
            output_posts_file_handles[fold_to_use].write(
                "%s\n" % simplejson.dumps(user))

        num_users += 1
        if num_users % 100000 == 0:
            logger.debug(
                'Processed %d users, saw %d gold so far (%d posts of %d (%f))'
                % (num_users, num_gold_users, num_gold_posts, num_posts,
                   float(num_gold_posts) / num_posts))

    for fh in output_posts_file_handles:
        fh.close()
    for fh in output_held_out_post_ids_file_handles:
        fh.close()
    for fh in output_held_out_user_ids_file_handles:
        fh.close()
    for fh in output_gold_loc_file_handles:
        fh.close()

    logger.debug('Saw %d gold standard users in %d total' %
                 (num_gold_users, num_users))
Code Example #7
def infer(args, by_user=False):
    prog_name = 'geoinf'
    if by_user:
        description = 'infer the location of posts in a dataset using a specific inference method. Posts will be provided to the method grouped by user.'
        prog_name += ' infer_by_user'
    else:
        description = 'infer the location of posts in a dataset using a specific inference method. Posts will be provided to the method one at a time.'
        prog_name += ' infer_by_post'

    parser = argparse.ArgumentParser(prog=prog_name, description=description)
    parser.add_argument('-f',
                        '--force',
                        action='store_true',
                        help='overwrite the output file if it already exists')
    parser.add_argument(
        '-s',
        '--settings',
        help='a json file of settings to be passed to the model',
        nargs=1)
    parser.add_argument('method_name',
                        help='the type of method to use for inference')
    parser.add_argument(
        'model_dir',
        help=
        'the directory of a model that was constructed using the train procedure'
    )
    parser.add_argument(
        'dataset',
        help='a json specification for the dataset to infer locations on')
    parser.add_argument('infer_file',
                        help='the file that the inferences will be written to')

    logger.debug('infer args = %s' % str(args))
    args = parser.parse_args(args)

    # load the infer settings if necessary
    settings = {}
    if args.settings:
        with open(args.settings[0], 'r') as fh:  # nargs=1 yields a one-element list
            settings = json.load(fh)

    if os.path.exists(args.infer_file) and not args.force:
        raise Exception('output infer_file cannot exist')

    # load the method
    method = get_method_by_name(args.method_name)
    method_inst = method()
    model = method_inst.load_model(args.model_dir, settings)

    # load the dataset
    ds = SparseDataset(args.dataset)

    # get the output file ready
    outfh = open(args.infer_file, 'w')

    # write settings to the first line
    outfh.write('%s\n' % json.dumps({
        'method': args.method_name,
        'settings': settings,
        'dataset': args.dataset,
        'by_user': by_user
    }))

    # locate all the posts
    logger.info('inferring locations for posts')
    if by_user:
        num_posts_seen = 0
        num_posts_located = 0
        num_users_seen = 0
        for user in ds.user_iter():
            user_id = user['user_id']
            posts = user['posts']

            locs = model.infer_posts_locations_by_user(user_id, posts)

            assert len(locs) == len(posts)
            num_users_seen += 1

            for loc, post in zip(locs, posts):
                num_posts_seen += 1
                if loc is not None:
                    num_posts_located += 1
                    outfh.write('%s\t%f\t%f\n' % (post['id'], loc[0], loc[1]))

                if num_posts_seen % 10000 == 0:
                    logger.debug(
                        "Saw %d users, %d posts, %d of which were located" %
                        (num_users_seen, num_posts_seen, num_posts_located))
    else:
        num_posts_seen = 0
        num_posts_located = 0
        for post in ds.post_iter():
            user_id = post['user']['id_str']
            loc = model.infer_post_location(post)
            num_posts_seen += 1
            if loc is not None:
                outfh.write('%s\t%f\t%f\n' % (post['id'], loc[0], loc[1]))
                num_posts_located += 1
            if num_posts_seen % 10000 == 0:
                logger.debug("Saw %d posts, %d of which were located" %
                             (num_posts_seen, num_posts_located))

    outfh.close()
Code Example #8
def train(args):
    parser = argparse.ArgumentParser(
        prog='geoinf train',
        description='train a geoinference method on a specific dataset')
    parser.add_argument(
        '-f',
        '--force',
        help='overwrite the output model directory if it already exists')
    parser.add_argument('method_name', help='the method to use')
    parser.add_argument(
        'method_settings',
        help='a json file containing method-specific configurations')
    parser.add_argument('dataset_dir',
                        help='a directory containing a geoinference dataset')
    parser.add_argument(
        'model_dir',
        help='a (non-existing) directory where the trained model will be stored'
    )
    parser.add_argument('--location-source',
                        nargs=1,
                        help='specifies the source of ground-truth locations')

    args = parser.parse_args(args)

    # confirm that the output directory doesn't exist
    if os.path.exists(args.model_dir) and not args.force:
        raise Exception('output model_dir cannot exist')

    # load the data
    with open(args.method_settings, 'r') as fh:
        settings = json.load(fh)

    location_source = args.location_source
    if location_source:
        location_source = location_source[0]
        logger.debug('Using %s as the source of ground truth location' %
                     location_source)
        settings['location_source'] = location_source

    # load the dataset
    if location_source is not None:
        ds = SparseDataset(args.dataset_dir,
                           default_location_source=location_source)
    else:
        ds = SparseDataset(args.dataset_dir)

    # load the method
    method = get_method_by_name(args.method_name)
    method_inst = method()

    start_time = time.time()
    method_inst.train_model(settings, ds, args.model_dir)
    end_time = time.time()
    logger.info('Trained model %s on dataset %s in %f seconds' %
                (args.method_name, args.dataset_dir, end_time - start_time))

    # drop some metadata into the run method
    # run the method
    # gi_inst = method()
    # gi_inst.train(settings,ds,args.model_dir)

    return
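
`get_method_by_name` returns a method class that these commands instantiate. The interface is never shown on this page, but the calls in the examples imply the following shape; this skeleton (including the name `NullGeoMethod`) is an inference, not the real base class:

class NullGeoMethod(object):
    # hypothetical no-op method matching the calls used in these examples

    def train_model(self, settings, dataset, model_dir):
        # train on the dataset and persist under model_dir; return the model
        return self

    def load_model(self, model_dir, settings):
        # restore a model previously written by train_model
        return self

    def infer_post_location(self, post):
        # return a (lat, lon) pair, or None when no location can be inferred
        return None

    def infer_posts_locations_by_user(self, user_id, posts):
        # one location (or None) per post, in order
        return [self.infer_post_location(p) for p in posts]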
Code Example #9
def cross_validate(args):
    parser = argparse.ArgumentParser(
        prog='geoinf cross_validate',
        description='evaluate a geoinference method using cross-validation')
    parser.add_argument(
        '-f',
        '--force',
        help='overwrite the output model directory if it already exists')
    parser.add_argument('method_name', help='the method to use')
    parser.add_argument(
        'method_settings',
        help='a json file containing method-specific configurations')
    parser.add_argument('dataset_dir',
                        help='a directory containing a geoinference dataset')
    parser.add_argument(
        'fold_dir',
        help=
        'the name of the directory containing information on the cross-validation folds'
    )
    parser.add_argument(
        'results_dir',
        help=
        'a (non-existent) directory where the evaluation results will be stored'
    )
    parser.add_argument('--fold',
                        nargs=1,
                        help='runs just that fold from the cross-fold dataset')
    parser.add_argument('--location-source',
                        nargs=1,
                        help='specifies the source of ground-truth locations')

    args = parser.parse_args(args)

    # Create the output results directory if it doesn't already exist
    if not os.path.exists(args.results_dir):
        os.mkdir(args.results_dir)

    # load the method
    method = get_method_by_name(args.method_name)

    # load the data
    with open(args.method_settings, 'r') as fh:
        settings = json.load(fh)

    specific_fold_to_run = args.fold
    if specific_fold_to_run:
        specific_fold_to_run = specific_fold_to_run[0]
    location_source = args.location_source
    if location_source:
        location_source = location_source[0]
        logger.debug('Using %s as the source of ground truth location' %
                     location_source)
        settings['location_source'] = location_source

    print "running fold %s" % (specific_fold_to_run)

    # Load the folds to be used in the dataset
    cfv_fh = open(os.path.join(args.fold_dir, 'folds.info.tsv'))

    # Each line contains two files specifying the post IDs to be held out
    # from the full dataset (for that fold) and the corresponding file in
    # the fold_dir containing the testing data for that fold
    for line in cfv_fh:
        line = line.strip()
        fold_name, testing_post_ids_file, testing_user_ids_file, testing_users_file = line.split(
            "\t")

        # Skip this fold if the user has told us to run only one fold by name
        if specific_fold_to_run is not None and fold_name != specific_fold_to_run:
            continue

        logger.debug('starting processing of fold %s' % fold_name)

        # Read in the post IDs to exclude
        testing_post_ids = set()
        tpi_fh = open(
            os.path.join(args.fold_dir,
                         testing_post_ids_file.replace('held-out-', '')))
        for id_str in tpi_fh:
            testing_post_ids.add(id_str.strip())
        tpi_fh.close()

        # Read in the user IDs to exclude
        testing_user_ids = set()
        tpi_fh = open(
            os.path.join(args.fold_dir,
                         testing_user_ids_file.replace('held-out-', '')))
        for id_str in tpi_fh:
            testing_user_ids.add(id_str.strip())
        tpi_fh.close()

        logger.debug('Loaded %d users whose location data will be held out' %
                     len(testing_user_ids))

        # load the dataset
        training_data = None
        if location_source is not None:
            training_data = SparseDataset(
                args.dataset_dir,
                excluded_users=testing_user_ids,
                default_location_source=location_source)
        else:
            training_data = SparseDataset(args.dataset_dir,
                                          excluded_users=testing_user_ids)

        # load the method
        method = get_method_by_name(args.method_name)
        method_inst = method()

        # Create the temporary directory that will hold the model for
        # this fold
        model_dir = os.path.join(args.results_dir, fold_name)
        if not os.path.exists(model_dir):
            os.mkdir(model_dir)

        # Train on the dataset, holding out the testing post IDs
        model = method_inst.train_model(settings, training_data, None)

        logger.debug('Finished training during fold %s; beginning testing' %
                     fold_name)

        logger.debug("Reading testing data from %s" %
                     (os.path.join(args.fold_dir, testing_users_file)))

        testing_data = Dataset(args.fold_dir,
                               users_file=os.path.join(args.fold_dir,
                                                       testing_users_file))

        logger.debug(
            "Writing results to %s" %
            (os.path.join(args.results_dir, fold_name + ".results.tsv.gz")))

        out_fh = gzip.open(
            os.path.join(args.results_dir, fold_name + ".results.tsv.gz"), 'w')

        num_tested_users = 0
        num_tested_posts = 0
        num_located_posts = 0
        seen_ids = set()
        for user in testing_data.user_iter():
            user_id = user['user_id']
            posts = user['posts']

            locs = model.infer_posts_locations_by_user(user_id, posts)

            if len(locs) != len(posts):
                print "#WUT %d != %d" % (len(locs), len(posts))

            num_tested_posts += len(posts)
            for loc, post in zip(locs, posts):
                pid = post['id']
                if pid in seen_ids:
                    continue
                seen_ids.add(pid)
                if loc is not None:
                    out_fh.write('%s\t%f\t%f\n' % (post['id'], loc[0], loc[1]))
                    num_located_posts += 1
            num_tested_users += 1
            if num_tested_users % 10000 == 0:
                logger.debug(
                    'During testing of fold %s, processed %d users, %d posts, %d located'
                    % (fold_name, num_tested_users, num_tested_posts,
                       num_located_posts))

        out_fh.close()
        logger.debug('Finished testing of fold %s' % fold_name)
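
As with the other commands, `cross_validate` takes a raw argument list; a hypothetical invocation that runs a single named fold, with placeholder method name and paths:

# '--fold fold_0' restricts the run to that fold from folds.info.tsv
cross_validate(['--fold', 'fold_0', 'my_method', 'settings.json',
                'path/to/dataset', 'path/to/folds', 'path/to/results'])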
Code Example #10
File: app.py Project: ConnorMcMahon/geoinference
def create_folds(args): 
    parser = argparse.ArgumentParser(prog='geoinf create_folds', description='creates a set of data partitions for evaluating with cross-fold validation')
    parser.add_argument('-f', '--force', help='overwrite the output model directory if it already exists')
    parser.add_argument('dataset_dir', help='a directory containing a geoinference dataset')
    parser.add_argument('fold_dir', help='a (non-existent) directory that will contain the information on the cross-validation folds')
    parser.add_argument('test_case', help="What type of test wanted to run i.e. rural vs urban (county), gender (gender), or random (any other string)")

    args = parser.parse_args(args)

    # Create the output fold directory if it doesn't already exist
    if not os.path.exists(args.fold_dir):
        os.mkdir(args.fold_dir)

    ground_truth_file = "filtered_user_groundtruth_locfield.tsv"
    ground_truth_locs = "users.home-locations.loc-field.tsv.gz"

    # Decide on the number of folds based on the requested test case
    if args.test_case == "gender":
        num_folds = NUM_MALE_FOLDS + NUM_FEMALE_FOLDS + NUM_UNKNOWN_FOLDS
    elif args.test_case == "county":
        num_folds = NUM_URBAN_FOLDS * 6
    else:
        num_folds = NUM_RANDOM_FOLDS

    if num_folds <= 1:
        print("the number of folds must be at least two")

    idToGender = {}
    idToUrbanLevel = {}
    with open(os.path.join(args.dataset_dir, ground_truth_file), "r") as gt_file:
        next(gt_file)  # skip the header line
        for line in gt_file:
            try:
                uid, gender, urbanLevel = line.split('\t')
                idToGender[uid] = gender
                if urbanLevel != "\r\n":
                    idToUrbanLevel[uid] = int(urbanLevel)
            except ValueError:
                print(line)

    idToLoc = {}
    with gzip.open(os.path.join(args.dataset_dir, ground_truth_locs), "r") as gt_file:
        next(gt_file)  # skip the header line
        for line in gt_file:
            uid, lat, lon = line.split('\t')
            idToLoc[uid] = (lat, lon)

    # Initialize the output streams.  Rather than keeping things in memory,
    # we batch the gold standard posts by users (one at a time) and then
    # stream the user's gold standard posts (if any) to the output streams
    output_user_ids_file_handles = []
    cf_info_fh = open(os.path.join(args.fold_dir, "folds.info.tsv"), 'w')

    for i in range(0, num_folds):
        fold_name = "fold_%d" % i

        # All the IDs of the users with gold posts are written here
        fold_users_ids_fh = open(os.path.join(args.fold_dir, fold_name + ".user-ids.txt"), 'w')
        output_user_ids_file_handles.append(fold_users_ids_fh)

        cf_info_fh.write("%s\t%s.user-ids.txt\n" % (fold_name, fold_name))
    cf_info_fh.close()

    # Load the dataset
    ds = SparseDataset(args.dataset_dir)

    if args.test_case == "gender":
        female_users = []
        male_users = []
        unknown_users = []

        for user in ds.user_iter():
            user_id = user['user_id']
            usergender = idToGender.get(str(user_id), -1)
            # Bucket each user with a known gender entry
            if usergender != -1:
                if usergender == "m":
                    male_users.append(user_id)
                elif usergender == "f":
                    female_users.append(user_id)
                else:
                    unknown_users.append(user_id)
        currentFold = 0

        male_folds = generate_folds(male_users, NUM_MALE_FOLDS)
        for fold in male_folds:
            write_fold(fold, currentFold, idToLoc, output_user_ids_file_handles)
            currentFold += 1
        
        female_folds = generate_folds(female_users, NUM_FEMALE_FOLDS)
        for fold in female_folds:
            write_fold(fold, currentFold, idToLoc, output_user_ids_file_handles)
            currentFold += 1
        
        unknown_folds = generate_folds(unknown_users, NUM_UNKNOWN_FOLDS)
        for fold in unknown_folds:
            write_fold(fold, currentFold, idToLoc, output_user_ids_file_handles)
            currentFold += 1
    elif args.test_case == "county":
        usersAtLevel = []
        for i in range(1, 7):
            usersAtLevel[i] = []
        for user in ds.user_iter():
            user_id = user['user_id']
            urbanRuralLevel = idToUrbanLevel.get(str(user,id) -1)
            # If this user had any gold locations, add them as folds
            if urbanRuralLevel != -1:
                usersAtLevel[urbanRuralLevel].append(user_id)
        currentFoldIndex = 0
        for i in range(1,7):        
            currentFolds = generate_folds(usersAtLevel[i], NUM_URBAN_FOLDS)
            for fold in currentFolds:
                write_fold(fold, currentFoldIndex, idToLoc, output_users_ids_file_handles)
                currentFoldIndex += 1
    else:
        # Collect every user with ground-truth data; the folds are random
        # splits over these users
        gold_users = []
        for user in ds.user_iter():
            user_id = user['user_id']
            gender = idToGender.get(str(user_id), -1)
            if gender != -1:
                gold_users.append(user_id)

        currentFoldIndex = 0
        currentFolds = generate_folds(gold_users, NUM_RANDOM_FOLDS)
        for fold in currentFolds:
            write_fold(fold, currentFoldIndex, idToLoc, output_user_ids_file_handles)
            currentFoldIndex += 1
            
    for fh in output_user_ids_file_handles:
        fh.close()
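
`generate_folds` and `write_fold` are used above but not defined on this page. A plausible `generate_folds`, assuming it only needs to shuffle a user list and split it into n roughly equal parts:

import random

def generate_folds(users, n_folds):
    # hypothetical helper: shuffle, then deal users round-robin into folds
    users = list(users)
    random.shuffle(users)
    return [users[i::n_folds] for i in range(n_folds)]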