    # drop Nones in case the user forgot to delete '-'
    if isinstance(cfg['sample'], list):
        cfg['sample'] = [s for s in cfg['sample'] if s is not None]

    # iterate over the time windows we're interested in - from yaml
    df = pd.DataFrame()
    for train_start in splitdates:

        split_date = train_start + pd.DateOffset(
            months=cfg['train_period_months'])

        # create dataloader, get the rows (training is enough)
        dload = DataLoader(target=cfg['target'],
                           feature_list=cfg['features'],
                           restrictors=cfg['sample'],
                           split_date=str(split_date.date()),
                           train_start=str(train_start.date()),
                           test_end=None,
                           schema=cfg['schema'])

        if len(dload.target) > 1:
            raise Exception(
                "The dataloader has more than one target variable?!")

        # overwritten in each loop; doesn't matter
        target_str = dload.target[0]['train'].feature_col

        thisdf = dload.train_rows.drop(['studentid', 'collegeid'], axis=1)
        thisdf['date'] = str(train_start.date()) + ':\n' + str(
            split_date.date())
        df = pd.concat([df, thisdf])
    tmp = pd.DataFrame(columns=['AUC', 'AUC_train', 'features'])
    # rowIdx = 0

    # loop over the dates
    for train_start in splitdates:

        split_date = train_start + pd.DateOffset(
            months=cfg['train_period_months'])
        test_end = split_date + pd.DateOffset(months=cfg['test_period_months'])

        # get the data for this period
        dload = DataLoader(target=cfg['target'],
                           feature_list=allfeatures,
                           restrictors=cfg['sample'],
                           split_date=str(split_date.date()),
                           train_start=str(train_start.date()),
                           test_end=str(test_end.date()),
                           schema='common')

        # loop over models
        for model in cfg['models']:

            if isinstance(model, str):
                model = {model: {}}

            if len(model) > 1:
                raise IOError("A model is not specified correctly.")

            modelname, paramdict = next(iter(model.items()))
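
For reference, a minimal sketch of the normalization above; the model names and parameters are illustrative, not taken from the repo's configs:

# Illustrative only: a YAML 'models' list may mix bare strings and one-key mappings.
models_from_yaml = ['LogisticRegression',
                    {'RandomForestClassifier': {'n_estimators': 100}}]

for model in models_from_yaml:
    if isinstance(model, str):
        model = {model: {}}  # a bare string means: use default parameters
    modelname, paramdict = next(iter(model.items()))
    print(modelname, paramdict)
# -> LogisticRegression {}
# -> RandomForestClassifier {'n_estimators': 100}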
Example #3
def get_df(configFile='code/visualizations/configs/util.yaml'):
    '''
    @description: A workaround if you want to pull data into a dataframe, for
                  example to run visualizations or ad-hoc analyses.
    @param configFile: Path to a YAML config file that specifies which data to
                       pull. See 'code/visualizations/configs/util.yaml' for an
                       example.
    @return: df, feature_to_col: df is a dataframe with the data as specified
                                 in the YAML file. Columns have been
                                 postprocessed, as specified in each feature.
                                 feature_to_col is a dictionary that maps
                                 feature names to column names. This is useful
                                 for knowing which dummified columns belong to
                                 which feature. (Doesn't work for Pandas
                                 features, which just appear as individual
                                 feature:column pairs.)
    '''

    # get yaml for the data
    with open(os.path.join(PERSISTENCE_PATH, configFile), 'r') as f:
        cfg = yaml.safe_load(f)

    feature_to_col = {}

    # create dataloader, get the rows (only training)
    dload = DataLoader(target=cfg['target'],
                       feature_list=cfg['features'],
                       restrictors=cfg['sample'],
                       train_start=str(cfg['start_date']),
                       split_date=str(cfg['end_date']),
                       test_end=None,
                       schema=cfg['schema'])

    if len(dload.target) > 1:
        raise Exception("The dataloader has more than one target variable?!")

    target_str = dload.target[0]['train'].feature_col
    thisdf = dload.train_rows.copy()

    # now apply the postprocessors
    pps = dload.get_postprocessors(cfg['features'])

    # apply the postprocessors, remember if the feature maps to several columns
    # TODO: this probably doesn't work if there are Pandas features involved...
    for key in pps.keys():

        # remember the columns we have right now
        old_cols = thisdf.columns.tolist()

        for postprocessor, kwargs in pps[key].items():
            thisdf = postprocessor(thisdf, columns=[key], **kwargs)

        # the postprocessors might have added columns; remember them
        added_cols = set(thisdf.columns.tolist()) - set(old_cols)

        # now keep the mapping from feature name to list of columns
        if added_cols:
            feature_to_col[key] = list(
                set(feature_to_col.get(key, []) + list(added_cols)))
            # always add the original feature if it still exists
            if key in thisdf.columns.tolist():
                feature_to_col[key] = list(set(feature_to_col[key] + [key]))
        else:
            feature_to_col[key] = list(
                set(feature_to_col.get(key, []) + [key]))

    # we don't need the foreign keys as training data
    thisdf = thisdf.drop(['studentid', 'collegeid'], axis=1)

    # give constant name to target column
    thisdf = thisdf.rename(columns={target_str: 'persist'})

    return thisdf, feature_to_col
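
A minimal usage sketch, assuming the default config file and the backing database are in place; the 'major' feature name is hypothetical:

df, feature_to_col = get_df('code/visualizations/configs/util.yaml')
print(df['persist'].mean())         # the target column is renamed to 'persist'
print(feature_to_col.get('major'))  # hypothetical: columns dummified from 'major'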
Example #4
	# drop Nones in case the user forgot to delete '-'
	if isinstance(cfg['sample'], list):
		cfg['sample'] = [s for s in cfg['sample'] if s is not None]


	# iterate over the time windows we're interested in - from yaml
	df = pd.DataFrame()
	for train_start in splitdates:

		split_date = train_start + pd.DateOffset(months=cfg['train_period_months'])

		# create dataloader, get the rows (training is enough)
		dload = DataLoader(target=cfg['target'], feature_list=cfg['features'],
						  restrictors=cfg['sample'],
						  split_date=str(split_date.date()),
						  train_start=str(train_start.date()),
						  test_end=None,
						  schema=cfg['schema'])

		if len(dload.target) > 1:
			raise Exception("The dataloader has more than one target variable?!")

		target_str = dload.target[0]['train'].feature_col  # overwritten in each loop, doesn't matter

		thisdf = dload.train_rows.drop(['studentid', 'collegeid'], axis=1)
		thisdf['date'] = str(train_start.date()) + ':\n' + str(split_date.date())
		df = pd.concat([df, thisdf])

	# if the time windows overlap, df may now contain duplicate rows; drop them
	df.reset_index(inplace=True)
	df.drop_duplicates(inplace=True)
	df.set_index('enrollid', inplace=True)
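
A self-contained sketch of the window arithmetic and why duplicates appear: with made-up stand-ins for splitdates and cfg['train_period_months'], 12-month windows started every 6 months overlap, so the same rows land in df more than once:

import pandas as pd

splitdates = pd.date_range('2012-01-01', periods=3, freq='6MS')  # stand-in window starts
train_period_months = 12                                         # stand-in for cfg value

for train_start in splitdates:
    split_date = train_start + pd.DateOffset(months=train_period_months)
    print(train_start.date(), '->', split_date.date())
# 2012-01-01 -> 2013-01-01
# 2012-07-01 -> 2013-07-01   (overlaps the first window, hence drop_duplicates above)
# 2013-01-01 -> 2014-01-01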
                         'PersistSevenSemesters': '=True'
                     }, 'PersistEightSemesters', '7->8')]
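
The top of the variablepairs definition is truncated above; from its visible tail and the loop below, each entry is a (restrictor-or-None, target, label) tuple. A hypothetical illustration of the shape only:

# Hypothetical shape; only the last entry is visible in the original.
variablepairs = [(None, 'PersistOneSemester', '0->1'),  # hypothetical first transition
                 # ...
                 ({'PersistSevenSemesters': '=True'}, 'PersistEightSemesters', '7->8')]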

    # iterate over the model pairs
    df = pd.DataFrame()
    feature_to_col = {}
    for pair in variablepairs:

        # get the restrictor for this semester (i.e., which semester transition we're fitting)
        thissemester = [pair[0]] if pair[0] is not None else []

        # create dataloader, get the rows (training is enough)
        dload = DataLoader(target=pair[1],
                           feature_list=cfg['features'],
                           restrictors=cfg['sample'] + thissemester,
                           train_start=str(cfg['start_date']),
                           split_date=str(cfg['end_date']),
                           test_end=None,
                           schema=cfg['schema'])

        if len(dload.target) > 1:
            raise Exception(
                "The dataloader has more than one target variable?!")

        target_str = dload.target[0]['train'].feature_col
        thisdf = dload.train_rows.copy()

        # now apply the postprocessors - assuming here they're the same
        # for every data loader!
        pps = dload.get_postprocessors(cfg['features'])
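
A hedged sketch of the postprocessor convention these loops imply: pps maps a column name to a {callable: kwargs} dict, and each callable takes (df, columns=[...], **kwargs) and returns a dataframe. The dummify helper is hypothetical, not the repo's implementation:

import pandas as pd

def dummify(df, columns, prefix_sep='='):
    # hypothetical postprocessor: one-hot encode the given columns
    return pd.get_dummies(df, columns=columns, prefix_sep=prefix_sep)

pps = {'major': {dummify: {'prefix_sep': '='}}}
df = pd.DataFrame({'major': ['math', 'bio']})
for key in pps:
    for postprocessor, kwargs in pps[key].items():
        df = postprocessor(df, columns=[key], **kwargs)
print(df.columns.tolist())  # ['major=bio', 'major=math']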