def SelectModel(self, direction):
    '''Pick the predictors of the logistic model and refit it with R's glm.

    The formula "target ~ every other column of self.data_dictionary" is
    stored as the 'formula' entry of self.logistic_params. When direction
    is truthy, R's step() is run on self.model with that formula as its
    scope and k=log(self.nobs), and both self.formula and the 'formula'
    entry are narrowed to the variables step() kept. Finally the model is
    refit with glm using self.logistic_params.

    Args:
        direction: forwarded as the 'direction' argument of R's step().
            A falsy value skips the stepwise search.

    Raises:
        ValueError: if self.target is not a key of self.data_dictionary.
    '''
    #Make a formula that includes all of the possible predictors
    names = list(self.data_dictionary.keys())
    names.remove(self.target)
    response = utils.SanitizeVariableName(self.target)
    terms = [utils.SanitizeVariableName(name) for name in names]

    #An empty right-hand side is written as "1" (intercept only); joining
    #the terms also avoids chopping the "~" off when there are no terms.
    saturated = response + "~" + ("+".join(terms) or "1")

    #Use this saturated model as the maximum scope for variable selection via the BIC
    saturated = r.Call('as.formula', obj=saturated)
    self.logistic_params['formula'] = saturated

    if direction:
        self.model = r.Call(function='step',
                            object=self.model,
                            direction=direction,
                            scope=saturated,
                            k=np.log(self.nobs)).AsList()

        #Extract the variables that were selected for the model. The first
        #coefficient name is dropped (presumably the intercept - confirm).
        coef_names = r.Call(function='names',
                            x=self.model['coefficients'].AsList()).AsVector()
        selected = [str(v) for v in coef_names][1:]

        #Make a formula that includes just the selected predictors, and add
        #it to the list of model-building parameters
        reduced = response + "~" + ("+".join(selected) or "1")
        self.formula = r.Call('as.formula', obj=reduced)
        self.logistic_params['formula'] = self.formula

    #NOTE(review): the original indentation was lost; the refit is assumed
    #to run whether or not a direction was given - confirm.
    self.model = r.Call(function='glm', **self.logistic_params).AsList()
def Deserialize(self, model_struct):
    '''Rebuild a censlars model from a serialized model_struct dictionary.

    The training data and fitting options stored in model_struct are copied
    onto the object, the model is refit in R with 'censlars', and the
    stored thresholds are restored instead of being recomputed.

    Args:
        model_struct: dictionary holding the keys 'data_dictionary',
            'target', 'specificity', 'left', 'right', 'adapt',
            'overshrink', 'threshold' and 'regulatory_threshold'.

    Raises:
        KeyError: if model_struct lacks one of those keys.
    '''
    #Unpack the model_struct dictionary
    self.data_dictionary = model_struct['data_dictionary']
    for field in ('target', 'specificity', 'left', 'right', 'adapt',
                  'overshrink'):
        setattr(self, field, model_struct[field])

    #Get the data into R, then keep a private deep copy of it
    self.data_frame = utils.DictionaryToR(self.data_dictionary)
    self.data_dictionary = copy.deepcopy(self.data_dictionary)
    self.predictors = len(self.data_dictionary) - 1

    #Refit the model in R, regressing the target on every other column
    response = utils.SanitizeVariableName(self.target)
    self.formula = r.Call('as.formula', obj=response + '~.')
    self.pls_params = {
        'formula': self.formula,
        'data': self.data_frame,
        'left': self.left,
        'right': self.right,
        'adapt': self.adapt,
        'overshrink': self.overshrink,
    }
    self.model = r.Call(function='censlars', **self.pls_params).AsList()

    #Get some information out of the model
    self.GetActual()
    self.GetFitted()

    #Restore the stored decision threshold
    self.specificity = model_struct['specificity']
    self.threshold = model_struct['threshold']
    self.regulatory_threshold = model_struct['regulatory_threshold']
def Create(self, **args):
    '''Fit a new model in R with 'adalars' and set its decision threshold.

    Args:
        **args: 'data' (dictionary of columns) and 'target' (name of the
            response column) are required. Optional keys and defaults:
            'regulatory_threshold' (2.3711), 'adapt' (False),
            'overshrink' (False), 'precondition' (False),
            'selectvars' (False) and 'specificity' (0.9).

    Raises:
        KeyError: if 'data' or 'target' is missing.
    '''
    #The regulatory threshold doubles as the initial decision threshold
    self.threshold = args.get('regulatory_threshold', 2.3711)
    self.regulatory_threshold = self.threshold
    self.target = args['target']

    #Fitting options, each defaulting to False
    self.adapt = args.get('adapt', False)
    self.overshrink = args.get('overshrink', False)
    self.precondition = args.get('precondition', False)
    self.selectvars = args.get('selectvars', False)
    specificity = args.get('specificity', 0.9)

    #Get the data into R and keep a shallow copy of the dictionary
    data = args['data']
    self.data_frame = utils.DictionaryToR(data)
    self.data_dictionary = copy.copy(data)
    self.predictors = len(self.data_dictionary) - 1

    #Fit the model in R, regressing the target on every other column
    response = utils.SanitizeVariableName(self.target)
    self.formula = r.Call('as.formula', obj=response + '~.')
    self.pls_params = {
        'formula': self.formula,
        'data': self.data_frame,
        'adapt': self.adapt,
        'overshrink': self.overshrink,
        'precondition': self.precondition,
        'selectvars': self.selectvars,
    }
    self.model = r.Call(function='adalars', **self.pls_params).AsList()

    #Get some information out of the model
    self.GetActual()
    self.GetFitted()
    self.vars = [str(v)
                 for v in self.model['lars'].AsList()['vars'].AsVector()]
    self.coefs = [float(v)
                  for v in self.model['lars'].AsList()['coefs'].AsVector()]

    #Establish a decision threshold
    self.Threshold(specificity)
def Create(self, **args):
    '''Fit a new model in R with 'galm' and set its decision threshold.

    Args:
        **args: 'data' (dictionary of columns) and 'target' (name of the
            response column) are required. Optional keys and defaults:
            'regulatory_threshold' (2.3711), 'population' (200),
            'generations' (100), 'mutate' (0.02), 'ZOR' (10),
            'verbose' (False) and 'specificity' (0.90).

    Raises:
        KeyError: if 'data' or 'target' is missing.
    '''
    #The regulatory threshold doubles as the initial decision threshold
    self.threshold = args.get('regulatory_threshold', 2.3711)
    self.regulatory_threshold = self.threshold
    self.target = args['target']

    #Settings that are forwarded to galm
    self.population = args.get('population', 200)
    self.generations = args.get('generations', 100)
    self.mutate = args.get('mutate', 0.02)
    self.ZOR = args.get('ZOR', 10)
    self.verbose = args.get('verbose', False)
    specificity = args.get('specificity', 0.90)

    #Get the data into R and keep a shallow copy of the dictionary
    data = args['data']
    self.data_frame = utils.DictionaryToR(data)
    self.data_dictionary = copy.copy(data)
    self.predictors = len(self.data_dictionary) - 1

    #Fit the model in R, regressing the target on every other column
    response = utils.SanitizeVariableName(self.target)
    self.formula = r.Call('as.formula', obj=response + '~.')
    self.pls_params = {
        'formula': self.formula,
        'data': self.data_frame,
        'population': self.population,
        'generations': self.generations,
        'mutateRate': self.mutate,
        'zeroOneRatio': self.ZOR,
        'verbose': self.verbose,
    }
    self.model = r.Call(function='galm', **self.pls_params).AsList()

    #Get some information out of the model
    self.GetActual()
    self.GetFitted()
    self.vars = [str(v) for v in self.model['vars'].AsVector()]

    #Establish a decision threshold
    self.Threshold(specificity)
def Create(self, **args):
    '''Fit a new PLS regression in R with 'plsr' and set its threshold.

    When the data holds more than two columns the model is fit with
    leave-one-out validation and self.CrossValidation(**args) is called
    to choose the number of components; otherwise validation is turned
    off and a single component is used.

    Args:
        **args: 'data' (dictionary of columns) and 'target' (name of the
            response column) are required. Optional keys and defaults:
            'threshold' (2.3711) and 'specificity' (0.9). All of args is
            also forwarded to self.CrossValidation.

    Raises:
        KeyError: if 'data' or 'target' is missing.
        ValueError: if the target is not a key of the data dictionary.
    '''
    #The regulatory threshold doubles as the initial decision threshold
    self.threshold = args.get('threshold', 2.3711)
    self.regulatory_threshold = self.threshold
    specificity = args.get('specificity', 0.9)

    #Get the data into R
    self.target = args['target']
    data = self.data_dictionary = copy.copy(args['data'])
    self.data_frame = utils.DictionaryToR(data)
    self.num_predictors = len(self.data_dictionary) - 1

    #Generate a PLS model in R. Special handling for only one predictor.
    several_predictors = len(data) > 2
    response = utils.SanitizeVariableName(self.target)
    self.formula = r.Call('as.formula', obj=response + '~.')
    self.pls_params = {
        'formula': self.formula,
        'data': self.data_frame,
        'validation': 'LOO' if several_predictors else 'none',
        'x': True,
    }
    self.model = r.Call(function='plsr', **self.pls_params).AsList()

    #Get the number of columns from the validation step
    #(Might be fewer than the number of predictor variables if n<p)
    if several_predictors:
        predictions = self.model['validation'].AsList()['pred']
        dims = list(r.Call(function="dim", x=predictions).AsNumeric())
        self.ncomp_max = int(dims[2])
    else:
        self.ncomp_max = 1

    #Use cross-validation to find the best number of components
    self.GetActual()
    if several_predictors:
        self.CrossValidation(**args)
    else:
        self.ncomp = 1
    self.GetFitted()

    #Establish a decision threshold
    self.Threshold(specificity)

    #Record the predictor names (every column except the target)
    self.vars = [str(name) for name in data.keys()]
    self.vars.remove(self.target)
def Create(self, **args):
    '''Fit a new model in R with 'censlars' and set its decision threshold.

    Args:
        **args: 'data' (dictionary of columns) and 'target' (name of the
            response column) are required. Optional keys and defaults:
            'regulatory_threshold' (2.3711), 'left' (-inf), 'right' (inf),
            'adapt' (False), 'overshrink' (False) and 'specificity' (0.9).

    Raises:
        KeyError: if 'data' or 'target' is missing.
    '''
    #The regulatory threshold doubles as the initial decision threshold
    self.threshold = args.get('regulatory_threshold', 2.3711)
    self.regulatory_threshold = self.threshold
    self.target = args['target']

    #Settings that are forwarded to censlars
    self.left = args.get('left', -np.inf)
    self.right = args.get('right', np.inf)
    self.adapt = args.get('adapt', False)
    self.overshrink = args.get('overshrink', False)
    specificity = args.get('specificity', 0.9)

    #Get the data into R and keep a deep copy of the dictionary
    data = args['data']
    self.data_frame = utils.DictionaryToR(data)
    self.data_dictionary = copy.deepcopy(data)
    self.predictors = len(self.data_dictionary) - 1

    #Fit the model in R, regressing the target on every other column
    response = utils.SanitizeVariableName(self.target)
    self.formula = r.Call('as.formula', obj=response + '~.')
    self.pls_params = {
        'formula': self.formula,
        'data': self.data_frame,
        'left': self.left,
        'right': self.right,
        'adapt': self.adapt,
        'overshrink': self.overshrink,
    }
    self.model = r.Call(function='censlars', **self.pls_params).AsList()

    #Get some information out of the model
    self.GetActual()
    self.GetFitted()

    #Establish a decision threshold
    self.Threshold(specificity)
def Deserialize(self, model_struct):
    '''Rebuild an adalars model from a serialized model_struct dictionary.

    The training data and fitting options stored in model_struct are copied
    onto the object, the model is refit in R with 'adalars', and the
    stored thresholds are restored instead of being recomputed.

    Args:
        model_struct: dictionary holding the keys 'data_dictionary',
            'target', 'specificity', 'adapt', 'overshrink',
            'precondition', 'selectvars', 'threshold' and
            'regulatory_threshold'.

    Raises:
        KeyError: if model_struct lacks one of those keys.
    '''
    #Unpack the model_struct dictionary
    self.data_dictionary = model_struct['data_dictionary']
    for field in ('target', 'specificity', 'adapt', 'overshrink',
                  'precondition', 'selectvars'):
        setattr(self, field, model_struct[field])

    #Get the data into R, then keep a shallow copy of the dictionary
    self.data_frame = utils.DictionaryToR(self.data_dictionary)
    self.data_dictionary = copy.copy(self.data_dictionary)
    self.predictors = len(self.data_dictionary) - 1

    #Refit the model in R, regressing the target on every other column
    response = utils.SanitizeVariableName(self.target)
    self.formula = r.Call('as.formula', obj=response + '~.')
    self.pls_params = {
        'formula': self.formula,
        'data': self.data_frame,
        'adapt': self.adapt,
        'overshrink': self.overshrink,
        'precondition': self.precondition,
        'selectvars': self.selectvars,
    }
    self.model = r.Call(function='adalars', **self.pls_params).AsList()

    #Get some information out of the model
    self.GetActual()
    self.GetFitted()
    self.vars = [str(v)
                 for v in self.model['lars'].AsList()['vars'].AsVector()]
    self.coefs = [float(v)
                  for v in self.model['lars'].AsList()['coefs'].AsVector()]

    #Restore the stored decision threshold
    self.specificity = model_struct['specificity']
    self.threshold = model_struct['threshold']
    self.regulatory_threshold = model_struct['regulatory_threshold']
def Extract(self, model_part, **args):
    '''Return one piece of the fitted lars model.

    Args:
        model_part: 'coef' returns a dictionary mapping coefficient names
            to values, taken from R's coef() at the step stored under
            'lambda.index'; 'MSEP' and 'RMSEP' return the entries of that
            name stored under self.model['lars']; 'names' returns
            "Intercept" followed by the model's 'vars' with the sanitized
            target removed. Any other value is used as a key into the
            container.
        **args: optional 'extract_from' container for the fallback lookup;
            self.model is used when it is absent.

    Returns:
        The requested part of the model.
    '''
    if 'extract_from' in args:
        container = args['extract_from']
    else:
        container = self.model

    #use R's coef function to extract the model coefficients
    if model_part == 'coef':
        step = self.model['lars'].AsList()['lambda.index'].AsVector()[0]
        #NOTE(review): this line reaches the lars object by attribute
        #access while the line above uses ['lars'] - confirm both work.
        coefobj = r.Call(function='coef',
                         object=self.model.lars.AsList().model,
                         mode='step',
                         s=step)
        names = list(r.Call(function='names', x=coefobj).AsVector())
        coefs = list(coefobj.AsVector())
        part = dict(zip(names, coefs))

    #the stored estimate of the variance
    elif model_part == 'MSEP':
        part = self.model['lars']['MSEP']

    #the stored estimate of the standard error
    elif model_part == 'RMSEP':
        part = self.model['lars']['RMSEP']

    #Get the variable names, ordered as R sees them.
    elif model_part == 'names':
        part = ["Intercept"]
        part.extend(self.model['lars']['vars'])
        try:
            part.remove(utils.SanitizeVariableName(self.target))
        except ValueError:
            #The target is not among the names, so nothing to drop
            pass

    #otherwise, go to the data structure itself
    else:
        part = container[model_part]

    return part
def Extract(self, model_part, **args):
    '''Return one piece of the fitted model.

    Args:
        model_part: 'coef' returns the list of coefficients given by R's
            coef() for self.ncomp components, intercept included; 'MSEP'
            returns the mean squared difference between self.fitted and
            self.actual; 'RMSEP' returns its square root; 'names' returns
            "Intercept" followed by the data frame's column names with
            the sanitized target removed. Any other value is used as a
            key into the container.
        **args: optional 'extract_from' container for the fallback lookup;
            self.model is used when it is absent.

    Returns:
        The requested part of the model.

    Raises:
        ZeroDivisionError: for 'MSEP'/'RMSEP' when self.fitted is empty.
    '''
    if 'extract_from' in args:
        container = args['extract_from']
    else:
        container = self.model

    #use R's coef function to extract the model coefficients
    if model_part == 'coef':
        part = list(r.Call(function='coef',
                           object=self.model,
                           ncomp=self.ncomp,
                           intercept=True).AsVector())

    #mean squared error of the fitted values, and its square root
    elif model_part in ('MSEP', 'RMSEP'):
        count = len(self.fitted)
        squared_errors = [(self.fitted[i] - self.actual[i])**2
                          for i in range(count)]
        part = sum(squared_errors) / count
        if model_part == 'RMSEP':
            part = part**0.5

    #Get the variable names, ordered as R sees them.
    elif model_part == 'names':
        part = ["Intercept"]
        part.extend(self.data_frame.ColumnNames)
        try:
            part.remove(utils.SanitizeVariableName(self.target))
        except ValueError:
            #The target is not among the columns, so nothing to drop
            pass

    #otherwise, go to the data structure itself
    else:
        part = container[model_part]

    return part
def Deserialize(self, model_struct):
    '''Rebuild a galm model from a serialized model_struct dictionary.

    The training data and settings stored in model_struct are copied onto
    the object, the model is refit in R with 'galm', and the stored
    thresholds are restored instead of being recomputed.

    Args:
        model_struct: dictionary holding the keys 'data_dictionary',
            'target', 'specificity', 'population', 'generations',
            'mutate', 'ZOR', 'verbose', 'threshold' and
            'regulatory_threshold'.

    Raises:
        KeyError: if model_struct lacks one of those keys.
    '''
    #Unpack the model_struct dictionary
    self.data_dictionary = model_struct['data_dictionary']
    self.target = model_struct['target']
    self.specificity = model_struct['specificity']
    self.population = model_struct['population']
    self.generations = model_struct['generations']
    self.mutate = model_struct['mutate']
    self.ZOR = model_struct['ZOR']
    self.verbose = model_struct['verbose']

    #Get the data into R, then keep a shallow copy of the dictionary
    self.data_frame = utils.DictionaryToR(self.data_dictionary)
    self.data_dictionary = copy.copy(self.data_dictionary)
    self.predictors = len(self.data_dictionary) - 1

    #Refit the model in R, regressing the target on every other column
    response = utils.SanitizeVariableName(self.target)
    self.formula = r.Call('as.formula', obj=response + '~.')
    self.pls_params = {
        'formula': self.formula,
        'data': self.data_frame,
        'population': self.population,
        'generations': self.generations,
        'mutateRate': self.mutate,
        'zeroOneRatio': self.ZOR,
        'verbose': self.verbose,
    }
    self.model = r.Call(function='galm', **self.pls_params).AsList()

    #Get some information out of the model. The variable names are read
    #here as well so a deserialized object has self.vars.
    #NOTE(review): assumes the galm Create in this file sets self.vars
    #the same way - confirm both belong to one class.
    self.GetActual()
    self.GetFitted()
    self.vars = [str(v) for v in self.model['vars'].AsVector()]

    #Restore the stored decision threshold
    self.specificity = model_struct['specificity']
    self.threshold = model_struct['threshold']
    self.regulatory_threshold = model_struct['regulatory_threshold']
def Create(self, **args):
    '''Fit a new logistic model in R with 'galogistic'.

    The target column of the object's copy of the data is replaced by the
    result of self.AssignLabels before fitting; observation weights come
    from self.AssignWeights.

    Args:
        **args: 'data' (dictionary of columns) and 'target' (name of the
            response column) are required. Optional keys and defaults:
            'regulatory_threshold' (2.3711), 'specificity' (0.9),
            'population' (200), 'generations' (100), 'mutate' (0.02),
            'ZOR' (10) and 'weights'. A 'weights' value whose text starts
            with 'd' or 'i' selects AssignWeights method 1, 'c' or 'f'
            selects method 2; anything else, including a missing or empty
            value, selects method 0.

    Raises:
        KeyError: if 'data' or 'target' is missing.
    '''
    #Regulatory threshold and target specificity, with their defaults
    self.regulatory_threshold = args.get('regulatory_threshold', 2.3711)
    self.specificity = args.get('specificity', 0.9)

    #Get the data into R
    data = args['data']
    self.target = target = args['target']
    self.nobs = len(data[self.target])
    self.data_frame = utils.DictionaryToR(data)
    self.data_dictionary = copy.copy(data)
    self.predictors = len(self.data_dictionary) - 1

    #Settings that are forwarded to galogistic
    self.population = args.get('population', 200)
    self.generations = args.get('generations', 100)
    self.mutate = args.get('mutate', 0.02)
    self.ZOR = args.get('ZOR', 10)

    #Pick the weighting method from the first letter of 'weights'.
    #Slicing with [:1] keeps an empty value from raising IndexError.
    initial = str(args.get('weights', '')).lower()[:1]
    if initial in ('d', 'i'):
        method = 1  #integer (discrete) weighting
    elif initial in ('c', 'f'):
        method = 2  #float (continuous) weighting
    else:
        method = 0
    self.weights = self.AssignWeights(method=method)

    #Label the exceedances in the training set.
    self.data_dictionary[target] = self.AssignLabels(
        self.data_dictionary[target])

    #Get the labelled data into R
    self.data_frame = utils.DictionaryToR(self.data_dictionary)

    #Generate a logistic regression model in R.
    response = utils.SanitizeVariableName(self.target)
    self.formula = formula = r.Call('as.formula', obj=response + '~ .')
    self.logistic_params = {
        'formula': formula,
        'family': 'binomial',
        'data': self.data_frame,
        'weights': self.weights,
        'population': self.population,
        'generations': self.generations,
        'mutateRate': self.mutate,
        'zeroOneRatio': self.ZOR,
        'verbose': True,
    }
    self.model = r.Call(function='galogistic',
                        **self.logistic_params).AsList()

    #Select model components and a decision threshold
    self.GetActual()
    self.GetFitted()
    self.Threshold(self.specificity)
    self.vars = [str(v) for v in self.model['vars'].AsVector()]
def Deserialize(self, model_struct, scratchdir=""):
    '''Rebuild a PLS model from a serialized model_struct dictionary.

    The serialized R object in model_struct['modelstring'] is written to
    a temporary file, loaded into R with load()/get(), and the file is
    removed again. The model is not refit; self.pls_params is only
    rebuilt as an attribute.

    Args:
        model_struct: dictionary holding the keys 'data_dictionary',
            'target', 'specificity', 'modelstring', 'ncomp', 'threshold'
            and 'regulatory_threshold'.
        scratchdir: directory for the temporary file; when empty the file
            name is used as given, relative to the working directory.

    Raises:
        KeyError: if model_struct lacks one of those keys.
        ValueError: if the target is not a key of the data dictionary.
    '''
    #Unpack the model_struct dictionary
    self.data_dictionary = model_struct['data_dictionary']
    self.target = model_struct['target']
    self.specificity = model_struct['specificity']

    #Get the data into R
    self.data_frame = utils.DictionaryToR(self.data_dictionary)
    self.data_dictionary = copy.copy(self.data_dictionary)
    self.num_predictors = len(self.data_dictionary) - 1

    #First, save the serialized R object to disk (so it can be read from
    #within R). ascii_letters keeps the random name locale independent.
    suffix = "".join(random.choice(string.ascii_letters) for _ in range(10))
    robject_file = "pls" + suffix + ".robj"
    if scratchdir:
        robject_file = os.path.join(scratchdir, robject_file)
        #Backslashes are doubled, presumably for R's benefit - confirm
        robject_file = robject_file.replace("\\", "\\\\")

    try:
        with open(robject_file, "wb") as handle:
            handle.write(model_struct["modelstring"])

        #Read the serialized model object into R
        objects = r.Call(function='load', file=robject_file).AsVector()
        self.model = r.Call(function="get", x=str(objects[0])).AsList()
    finally:
        #Never leave the scratch file behind, even if loading failed
        if os.path.exists(robject_file):
            os.remove(robject_file)

    #Rebuild the PLS parameters. Special handling for only one predictor.
    several_predictors = len(self.data_dictionary) > 2
    response = utils.SanitizeVariableName(self.target)
    self.formula = r.Call('as.formula', obj=response + '~.')
    self.pls_params = {
        'formula': self.formula,
        'data': self.data_frame,
        'validation': 'LOO' if several_predictors else 'none',
        'x': True,
    }

    #Get the number of columns from the validation step
    #(Might be fewer than the number of predictor variables if n<p)
    if several_predictors:
        predictions = self.model['validation'].AsList()['pred']
        dims = list(r.Call(function="dim", x=predictions).AsNumeric())
        self.ncomp_max = int(dims[2])
    else:
        self.ncomp_max = 1

    #The number of components is restored rather than cross-validated
    self.GetActual()
    self.ncomp = model_struct['ncomp']
    self.GetFitted()

    #Restore the stored decision threshold
    self.specificity = model_struct['specificity']
    self.threshold = model_struct['threshold']
    self.regulatory_threshold = model_struct['regulatory_threshold']

    #Record the predictor names (every column except the target)
    self.vars = [str(name) for name in self.data_dictionary.keys()]
    self.vars.remove(self.target)
def Create(self, **args):
    '''Fit a new logistic model in R with glm and stepwise selection.

    An intercept-only glm is fit first; self.SelectModel then chooses the
    predictors using the requested step direction. The target column of
    the object's copy of the data is replaced by the result of
    self.AssignLabels before fitting.

    Args:
        **args: 'data' (dictionary of columns) and 'target' (name of the
            response column) are required. Optional keys and defaults:
            'regulatory_threshold' (2.3711), 'specificity' (0.9),
            'stepdirection' ('') and 'weights'. A 'weights' value whose
            text starts with 'd' or 'i' selects AssignWeights method 1,
            'c' or 'f' selects method 2; anything else, including a
            missing or empty value, selects method 0.

    Raises:
        KeyError: if 'data' or 'target' is missing.
    '''
    #Regulatory threshold and target specificity, with their defaults
    self.regulatory_threshold = args.get('regulatory_threshold', 2.3711)
    self.specificity = args.get('specificity', 0.9)

    #Set the direction for stepwise variable selection
    self.stepdirection = args.get('stepdirection', '')

    #Get the data into R
    data = args['data']
    self.target = target = args['target']
    self.nobs = len(data[self.target])
    self.data_frame = utils.DictionaryToR(data)
    self.data_dictionary = copy.deepcopy(data)
    self.predictors = len(self.data_dictionary) - 1

    #Pick the weighting method from the first letter of 'weights'.
    #Slicing with [:1] keeps an empty value from raising IndexError.
    initial = str(args.get('weights', '')).lower()[:1]
    if initial in ('d', 'i'):
        method = 1  #integer (discrete) weighting
    elif initial in ('c', 'f'):
        method = 2  #float (continuous) weighting
    else:
        method = 0
    self.weights = self.AssignWeights(method=method)

    #Label the exceedances in the training set.
    self.data_dictionary[target] = self.AssignLabels(
        self.data_dictionary[target])

    #Get the labelled data into R
    self.data_frame = utils.DictionaryToR(self.data_dictionary)

    #Generate a logistic regression model in R, starting intercept-only
    response = utils.SanitizeVariableName(self.target)
    interceptonly = r.Call('as.formula', obj=response + '~ 1')
    self.logistic_params = {
        'formula': interceptonly,
        'family': 'binomial',
        'data': self.data_frame,
        'weights': self.weights,
        'x': True,
    }
    self.model = r.Call(function='glm', **self.logistic_params).AsList()

    #Select model components and a decision threshold
    self.SelectModel(direction=self.stepdirection)
    self.GetActual()
    self.GetFitted()
    self.Threshold(self.specificity)
def Create(self, **args):
    '''Fit a new logistic model in R with 'adalasso'.

    The target column of the object's copy of the data is replaced by the
    result of self.AssignLabels before fitting; observation weights come
    from self.AssignWeights.

    Args:
        **args: 'data' (dictionary of columns) and 'target' (name of the
            response column) are required. Optional keys and defaults:
            'regulatory_threshold' (2.3711), 'specificity' (0.9),
            'adapt' (False), 'selectvars' (False), 'overshrink' (False)
            and 'weights'. A 'weights' value whose text starts with 'd'
            or 'i' selects AssignWeights method 1, 'c' or 'f' selects
            method 2; anything else, including a missing or empty value,
            selects method 0.

    Raises:
        KeyError: if 'data' or 'target' is missing.
    '''
    #Regulatory threshold and target specificity, with their defaults
    self.regulatory_threshold = args.get('regulatory_threshold', 2.3711)
    self.specificity = args.get('specificity', 0.9)

    #Fitting options, each defaulting to False
    self.adapt = args.get('adapt', False)
    self.selectvars = args.get('selectvars', False)
    self.overshrink = args.get('overshrink', False)

    #Get the data into R
    data = args['data']
    self.target = target = args['target']
    self.nobs = len(data[self.target])
    self.data_frame = utils.DictionaryToR(data)
    self.data_dictionary = copy.copy(data)
    self.predictors = len(self.data_dictionary) - 1

    #Pick the weighting method from the first letter of 'weights'.
    #Slicing with [:1] keeps an empty value from raising IndexError.
    initial = str(args.get('weights', '')).lower()[:1]
    if initial in ('d', 'i'):
        method = 1  #integer (discrete) weighting
    elif initial in ('c', 'f'):
        method = 2  #float (continuous) weighting
    else:
        method = 0
    self.weights = self.AssignWeights(method=method)

    #Label the exceedances in the training set.
    self.data_dictionary[target] = self.AssignLabels(
        self.data_dictionary[target])

    #Get the labelled data into R
    self.data_frame = utils.DictionaryToR(self.data_dictionary)

    #Generate a logistic regression model in R.
    response = utils.SanitizeVariableName(self.target)
    self.formula = formula = r.Call('as.formula', obj=response + '~ .')
    self.logistic_params = {
        'formula': formula,
        'family': 'binomial',
        'data': self.data_frame,
        'weights': self.weights,
        'verbose': True,
        'adapt': self.adapt,
        'overshrink': self.overshrink,
        'selectvars': self.selectvars,
    }
    self.model = r.Call(function='adalasso',
                        **self.logistic_params).AsList()

    #Select model components and a decision threshold
    self.GetActual()
    self.GetFitted()
    self.Threshold(self.specificity)
    self.vars = [str(v)
                 for v in self.model['lasso'].AsList()['vars'].AsVector()]
def Create(self, **args):
    '''Create a new gam model object.

    Every column other than the target (and the julian column, if one is
    named) becomes a smooth term s(name, k=...) of the formula passed to
    R's gam. The basis size of each term is the smaller of self.k and the
    number of distinct values in that column minus one.

    Args:
        **args: 'data' (dictionary of columns) and 'target' (name of the
            response column) are required. Optional keys and defaults:
            'threshold' (regulatory threshold, 2.3711), 'julian' (name of
            the column used as the 'by' variable of every smooth, ""),
            'k' (maximum number of basis functions, 100), 'lambda'
            (penalty parameter, 1.4) and 'specificity' (0.9).

    Raises:
        KeyError: if 'data' or 'target' is missing.
        ValueError: if 'julian' names a column that is not in the data.
        ZeroDivisionError: if no predictor columns remain.
    '''
    #Regulatory threshold; the decision threshold starts at zero
    self.regulatory_threshold = args.get('threshold', 2.3711)
    self.threshold = 0

    #Optional settings with their defaults
    self.julian = args.get('julian', "")
    self.k = args.get('k', 100)
    self.penalty = args.get('lambda', 1.4)
    specificity = args.get('specificity', 0.9)

    #Store some object data
    self.data_dictionary = copy.deepcopy(args['data'])
    self.target = args['target']

    #Get the data into R
    self.data_frame = utils.DictionaryToR(self.data_dictionary)

    #Count the rows and, per column, the distinct values minus one.
    #Lists are built explicitly so this also runs where keys() and
    #values() return views.
    columns = list(self.data_dictionary.values())
    rows = len(columns[0])
    unique_values = [np.unique(column).shape[0] - 1 for column in columns]
    self.predictors = predictors = list(self.data_dictionary.keys())

    #Drop the target (when present) from the predictors and their counts
    try:
        indx = predictors.index(self.target)
    except ValueError:
        pass
    else:
        del unique_values[indx]
        del predictors[indx]

    #Drop the julian column as well; it only enters through 'by='
    if self.julian:
        indx = predictors.index(self.julian)
        del unique_values[indx]
        del predictors[indx]

    #Cap the basis size by the number of rows per predictor
    self.k = np.min([self.k, np.floor(rows / len(predictors))])

    #Generate a gam model in R, one smooth term per predictor
    if self.julian:
        by_clause = ", by=" + utils.SanitizeVariableName(self.julian)
    else:
        by_clause = ""
    terms = ["s(" + utils.SanitizeVariableName(name) +
             ", k=" + str(np.min([self.k, distinct])) + by_clause + ")"
             for name, distinct in zip(predictors, unique_values)]
    formula = utils.SanitizeVariableName(self.target) + "~" + "+".join(terms)

    self.formula = r.Call('as.formula', obj=formula)
    self.gbm_params = {
        'formula': self.formula,
        'family': 'gaussian',
        'data': self.data_frame,
        'lambda': self.penalty,
    }
    self.model = r.Call(function='gam', **self.gbm_params).AsList()

    #Get some information out of the model
    self.GetActual()
    self.GetFitted()

    #Establish a decision threshold
    self.Threshold(specificity)
def Deserialize(self, model_struct):
    '''Use the model_struct dictionary to recreate a gam model object.

    The stored data and settings are copied onto the object, the gam is
    refit in R with one smooth term s(name, k=...) per predictor column,
    and the stored thresholds are restored instead of being recomputed.

    Args:
        model_struct: dictionary holding the keys 'data_dictionary',
            'target', 'specificity', 'julian', 'k', 'penalty',
            'threshold' and 'regulatory_threshold'.

    Raises:
        KeyError: if model_struct lacks one of those keys.
        ValueError: if 'julian' names a column that is not in the data.
        ZeroDivisionError: if no predictor columns remain.
    '''
    #Unpack the model_struct dictionary
    self.data_dictionary = model_struct['data_dictionary']
    self.target = model_struct['target']
    self.specificity = model_struct['specificity']
    self.julian = model_struct['julian']
    self.k = model_struct['k']
    self.penalty = model_struct['penalty']

    #Get the data into R, then keep a private deep copy of it
    self.data_frame = utils.DictionaryToR(self.data_dictionary)
    self.data_dictionary = copy.deepcopy(self.data_dictionary)

    #Count the rows and, per column, the distinct values minus one.
    #Lists are built explicitly so this also runs where keys() and
    #values() return views.
    columns = list(self.data_dictionary.values())
    rows = len(columns[0])
    unique_values = [np.unique(column).shape[0] - 1 for column in columns]
    self.predictors = predictors = list(self.data_dictionary.keys())

    #Drop the target (when present) from the predictors and their counts
    try:
        indx = predictors.index(self.target)
    except ValueError:
        pass
    else:
        del unique_values[indx]
        del predictors[indx]

    #Drop the julian column as well; it only enters through 'by='
    if self.julian:
        indx = predictors.index(self.julian)
        del unique_values[indx]
        del predictors[indx]

    #Cap the basis size by the number of rows per predictor
    self.k = np.min([self.k, np.floor(rows / len(predictors))])

    #Generate a gam model in R, one smooth term per predictor
    if self.julian:
        by_clause = ", by=" + utils.SanitizeVariableName(self.julian)
    else:
        by_clause = ""
    terms = ["s(" + utils.SanitizeVariableName(name) +
             ", k=" + str(np.min([self.k, distinct])) + by_clause + ")"
             for name, distinct in zip(predictors, unique_values)]
    formula = utils.SanitizeVariableName(self.target) + "~" + "+".join(terms)

    self.formula = r.Call('as.formula', obj=formula)
    self.gbm_params = {
        'formula': self.formula,
        'family': 'gaussian',
        'data': self.data_frame,
        'lambda': self.penalty,
    }
    self.model = r.Call(function='gam', **self.gbm_params).AsList()

    #Get some information out of the model
    self.GetActual()
    self.GetFitted()

    #Restore the stored decision threshold
    self.threshold = model_struct['threshold']
    self.regulatory_threshold = model_struct['regulatory_threshold']