Example #1
0
    def SelectModel(self, direction):
        '''Choose the predictors of the logistic model by stepwise selection.

        A saturated formula (the target regressed on every other column of
        self.data_dictionary) is always stored in
        self.logistic_params['formula']. When direction is truthy, R's step()
        searches within that scope using k=log(self.nobs) (the BIC penalty),
        and a glm is then refit using only the selected terms. When direction
        is falsy, self.model is left as it was.

        direction: passed to R's step() as its 'direction' argument.

        Raises ValueError if self.target is not a key of self.data_dictionary.
        '''
        target_name = utils.SanitizeVariableName(self.target)

        #Make a formula that includes all of the possible predictors
        names = list(self.data_dictionary.keys())
        names.remove(self.target)
        candidates = [utils.SanitizeVariableName(name) for name in names]
        #With no predictors, fall back to the intercept so the formula keeps its "~".
        saturated = target_name + "~" + ("+".join(candidates) or "1")

        #Use this saturated model as the maximum scope for variable selection via the BIC
        allvariables = r.Call('as.formula', obj=saturated)
        self.logistic_params['formula'] = allvariables
        if not direction:
            return

        self.model = r.Call(function='step',
                            object=self.model,
                            direction=direction,
                            scope=allvariables,
                            k=np.log(self.nobs)).AsList()

        #Extract the variables that were selected (the first coefficient name is skipped)
        coefficient_names = r.Call(
            function='names',
            x=self.model['coefficients'].AsList()).AsVector()
        selected = [str(name) for name in coefficient_names][1:]

        #Make a formula with just the selected predictors and refit the model.
        #If step() kept nothing, "1" gives a valid intercept-only formula.
        formula = target_name + "~" + ("+".join(selected) or "1")
        self.formula = r.Call('as.formula', obj=formula)
        self.logistic_params['formula'] = self.formula
        self.model = r.Call(function='glm', **self.logistic_params).AsList()
Example #2
0
    def Deserialize(self, model_struct):
        '''Rebuild this model from a model_struct dictionary.

        The saved settings are copied onto the object, the data is handed to
        R, the 'censlars' fit is repeated, and finally the saved specificity
        and thresholds are restored.

        model_struct: dictionary holding 'data_dictionary', 'target',
            'specificity', 'left', 'right', 'adapt', 'overshrink',
            'threshold' and 'regulatory_threshold'.
        '''
        #Restore the saved settings
        saved_fields = ('data_dictionary', 'target', 'specificity', 'left',
                        'right', 'adapt', 'overshrink')
        for field in saved_fields:
            setattr(self, field, model_struct[field])

        #Send the data to R, then keep a deep copy of it on this object
        stored = self.data_dictionary
        self.data_frame = utils.DictionaryToR(stored)
        self.data_dictionary = copy.deepcopy(stored)
        self.predictors = len(self.data_dictionary) - 1

        #Regress the target on every other column with R's censlars.
        formula_text = utils.SanitizeVariableName(self.target) + '~.'
        self.formula = r.Call('as.formula', obj=formula_text)
        self.pls_params = dict(formula=self.formula,
                               data=self.data_frame,
                               left=self.left,
                               right=self.right,
                               adapt=self.adapt,
                               overshrink=self.overshrink)
        fit = r.Call(function='censlars', **self.pls_params)
        self.model = fit.AsList()

        #Pull the observed and fitted values out of the model.
        self.GetActual()
        self.GetFitted()

        #Restore the saved decision threshold settings
        for field in ('specificity', 'threshold', 'regulatory_threshold'):
            setattr(self, field, model_struct[field])
Example #3
0
    def Create(self, **args):
        '''Fit a model with R's 'adalars' function and set a decision threshold.

        Required keywords: 'target' (name of the response column) and 'data'
        (dictionary of columns). Optional keywords: 'regulatory_threshold'
        (default 2.3711), 'adapt', 'overshrink', 'precondition', 'selectvars'
        (each default False) and 'specificity' (default 0.9).
        '''
        #Fall back to 2.3711 when no 'regulatory_threshold' was supplied
        self.threshold = args.get('regulatory_threshold', 2.3711)
        self.regulatory_threshold = self.threshold

        self.target = args['target']

        #Fitting switches, all off unless the caller says otherwise
        for switch in ('adapt', 'overshrink', 'precondition', 'selectvars'):
            setattr(self, switch, args.get(switch, False))

        specificity = args.get('specificity', 0.9)

        #Send the data to R and keep a shallow copy on this object
        source = args['data']
        self.data_frame = utils.DictionaryToR(source)
        self.data_dictionary = copy.copy(source)
        self.predictors = len(self.data_dictionary) - 1

        #Regress the target on every other column with R's adalars.
        formula_text = utils.SanitizeVariableName(self.target) + '~.'
        self.formula = r.Call('as.formula', obj=formula_text)
        self.pls_params = dict(formula=self.formula,
                               data=self.data_frame,
                               adapt=self.adapt,
                               overshrink=self.overshrink,
                               precondition=self.precondition,
                               selectvars=self.selectvars)
        fit = r.Call(function='adalars', **self.pls_params)
        self.model = fit.AsList()

        #Pull the observed and fitted values, variable names and coefficients
        self.GetActual()
        self.GetFitted()
        lars_vars = self.model['lars'].AsList()['vars'].AsVector()
        self.vars = [str(name) for name in lars_vars]
        lars_coefs = self.model['lars'].AsList()['coefs'].AsVector()
        self.coefs = [float(value) for value in lars_coefs]

        #Establish a decision threshold
        self.Threshold(specificity)
Example #4
0
    def Create(self, **args):
        '''Fit a model with R's 'galm' function and set a decision threshold.

        Required keywords: 'target' (name of the response column) and 'data'
        (dictionary of columns). Optional keywords: 'regulatory_threshold'
        (default 2.3711), 'population' (200), 'generations' (100),
        'mutate' (0.02), 'ZOR' (10), 'verbose' (False) and
        'specificity' (0.90).
        '''
        #Fall back to 2.3711 when no 'regulatory_threshold' was supplied
        self.threshold = args.get('regulatory_threshold', 2.3711)
        self.regulatory_threshold = self.threshold

        self.target = args['target']

        #Settings for galm, each with its default
        defaults = (('population', 200),
                    ('generations', 100),
                    ('mutate', 0.02),
                    ('ZOR', 10),
                    ('verbose', False))
        for option, fallback in defaults:
            setattr(self, option, args.get(option, fallback))

        specificity = args.get('specificity', 0.90)

        #Send the data to R and keep a shallow copy on this object
        source = args['data']
        self.data_frame = utils.DictionaryToR(source)
        self.data_dictionary = copy.copy(source)
        self.predictors = len(self.data_dictionary) - 1

        #Regress the target on every other column with R's galm.
        formula_text = utils.SanitizeVariableName(self.target) + '~.'
        self.formula = r.Call('as.formula', obj=formula_text)
        self.pls_params = dict(formula=self.formula,
                               data=self.data_frame,
                               population=self.population,
                               generations=self.generations,
                               mutateRate=self.mutate,
                               zeroOneRatio=self.ZOR,
                               verbose=self.verbose)
        fit = r.Call(function='galm', **self.pls_params)
        self.model = fit.AsList()

        #Pull the observed and fitted values and the selected variable names
        self.GetActual()
        self.GetFitted()
        selected = self.model['vars'].AsVector()
        self.vars = [str(name) for name in selected]

        #Establish a decision threshold
        self.Threshold(specificity)
Example #5
0
    def Create(self, **args):
        '''Fit a model with R's 'plsr' function and set a decision threshold.

        Required keywords: 'target' (name of the response column) and 'data'
        (dictionary of columns). Optional keywords: 'threshold' (default
        2.3711) and 'specificity' (default 0.9). All keywords are forwarded to
        self.CrossValidation when the data has more than two columns.
        '''
        #Fall back to 2.3711 when no 'threshold' was supplied
        self.threshold = args.get('threshold', 2.3711)
        self.regulatory_threshold = self.threshold

        specificity = args.get('specificity', 0.9)

        #Keep a shallow copy of the data and send it to R
        self.target = args['target']
        data = copy.copy(args['data'])
        self.data_dictionary = data
        self.data_frame = utils.DictionaryToR(data)
        self.num_predictors = len(self.data_dictionary) - 1

        #Leave-one-out validation is only requested with more than two columns.
        several_columns = len(data) > 2
        formula_text = utils.SanitizeVariableName(self.target) + '~.'
        self.formula = r.Call('as.formula', obj=formula_text)
        self.pls_params = {
            'formula': self.formula,
            'data': self.data_frame,
            'validation': 'LOO' if several_columns else 'none',
            'x': True,
        }
        fit = r.Call(function='plsr', **self.pls_params)
        self.model = fit.AsList()

        #The third dimension of the validation predictions caps the component count
        #(Might be fewer than the number of predictor variables if n<p)
        if several_columns:
            validation = self.model['validation'].AsList()
            dims = r.Call(function="dim", x=validation['pred']).AsNumeric()
            self.ncomp_max = int(list(dims)[2])
        else:
            self.ncomp_max = 1

        #Pick the number of components: cross-validate, or use 1 in the small case.
        self.GetActual()
        if several_columns:
            self.CrossValidation(**args)
        else:
            self.ncomp = 1
        self.GetFitted()

        #Establish a decision threshold, then list every column except the target
        self.Threshold(specificity)
        self.vars = [str(name) for name in data]
        self.vars.remove(self.target)
Example #6
0
    def Create(self, **args):
        '''Fit a model with R's 'censlars' function and set a decision threshold.

        Required keywords: 'target' (name of the response column) and 'data'
        (dictionary of columns). Optional keywords: 'regulatory_threshold'
        (default 2.3711), 'left' (-inf), 'right' (+inf), 'adapt' (False),
        'overshrink' (False) and 'specificity' (0.9).
        '''
        #Fall back to 2.3711 when no 'regulatory_threshold' was supplied
        self.threshold = args.get('regulatory_threshold', 2.3711)
        self.regulatory_threshold = self.threshold

        self.target = args['target']

        #Settings for censlars, each with its default
        defaults = (('left', -np.inf),
                    ('right', np.inf),
                    ('adapt', False),
                    ('overshrink', False))
        for option, fallback in defaults:
            setattr(self, option, args.get(option, fallback))

        specificity = args.get('specificity', 0.9)

        #Send the data to R and keep a deep copy on this object
        source = args['data']
        self.data_frame = utils.DictionaryToR(source)
        self.data_dictionary = copy.deepcopy(source)
        self.predictors = len(self.data_dictionary) - 1

        #Regress the target on every other column with R's censlars.
        formula_text = utils.SanitizeVariableName(self.target) + '~.'
        self.formula = r.Call('as.formula', obj=formula_text)
        self.pls_params = dict(formula=self.formula,
                               data=self.data_frame,
                               left=self.left,
                               right=self.right,
                               adapt=self.adapt,
                               overshrink=self.overshrink)
        fit = r.Call(function='censlars', **self.pls_params)
        self.model = fit.AsList()

        #Pull the observed and fitted values out of the model
        self.GetActual()
        self.GetFitted()

        #Establish a decision threshold
        self.Threshold(specificity)
Example #7
0
    def Deserialize(self, model_struct):
        '''Rebuild this model from a model_struct dictionary.

        The saved settings are copied onto the object, the data is handed to
        R, the 'adalars' fit is repeated, and finally the saved specificity
        and thresholds are restored.

        model_struct: dictionary holding 'data_dictionary', 'target',
            'specificity', 'adapt', 'overshrink', 'precondition',
            'selectvars', 'threshold' and 'regulatory_threshold'.
        '''
        #Restore the saved settings
        saved_fields = ('data_dictionary', 'target', 'specificity', 'adapt',
                        'overshrink', 'precondition', 'selectvars')
        for field in saved_fields:
            setattr(self, field, model_struct[field])

        #Send the data to R, then keep a shallow copy of it on this object
        stored = self.data_dictionary
        self.data_frame = utils.DictionaryToR(stored)
        self.data_dictionary = copy.copy(stored)
        self.predictors = len(self.data_dictionary) - 1

        #Regress the target on every other column with R's adalars.
        formula_text = utils.SanitizeVariableName(self.target) + '~.'
        self.formula = r.Call('as.formula', obj=formula_text)
        self.pls_params = dict(formula=self.formula,
                               data=self.data_frame,
                               adapt=self.adapt,
                               overshrink=self.overshrink,
                               precondition=self.precondition,
                               selectvars=self.selectvars)
        fit = r.Call(function='adalars', **self.pls_params)
        self.model = fit.AsList()

        #Pull the observed and fitted values, variable names and coefficients
        self.GetActual()
        self.GetFitted()
        lars_vars = self.model['lars'].AsList()['vars'].AsVector()
        self.vars = [str(name) for name in lars_vars]
        lars_coefs = self.model['lars'].AsList()['coefs'].AsVector()
        self.coefs = [float(value) for value in lars_coefs]

        #Restore the saved decision threshold settings
        for field in ('specificity', 'threshold', 'regulatory_threshold'):
            setattr(self, field, model_struct[field])
Example #8
0
    def Extract(self, model_part, **args):
        '''Return one piece of the fitted model.

        model_part: 'coef', 'MSEP', 'RMSEP', 'names', or any other key to look
            up in the container.
        extract_from (keyword, optional): container indexed for any other
            key; defaults to self.model.

        Returns a dictionary mapping coefficient names to values for 'coef',
        the stored self.model['lars'] entry for 'MSEP' and 'RMSEP', a list
        starting with "Intercept" for 'names', and container[model_part]
        otherwise.
        '''
        if 'extract_from' in args:
            container = args['extract_from']
        else:
            container = self.model

        #use R's coef function to extract the model coefficients
        if model_part == 'coef':
            step = self.model['lars'].AsList()['lambda.index'].AsVector()[0]
            #NOTE(review): this reaches the lars model by attribute access,
            #unlike the item access used everywhere else - confirm it is intended.
            coefobj = r.Call(function='coef',
                             object=self.model.lars.AsList().model,
                             mode='step',
                             s=step)
            names = list(r.Call(function='names', x=coefobj).AsVector())
            coefs = list(coefobj.AsVector())
            part = dict(zip(names, coefs))

        #Return the MSEP value stored with the lars model.
        elif model_part == 'MSEP':
            part = self.model['lars']['MSEP']

        #Return the RMSEP value stored with the lars model.
        elif model_part == 'RMSEP':
            part = self.model['lars']['RMSEP']

        #Get the variable names, ordered as R sees them.
        elif model_part == 'names':
            part = ["Intercept"]
            part.extend(self.model['lars']['vars'])
            try:
                part.remove(utils.SanitizeVariableName(self.target))
            except ValueError:
                #The target is not among the listed variables; nothing to drop.
                pass

        #otherwise, go to the data structure itself
        else:
            part = container[model_part]

        return part
Example #9
0
    def Extract(self, model_part, **args):
        '''Return one piece of the fitted model.

        model_part: 'coef', 'MSEP', 'RMSEP', 'names', or any other key to look
            up in the container.
        extract_from (keyword, optional): container indexed for any other
            key; defaults to self.model.

        Returns the coefficient list from R's coef() for 'coef', the mean
        squared difference between self.fitted and self.actual for 'MSEP',
        its square root for 'RMSEP', a list starting with "Intercept" for
        'names', and container[model_part] otherwise.

        Raises ZeroDivisionError for 'MSEP'/'RMSEP' when self.fitted is empty.
        '''
        if 'extract_from' in args:
            container = args['extract_from']
        else:
            container = self.model

        #use R's coef function to extract the model coefficients
        if model_part == 'coef':
            part = list(
                r.Call(function='coef',
                       object=self.model,
                       ncomp=self.ncomp,
                       intercept=True).AsVector())

        #Compute the mean squared error (and its root) from the fitted and actual values.
        elif model_part in ('MSEP', 'RMSEP'):
            count = len(self.fitted)
            squared_error = sum([(self.fitted[i] - self.actual[i])**2
                                 for i in range(count)])
            #float() keeps Python 2 from truncating when the values are integers.
            mean_squared_error = squared_error / float(count)
            if model_part == 'MSEP':
                part = mean_squared_error
            else:
                part = mean_squared_error**0.5

        #Get the variable names, ordered as R sees them.
        elif model_part == 'names':
            part = ["Intercept"]
            part.extend(self.data_frame.ColumnNames)
            try:
                part.remove(utils.SanitizeVariableName(self.target))
            except ValueError:
                #The target is not among the column names; nothing to drop.
                pass

        #otherwise, go to the data structure itself
        else:
            part = container[model_part]

        return part
Example #10
0
    def Deserialize(self, model_struct):
        '''Rebuild this model from a model_struct dictionary.

        The saved settings are copied onto the object, the data is handed to
        R, the 'galm' fit is repeated, and finally the saved specificity and
        thresholds are restored.

        model_struct: dictionary holding 'data_dictionary', 'target',
            'specificity', 'population', 'generations', 'mutate', 'ZOR',
            'verbose', 'threshold' and 'regulatory_threshold'.
        '''
        #Restore the saved settings
        saved_fields = ('data_dictionary', 'target', 'specificity',
                        'population', 'generations', 'mutate', 'ZOR',
                        'verbose')
        for field in saved_fields:
            setattr(self, field, model_struct[field])

        #Send the data to R, then keep a shallow copy of it on this object
        stored = self.data_dictionary
        self.data_frame = utils.DictionaryToR(stored)
        self.data_dictionary = copy.copy(stored)
        self.predictors = len(self.data_dictionary) - 1

        #Regress the target on every other column with R's galm.
        formula_text = utils.SanitizeVariableName(self.target) + '~.'
        self.formula = r.Call('as.formula', obj=formula_text)
        self.pls_params = dict(formula=self.formula,
                               data=self.data_frame,
                               population=self.population,
                               generations=self.generations,
                               mutateRate=self.mutate,
                               zeroOneRatio=self.ZOR,
                               verbose=self.verbose)
        fit = r.Call(function='galm', **self.pls_params)
        self.model = fit.AsList()

        #Pull the observed and fitted values out of the model.
        self.GetActual()
        self.GetFitted()

        #Restore the saved decision threshold settings
        for field in ('specificity', 'threshold', 'regulatory_threshold'):
            setattr(self, field, model_struct[field])
Example #11
0
    def Create(self, **args):
        '''Fit a logistic model with R's 'galogistic' function.

        Required keywords: 'data' (dictionary of columns) and 'target' (name
        of the response column). Optional keywords: 'regulatory_threshold'
        (default 2.3711), 'specificity' (0.9), 'population' (200),
        'generations' (100), 'mutate' (0.02), 'ZOR' (10) and 'weights'.

        'weights' is converted to a string and only its first letter is
        inspected: 'd' or 'i' selects AssignWeights(method=1), 'c' or 'f'
        selects AssignWeights(method=2), and anything else (including a
        missing or empty value) selects AssignWeights(method=0).
        '''
        #Use the defaults when no threshold or specificity was supplied
        self.regulatory_threshold = args.get('regulatory_threshold', 2.3711)
        self.specificity = args.get('specificity', 0.9)

        #Get the data into R
        data = args['data']
        self.target = target = args['target']
        self.nobs = len(data[self.target])
        self.data_frame = utils.DictionaryToR(data)
        self.data_dictionary = copy.copy(data)
        self.predictors = len(self.data_dictionary) - 1

        #Settings for the galogistic fit
        self.population = args.get('population', 200)
        self.generations = args.get('generations', 100)
        self.mutate = args.get('mutate', 0.02)
        self.ZOR = args.get('ZOR', 10)

        #Pick the weighting method from the first letter of 'weights'.
        #Slicing (rather than indexing) keeps an empty string from raising.
        initial = str(args.get('weights', '')).lower()[:1]
        if initial in ('d', 'i'):
            #integer (discrete) weighting
            method = 1
        elif initial in ('c', 'f'):
            #float (continuous) weighting
            method = 2
        else:
            #no recognized method: set all weights to one
            method = 0
        self.weights = self.AssignWeights(method=method)

        #Label the exceedances in the training set.
        self.data_dictionary[target] = self.AssignLabels(
            self.data_dictionary[target])

        #Send the relabeled data to R
        self.data_frame = utils.DictionaryToR(self.data_dictionary)

        #Generate a logistic regression model in R.
        self.formula = formula = r.Call(
            'as.formula', obj=utils.SanitizeVariableName(self.target) + '~ .')
        self.logistic_params = {
            'formula': formula,
            'family': 'binomial',
            'data': self.data_frame,
            'weights': self.weights,
            'population': self.population,
            'generations': self.generations,
            'mutateRate': self.mutate,
            'zeroOneRatio': self.ZOR,
            'verbose': True,
        }
        self.model = r.Call(function='galogistic',
                            **self.logistic_params).AsList()

        #Select model components and a decision threshold
        self.GetActual()
        self.GetFitted()
        self.Threshold(self.specificity)
        self.vars = [str(v) for v in self.model['vars'].AsVector()]
Example #12
0
    def Deserialize(self, model_struct, scratchdir=""):
        '''Rebuild this model from a model_struct dictionary.

        The serialized R object in model_struct['modelstring'] is written to a
        temporary file, loaded into R, and the file is removed again even when
        loading fails. The model is not refit.

        model_struct: dictionary holding 'data_dictionary', 'target',
            'specificity', 'modelstring', 'ncomp', 'threshold' and
            'regulatory_threshold'.
        scratchdir: directory for the temporary file; when empty the bare file
            name is used.
        '''
        #Unpack the model_struct dictionary
        self.data_dictionary = model_struct['data_dictionary']
        self.target = model_struct['target']
        self.specificity = model_struct['specificity']

        #Get the data into R
        self.data_frame = utils.DictionaryToR(self.data_dictionary)
        self.data_dictionary = copy.copy(self.data_dictionary)
        self.num_predictors = len(self.data_dictionary) - 1

        #First, save the serialized R object to disk (so it can be read from within R)
        #ascii_letters keeps the random file name free of locale-specific characters.
        suffix = "".join(
            random.choice(string.ascii_letters) for _ in range(10))
        robject_file = "pls" + suffix + ".robj"
        if scratchdir:
            robject_file = os.path.join(scratchdir, robject_file)
        #Double the backslashes before the path is handed to R.
        robject_file = robject_file.replace("\\", "\\\\")

        modelstring = model_struct["modelstring"]
        try:
            with open(robject_file, "wb") as handle:
                handle.write(modelstring)

            #Read the serialized model object into R:
            objects = r.Call(function='load', file=robject_file).AsVector()
            self.model = r.Call(function="get", x=str(objects[0])).AsList()
        finally:
            #Never leave the temporary file behind, even if R failed to load it.
            if os.path.exists(robject_file):
                os.remove(robject_file)

        #Record the parameters a plsr fit would use (the model itself was loaded above).
        several_columns = len(self.data_dictionary) > 2
        self.formula = r.Call('as.formula',
                              obj=utils.SanitizeVariableName(self.target) +
                              '~.')
        self.pls_params = {
            'formula': self.formula,
            'data': self.data_frame,
            'validation': 'LOO' if several_columns else 'none',
            'x': True,
        }

        #Get the number of columns from the validation step
        #(Might be fewer than the number of predictor variables if n<p)
        if several_columns:
            validation = self.model['validation'].AsList()
            dims = r.Call(function="dim", x=validation['pred']).AsNumeric()
            self.ncomp_max = int(list(dims)[2])
        else:
            self.ncomp_max = 1

        #Restore the saved number of components before computing fitted values.
        self.GetActual()
        self.ncomp = model_struct['ncomp']
        self.GetFitted()

        #Restore the saved decision threshold settings
        self.specificity = model_struct['specificity']
        self.threshold = model_struct['threshold']
        self.regulatory_threshold = model_struct['regulatory_threshold']
        self.vars = [str(v) for v in self.data_dictionary.keys()]
        self.vars.remove(self.target)
Example #13
0
    def Create(self, **args):
        '''Fit a logistic model with R's glm, then run stepwise selection.

        Required keywords: 'data' (dictionary of columns) and 'target' (name
        of the response column). Optional keywords: 'regulatory_threshold'
        (default 2.3711), 'specificity' (0.9), 'stepdirection' ('') and
        'weights'.

        'weights' is converted to a string and only its first letter is
        inspected: 'd' or 'i' selects AssignWeights(method=1), 'c' or 'f'
        selects AssignWeights(method=2), and anything else (including a
        missing or empty value) selects AssignWeights(method=0).
        '''
        #Use the defaults when no threshold or specificity was supplied
        self.regulatory_threshold = args.get('regulatory_threshold', 2.3711)
        self.specificity = args.get('specificity', 0.9)

        #Set the direction for stepwise variable selection
        self.stepdirection = args.get('stepdirection', '')

        #Get the data into R
        data = args['data']
        self.target = target = args['target']
        self.nobs = len(data[self.target])
        self.data_frame = utils.DictionaryToR(data)
        self.data_dictionary = copy.deepcopy(data)
        self.predictors = len(self.data_dictionary) - 1

        #Pick the weighting method from the first letter of 'weights'.
        #Slicing (rather than indexing) keeps an empty string from raising.
        initial = str(args.get('weights', '')).lower()[:1]
        if initial in ('d', 'i'):
            #integer (discrete) weighting
            method = 1
        elif initial in ('c', 'f'):
            #float (continuous) weighting
            method = 2
        else:
            #no recognized method: set all weights to one
            method = 0
        self.weights = self.AssignWeights(method=method)

        #Label the exceedances in the training set.
        self.data_dictionary[target] = self.AssignLabels(
            self.data_dictionary[target])

        #Send the relabeled data to R
        self.data_frame = utils.DictionaryToR(self.data_dictionary)

        #Start from an intercept-only logistic regression model in R.
        interceptonly = r.Call('as.formula',
                               obj=utils.SanitizeVariableName(self.target) +
                               '~ 1')
        self.logistic_params = {
            'formula': interceptonly,
            'family': 'binomial',
            'data': self.data_frame,
            'weights': self.weights,
            'x': True,
        }
        self.model = r.Call(function='glm', **self.logistic_params).AsList()

        #Select model components and a decision threshold
        self.SelectModel(direction=self.stepdirection)
        self.GetActual()
        self.GetFitted()
        self.Threshold(self.specificity)
Example #14
0
    def Create(self, **args):
        '''Fit a logistic model with R's 'adalasso' function.

        Required keywords: 'data' (dictionary of columns) and 'target' (name
        of the response column). Optional keywords: 'regulatory_threshold'
        (default 2.3711), 'specificity' (0.9), 'adapt', 'selectvars',
        'overshrink' (each default False) and 'weights'.

        'weights' is converted to a string and only its first letter is
        inspected: 'd' or 'i' selects AssignWeights(method=1), 'c' or 'f'
        selects AssignWeights(method=2), and anything else (including a
        missing or empty value) selects AssignWeights(method=0).
        '''
        #Use the defaults when no threshold or specificity was supplied
        self.regulatory_threshold = args.get('regulatory_threshold', 2.3711)
        self.specificity = args.get('specificity', 0.9)

        #Fitting switches, all off unless the caller says otherwise
        self.adapt = args.get('adapt', False)
        self.selectvars = args.get('selectvars', False)
        self.overshrink = args.get('overshrink', False)

        #Get the data into R
        data = args['data']
        self.target = target = args['target']
        self.nobs = len(data[self.target])
        self.data_frame = utils.DictionaryToR(data)
        self.data_dictionary = copy.copy(data)
        self.predictors = len(self.data_dictionary) - 1

        #Pick the weighting method from the first letter of 'weights'.
        #Slicing (rather than indexing) keeps an empty string from raising.
        initial = str(args.get('weights', '')).lower()[:1]
        if initial in ('d', 'i'):
            #integer (discrete) weighting
            method = 1
        elif initial in ('c', 'f'):
            #float (continuous) weighting
            method = 2
        else:
            #no recognized method: set all weights to one
            method = 0
        self.weights = self.AssignWeights(method=method)

        #Label the exceedances in the training set.
        self.data_dictionary[target] = self.AssignLabels(
            self.data_dictionary[target])

        #Send the relabeled data to R
        self.data_frame = utils.DictionaryToR(self.data_dictionary)

        #Generate a logistic regression model in R.
        self.formula = formula = r.Call(
            'as.formula', obj=utils.SanitizeVariableName(self.target) + '~ .')
        self.logistic_params = {
            'formula': formula,
            'family': 'binomial',
            'data': self.data_frame,
            'weights': self.weights,
            'verbose': True,
            'adapt': self.adapt,
            'overshrink': self.overshrink,
            'selectvars': self.selectvars,
        }
        self.model = r.Call(function='adalasso',
                            **self.logistic_params).AsList()

        #Select model components and a decision threshold
        self.GetActual()
        self.GetFitted()
        self.Threshold(self.specificity)
        self.vars = [
            str(v) for v in self.model['lasso'].AsList()['vars'].AsVector()
        ]
Example #15
0
    def Create(self, **args):
        '''Create a new gam model object

        Required keywords: 'data' (dictionary of columns) and 'target' (name
        of the response column). Optional keywords: 'threshold' (regulatory
        threshold, default 2.3711), 'julian' (name of a column used as the
        "by" variable of every smooth term, default ""), 'k' (upper limit on
        the basis dimension of each smooth, default 100), 'lambda' (penalty
        passed to gam, default 1.4) and 'specificity' (default 0.9).

        Raises ValueError if 'julian' is given but is not a column of the
        data, and ZeroDivisionError if no predictor columns remain.
        '''
        #Use the defaults for any setting that was not supplied
        self.regulatory_threshold = args.get('threshold', 2.3711)
        self.threshold = 0  #decision threshold
        self.julian = args.get('julian', "")
        self.k = args.get('k', 100)
        self.penalty = args.get('lambda', 1.4)
        specificity = args.get('specificity', 0.9)

        #Store some object data
        self.data_dictionary = copy.deepcopy(args['data'])
        self.target = args['target']

        #Get the data into R
        self.data_frame = utils.DictionaryToR(self.data_dictionary)

        #Count the rows and, per column, one less than the number of distinct values.
        #Building real lists keeps this working where keys()/values() are views.
        predictors = list(self.data_dictionary.keys())
        columns = [self.data_dictionary[name] for name in predictors]
        rows = len(columns[0])
        unique_values = [
            np.unique(column).shape[0] - 1 for column in np.array(columns)
        ]
        self.predictors = predictors

        #Drop the target (if present) from the predictors and their counts.
        try:
            indx = predictors.index(self.target)
        except ValueError:
            pass
        else:
            del unique_values[indx]
            predictors.remove(self.target)

        #The julian column is the "by" variable, not a smooth term of its own.
        if self.julian:
            indx = predictors.index(self.julian)
            del unique_values[indx]
            predictors.remove(self.julian)
        self.k = np.min([self.k, np.floor(rows / len(predictors))])

        #One smooth term per predictor, with k limited by that column's distinct values.
        if self.julian:
            by_clause = ", by=" + utils.SanitizeVariableName(self.julian)
        else:
            by_clause = ""
        terms = [
            "s(" + utils.SanitizeVariableName(name) + ", k=" +
            str(np.min([self.k, distinct])) + by_clause + ")"
            for name, distinct in zip(predictors, unique_values)
        ]
        formula = utils.SanitizeVariableName(self.target) + "~" + "+".join(
            terms)

        #Generate a gam model in R.
        self.formula = r.Call('as.formula', obj=formula)
        self.gbm_params = {
            'formula': self.formula,
            'family': 'gaussian',
            'data': self.data_frame,
            'lambda': self.penalty,
        }
        self.model = r.Call(function='gam', **self.gbm_params).AsList()

        #Pull the observed and fitted values out of the model.
        self.GetActual()
        self.GetFitted()

        #Establish a decision threshold
        self.Threshold(specificity)
Example #16
0
    def Deserialize(self, model_struct):
        '''Use the model_struct dictionary to recreate a model object

        The saved settings are copied onto the object, the data is handed to
        R, the gam fit is repeated, and the saved thresholds are restored.

        model_struct: dictionary holding 'data_dictionary', 'target',
            'specificity', 'julian', 'k', 'penalty', 'threshold' and
            'regulatory_threshold'.

        Raises ValueError if the saved 'julian' name is not a column of the
        data, and ZeroDivisionError if no predictor columns remain.
        '''
        #Unpack the model_struct dictionary
        self.data_dictionary = model_struct['data_dictionary']
        self.target = model_struct['target']
        self.specificity = model_struct['specificity']
        self.julian = model_struct['julian']
        self.k = model_struct['k']
        self.penalty = model_struct['penalty']

        #Get the data into R
        self.data_frame = utils.DictionaryToR(self.data_dictionary)
        self.data_dictionary = copy.deepcopy(self.data_dictionary)

        #Count the rows and, per column, one less than the number of distinct values.
        #Building real lists keeps this working where keys()/values() are views.
        predictors = list(self.data_dictionary.keys())
        columns = [self.data_dictionary[name] for name in predictors]
        rows = len(columns[0])
        unique_values = [
            np.unique(column).shape[0] - 1 for column in np.array(columns)
        ]
        self.predictors = predictors

        #Drop the target (if present) from the predictors and their counts.
        try:
            indx = predictors.index(self.target)
        except ValueError:
            pass
        else:
            del unique_values[indx]
            predictors.remove(self.target)

        #The julian column is the "by" variable, not a smooth term of its own.
        if self.julian:
            indx = predictors.index(self.julian)
            del unique_values[indx]
            predictors.remove(self.julian)
        self.k = np.min([self.k, np.floor(rows / len(predictors))])

        #One smooth term per predictor, with k limited by that column's distinct values.
        if self.julian:
            by_clause = ", by=" + utils.SanitizeVariableName(self.julian)
        else:
            by_clause = ""
        terms = [
            "s(" + utils.SanitizeVariableName(name) + ", k=" +
            str(np.min([self.k, distinct])) + by_clause + ")"
            for name, distinct in zip(predictors, unique_values)
        ]
        formula = utils.SanitizeVariableName(self.target) + "~" + "+".join(
            terms)

        #Generate a gam model in R.
        self.formula = r.Call('as.formula', obj=formula)
        self.gbm_params = {
            'formula': self.formula,
            'family': 'gaussian',
            'data': self.data_frame,
            'lambda': self.penalty,
        }
        self.model = r.Call(function='gam', **self.gbm_params).AsList()

        #Pull the observed and fitted values out of the model.
        self.GetActual()
        self.GetFitted()

        #Restore the saved decision threshold settings
        self.threshold = model_struct['threshold']
        self.regulatory_threshold = model_struct['regulatory_threshold']