Exemple #1
0
 def __init__(self):
     os.chdir("Data")
     self.train = pd.read_csv("train.csv", index_col= 0)
     self.test = pd.read_csv("train.csv", index_col= 0)
     self.dqTool = DataQualityTool(self.train)
     self.dataAnalyzer = da.DataAnalyzer()
     self.regressionTool = da.RegressionTool(self.train, 0)
Exemple #2
0
class TitanicSurivalModel:
    ## Initialize the model
    def __init__(self):
        os.chdir("Data")
        self.train = pd.read_csv("train.csv", index_col= 0)
        self.test = pd.read_csv("train.csv", index_col= 0)
        self.dqTool = DataQualityTool(self.train)
        self.dataAnalyzer = da.DataAnalyzer()
        self.regressionTool = da.RegressionTool(self.train, 0)
        
    ##Explortory Data Analysis
    ##Graphing each column to explore distrubtions and survivor rates
    def graphAge(self):
        self.train['Age'].hist()
        P.show()
        
    ## Get the training data set
    def getTrain(self):
        return self.train
        
    ## Set the training data set
    def setTrain(self, data):
        self.train = data
        
    ## Get the test data set of this model
    def getTest(self):
        return self.test
        
    ## Set the test data set
    def setTest(self, data):
        self.test = data
        
    ## Checks data for missing values
    def analyzeDataQuality(self):
        return self.dqTool.analyze()
        
    ## Takes in a string and a list of strings, and returns the first string 
    ## from the list that is found in the first argument, or null if none of 
    ## the listed strings are found. 
    def substrings_in_string(self, big_string, substrings):
        for substring in substrings:
            if string.find(big_string, substring) != -1:
                return substring
        print "Null value! None of the substrings were found!"
        print big_string
        return np.nan

    ## Converts titles to mr, mrs, miss, master, if applicables
    ## Even checks for gender of doctors :)    
    def replace_titles(self, data):
        title=data['Title']
        if title in ['Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col']:
            return 'Mr'
        elif title in ['Countess', 'Mme']:
            return 'Mrs'
        elif title in ['Mlle', 'Ms']:
            return 'Miss'
        elif title =='Dr':
            if data['Sex']=='Male':
                return 'Mr'
            else:
                return 'Mrs'
        else:
            return title
            
    ## Returns the given dataset, but with a title field, where a title is one
    ## of "Mr", "Mrs", or "Miss" 
    def calcTitles(self, data):
        title_list=['Mrs', 'Mr', 'Master', 'Miss', 'Major', 'Rev',
                    'Dr', 'Ms', 'Mlle','Col', 'Capt', 'Mme', 'Countess',
                    'Don', 'Jonkheer']
        data['Title']=data['Name'].map(lambda x: self.substrings_in_string(x, title_list))
        data['Title']=data.apply(self.replace_titles, axis=1)
        return data
    
    ## Calculates the family size for each person in the data set by adding the
    ## number of sibblines, number of parents, and the person themselves, then 
    ## returns the updated data set. 
    def calcFamilySize(self, data):
        data["FamilySize"] = data["SibSp"] + data["Parch"] + 1
        return data
   
   #Calculating Fare per person using Family Size
   ##acts fare per person to data and returns data
    def farePerPerson(self, data):
       data["Fare_Per_Person"]=data["Fare"]/(data["FamilySize"]+1)
       return data
   
   #Turning cabin number into Deck
   #adds deck as feature and then returns data
    def cabintoDeck(self,data):
        cabin_list = ['A', 'B', 'C', 'D', 'E', 'F', 'T', 'G', 'Unknown']
        data['Deck']=data['Cabin'].map(lambda x: self.substrings_in_string(str(x), cabin_list))
        ## Replace nulls with "Unknown"        
        data['Deck'] = data['Deck'].fillna("Unknown")
        return data
        
    def missingAge(self,data):
        age_glm = smf.glm(formula = 'Age \~ Title + FamilySize + Sex', df =data, 
                      datafamily=sm.families.Binomial()).fit()
        print age_glm.summary()

    ## Gets Correlation Matrix and returns of all columns
    def getCorr(self,data):
        corrMatrix = data.corr()
        return corrMatrix
    
    ## Adds family size, fare per family member, deck, and title to the given
    ## data set, then returns it    
    def addFeatures(self,data):
        data = self.calcFamilySize(data)
        data = self.farePerPerson(data)
        data = self.cabintoDeck(data)
        data = self.calcTitles(data)
        data['Embarked'].fillna('S')
        return data
    
    
    ## Removes Fare, Cabin, Name, and Ticket from the given data set, then 
    ## returns the dataset. These variables are removed because they are either 
    ## arbitrary strings, or represented in other variables. 
    def deleteFeatures(self, data):
        return data.drop("Fare",1).drop("Cabin",1).drop("Name",1).drop("Ticket",1).drop("SibSp",1).drop("Parch",1)
        
    
    ## Returns a list containing the column names or nominal variables
    def getNominalNames(self, data):
        Nominal  = self.dataAnalyzer.getNameOfNoms(data)
        return Nominal
        
    ## Takes in a Pandas DataFrame and a list of column names to be treated as 
    ## nominal variables, and returns a Pandas DataFrame. 
    ## For each column in the list, dummy variables will be added to the 
    ## DataFrame for each value in the column, then the column will be removed.
    def nominaltoDummy(self, data):
        nominals  = self.getNominalNames(data)
        return self.dqTool.convertCatsToDummies(data, nominals)
#        rt = da.RegressionTool(data, 0)
#        return rt.convertCatsToDummies(data, nominals)
        
    ## Completely preps the data for building models
    ## Takes one argument, a Pandas DataFrame
    ## Returns a DataFrame
    def prepData(self, data):
        if not isinstance(data, pd.DataFrame):
            print "\n" + "ERROR! Wrong type of data!" + "\n"
        else:
            data = self.addFeatures(data)
            data = self.deleteFeatures(data)
            data = self.nominaltoDummy(data)
            return data