Beispiel #1
0
    def fit_transform(self, df, k=100):
        '''
        INPUT: pandas dataframe, int
        OUTPUT: pandas dataframe

        Fits internal variables used to fill missing values. Returns
        transformed data ready for machine learning models.

        df: data to fit on; the same frame is then passed to
            self.transform(). The 'kmeans' method reads its 'Age' and
            'Height (hh)' columns.
        k:  forwarded unchanged as the second argument of Kmeans_DF
            (presumably the number of centroids -- confirm against
            Kmeans_DF). Only used when self.method == 'kmeans'.

        What gets fitted depends on self.method:
          'mode'   -> delegates to self.fit_most_common_values(df)
          'kmeans' -> sets self.max_age, self.max_delta_h and
                      self.centroids
        Any other value fits nothing and goes straight to transform().
        '''
        if self.method == 'mode':
            self.fit_most_common_values(df)
        elif self.method == 'kmeans':
            ages = df['Age']
            heights = df['Height (hh)']
            # Series.max()/min() skip NaN. The builtin max()/min() compare
            # element by element, so a missing value can make the result
            # NaN or order-dependent -- and this data is expected to have
            # missing values.
            tallest = heights.max()
            shortest = heights.min()
            self.max_age = ages.max()
            self.max_delta_h = tallest - shortest
            # Progress output, written so it prints the same text on
            # Python 2 and Python 3.
            print('%s %s %s' % (self.max_age, tallest, shortest))
            kmeans = Kmeans_DF(df, k, self.horse_horse_distance)
            self.centroids = kmeans.get_centroids()
            print('centroids fitted...')
        return self.transform(df)
    # NOTE(review): the `def` line of this function and the initial value of
    # `dist` are not part of this excerpt; series1, series2, max_delta_h and
    # max_age are also bound outside the visible lines.
    #
    # N is the number of terms the distance is averaged over. It is
    # decremented once for every feature that is missing in either record.
    N = 8.
    # Categorical features: a mismatch adds 1, a match adds nothing.
    for col in ['Breed', 'Color', 'Pedigree', 'Sex']:
        if (pd.isnull(series1[col])) or (pd.isnull(series2[col])):
            N -= 1
        elif series1[col] != series2[col]:
            dist += 1
    # Height: absolute difference divided by max_delta_h.
    if not (pd.isnull(series1['Height (hh)']) or
            pd.isnull(series2['Height (hh)'])):
        dist += abs(series1['Height (hh)'] - series2['Height (hh)'])\
                / max_delta_h
    else:
        N -= 1
    # Temperament: absolute difference added as-is (no divisor is applied).
    if not (pd.isnull(series1['Temperament']) or
            pd.isnull(series2['Temperament'])):
        dist += abs(series1['Temperament'] - series2['Temperament'])
    else:
        N -= 1
    # Age: absolute difference divided by max_age.
    if not (pd.isnull(series1['Age']) or pd.isnull(series2['Age'])):
        dist += abs(series1['Age'] - series2['Age']) / max_age
    else:
        N -= 1
    # Average over the N usable terms, then rescale linearly so that an
    # average of 1/8 maps to 0 and an average of 1 maps to 1.
    # NOTE(review): only seven features are compared in the visible lines
    # while N starts at 8 -- presumably `dist` already carries an eighth
    # term when this code is reached; confirm against the full function.
    return (dist / N - 1. / 8) * 8. / 7

if __name__ == '__main__':
    # Fit centroids once per candidate k and save each result to its own
    # JSON file.
    for k in (50, 100, 200):
        label = str(k)
        model = Kmeans_DF(df, k, horse_horse_distance)
        fitted = model.get_centroids()
        # The output file name embeds k, e.g. Centroids_50.json.
        fitted.to_json('Centroids_' + label + '.json')
        print(label + ' is done...')