    def __do_one_hot_encodings(self):
        df_train, cv = self.res_data_dict[g_singletonDataFilePath.getTrainDir()]
        df_testset1 = self.res_data_dict[g_singletonDataFilePath.getTest1Dir()]
        df_testset2 = self.res_data_dict[g_singletonDataFilePath.getTest2Dir()]
        enc = OneHotEncoder(sparse=False)
        cross_feature_dict = self.__get_label_encode_dict()
        to_be_encoded = []
        for _, new_feature_name in cross_feature_dict.iteritems():
            to_be_encoded.append(new_feature_name)
        # fit on all data sources so every category is represented
        to_be_stacked_df = pd.concat(
            [df_train[to_be_encoded], df_testset1[to_be_encoded], df_testset2[to_be_encoded]],
            axis=0)
        enc.fit(to_be_stacked_df)

        enc, to_be_encoded = self.__filter_too_big_onehot_encoding(
            enc, to_be_encoded, df_train, df_testset1, df_testset2)
        # transform each data source separately
        self.res_data_dict[g_singletonDataFilePath.getTrainDir()] = \
            self.__do_one_hot_encoding(df_train, enc, to_be_encoded), cv
        self.res_data_dict[g_singletonDataFilePath.getTest1Dir()] = \
            self.__do_one_hot_encoding(df_testset1, enc, to_be_encoded)
        self.res_data_dict[g_singletonDataFilePath.getTest2Dir()] = \
            self.__do_one_hot_encoding(df_testset2, enc, to_be_encoded)
        return
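# A minimal, standalone sketch (toy column name, not part of this class) of
# the fit-on-all / transform-separately pattern used above. It assumes a
# scikit-learn version whose OneHotEncoder still accepts sparse=False, as the
# method above does.
def _demo_shared_onehot():
    import pandas as pd
    from sklearn.preprocessing import OneHotEncoder
    df_a = pd.DataFrame({'cross_feature': [0, 1, 2]})
    df_b = pd.DataFrame({'cross_feature': [2, 3]})
    enc = OneHotEncoder(sparse=False)
    # fit on the union so every category present anywhere gets a column ...
    enc.fit(pd.concat([df_a, df_b], axis=0))
    # ... then transform each frame separately: shapes (3, 4) and (2, 4)
    return enc.transform(df_a), enc.transform(df_b)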
    def __init__(self):
        ExploreOrder.__init__(self)
        self.gapdf = self.load_gapdf(g_singletonDataFilePath.getTrainDir())
#         self.gap_time_dict = self.gapdf.groupby('time_slotid')['gap'].sum().to_dict()
        self.weathdf = self.load_weatherdf(g_singletonDataFilePath.getTrainDir())
#         self.trafficdf = self.load_trafficdf(g_singletonDataFilePath.getTrainDir())
#         self.gapDict = self.loadGapDict(g_singletonDataFilePath.getTrainDir() + 'temp/gap.csv.dict.pickle')
        return
    def disp_gap_bytraffic(self):
        df = self.gapdf
        data_dir = g_singletonDataFilePath.getTrainDir()
        dumpfile_path = '../data_preprocessed/' + data_dir.split(
            '/')[-2] + '_prevtraffic.df.pickle'
        dumpload = DumpLoad(dumpfile_path)
        if dumpload.isExisiting():
            temp_df = dumpload.load()
        else:
            traffic_dict = self.get_traffic_dict(data_dir)

            temp_df = self.X_y_Df[['start_district_id', 'time_slotid']].apply(
                self.find_prev_traffic, axis=1,
                traffic_dict=traffic_dict, pre_num=3)
            dumpload.dump(temp_df)

        df = pd.concat([df, temp_df], axis=1)

        by_traffic = df.groupby('traffic1')
        x = []
        y = []
        for name, group in by_traffic:
            x.append(name)
            y.append(group['gap'].mean())
        plt.scatter(x, y)

        return
    def get_history_data_dict(self):
        """
        Build an index for quick lookup:
        key = ('start_district_id', 'time_id')
        value = 'gap'
        Its data includes train, test1, and test2.
        """
        t0 = time()

        filename = "../data_preprocessed/" + 'traintest_history_data.dict.pickle'

        dumpload = DumpLoad(filename)
        if dumpload.isExisiting():
            return dumpload.load()

        test1data_df = ExploreOrder().load_gapdf(g_singletonDataFilePath.getTest1Dir())
        test2data_df = ExploreOrder().load_gapdf(g_singletonDataFilePath.getTest2Dir())
        traindata_df = ExploreOrder().load_gapdf(g_singletonDataFilePath.getTrainDir())

        df = pd.concat([traindata_df, test1data_df, test2data_df], axis=0)
        self.__fileter_earlier_date(df)
        res_dict = self.__generate_dict(df)

        dumpload.dump(res_dict)
        print "dump history data dict:", round(time() - t0, 3), "s"
        return res_dict
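# The DumpLoad compute-once / pickle / reload pattern used throughout this
# file, reduced to a generic standalone sketch. DumpLoad itself is this
# project's helper; the cPickle version below is only an assumption about
# what such a helper typically wraps.
def _demo_cached(path, compute):
    import os
    import cPickle as pickle
    if os.path.exists(path):
        with open(path, 'rb') as f:
            return pickle.load(f)
    res = compute()
    with open(path, 'wb') as f:
        pickle.dump(res, f)
    return res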
    def disp_gap_by_district_type(self):
        df = self.gapdf
        data_dir = g_singletonDataFilePath.getTrainDir()

        dumpfile_path = '../data_preprocessed/' + data_dir.split('/')[-2] + '_poi.df.pickle'
        dumpload = DumpLoad(dumpfile_path)
        if dumpload.isExisiting():
            temp_df = dumpload.load()
        else:
            poi_dict = self.get_district_type_dict()
            temp_df = self.X_y_Df[['start_district_id']].apply(self.find_poi, axis=1, poi_dict=poi_dict)
            dumpload.dump(temp_df)
        df = pd.concat([df, temp_df], axis=1)
        dt_list = self.get_district_type_list()

        size = len(dt_list)
        col_len = 4
        row_len = 7
#         col_len = row_len = int(math.ceil(math.sqrt(size)))
        _, axarr = plt.subplots(row_len, col_len, sharex=True, sharey=True)
        for row in range(row_len):
            for col in range(col_len):
                index = row * col_len + col
                if index >= size:
                    break
                item = dt_list[index]
                axarr[row, col].scatter(df[item], df['gap'])
                axarr[row, col].set_ylabel('Gap')
                axarr[row, col].set_xlabel(item)
        return
    def __unittest(self):
        #         self.combine_all_csv(g_singletonDataFilePath.getTrainDir() + 'weather_data/temp/', 'weather_', 'weather.csv')
        #         self.save_one_csv(g_singletonDataFilePath.getTrainDir() + 'traffic_data/traffic_data_2016-01-04')
        #         weatherdf = self.load_weatherdf(g_singletonDataFilePath.getTrainDir())
        data_dir = g_singletonDataFilePath.getTrainDir()
        traffic_dict = self.get_traffic_dict(data_dir)
        assert [0, 0,
                0] == self.find_prev_traffic(pd.Series([1, '2016-01-01-2']),
                                             traffic_dict=traffic_dict,
                                             pre_num=3).tolist()
        assert [2246,
                2081] == self.find_prev_traffic(pd.Series([1, '2016-01-01-9']),
                                                traffic_dict=traffic_dict,
                                                pre_num=2).tolist()

        data_dir = g_singletonDataFilePath.getTest1Dir()
        traffic_dict = self.get_traffic_dict(data_dir)
        assert [346, 424,
                0] == self.find_prev_traffic(pd.Series([66, '2016-01-30-141']),
                                             traffic_dict=traffic_dict,
                                             pre_num=3).tolist()
        assert [501, 484,
                447] == self.find_prev_traffic(pd.Series([66,
                                                          '2016-01-30-70']),
                                               traffic_dict=traffic_dict,
                                               pre_num=3).tolist()
        assert [772, 802,
                775] == self.find_prev_traffic(pd.Series([57,
                                                          '2016-01-24-58']),
                                               traffic_dict=traffic_dict,
                                               pre_num=3).tolist()

        print 'passed unit test'

        return
    def getFeaturesLabel(self):
        data_dir = g_singletonDataFilePath.getTrainDir()
        self.__do_prepare_data()
        df, cv = self.res_data_dict[data_dir]
        return df[self.get_used_features()], df[self.usedLabel], cv
    def __init__(self):
        ExploreOrder.__init__(self)
        self.gap_testdf = self.load_gapdf(g_singletonDataFilePath.getTest1Dir())
        self.gap_traindf = self.load_gapdf(g_singletonDataFilePath.getTrainDir())
        self.gap_traindf.describe()
        self.gap_testdf.describe()
        return
    def __save_final_data(self):
        df_train, _ = self.res_data_dict[g_singletonDataFilePath.getTrainDir()]
        df_testset1 = self.res_data_dict[g_singletonDataFilePath.getTest1Dir()]
        df_testset2 = self.res_data_dict[g_singletonDataFilePath.getTest2Dir()]
        df_train.to_csv('temp/df_train_final.csv')
        df_testset1.to_csv('temp/df_testset1_final.csv')
        df_testset2.to_csv('temp/df_testset2_final.csv')
        return
    def weather_distribution(self):
        data_dir = g_singletonDataFilePath.getTrainDir()
        self.gapdf = self.load_weatherdf(data_dir)
        print self.gapdf['weather'].describe()
#         sns.distplot(self.gapdf['gap'],kde=False, bins=100);
        sns.countplot(x="weather", data=self.gapdf, palette="Greens_d")
        plt.title('Countplot of Weather')
        return
    def gapdistricution(self):
        data_dir = g_singletonDataFilePath.getTrainDir()
        self.gapdf = self.load_gapdf(data_dir)
        print self.gapdf['gap'].describe()
#         sns.distplot(self.gapdf['gap'],kde=False, bins=100);
        self.gapdf['gap'].plot(kind='hist', bins=200)
        plt.xlabel('Gaps')
        plt.title('Histogram of Gaps')

        return
    def traffic_districution(self):
        data_dir = g_singletonDataFilePath.getTrainDir()
        df = self.load_trafficdf(data_dir)
        print df['traffic'].describe()
#         sns.distplot(self.gapdf['gap'],kde=False, bins=100);
        df['traffic'].plot(kind='hist', bins=100)
        plt.xlabel('Traffic')
        plt.title('Histogram of Traffic')

        return
    def get_train_validationset(self):
        data_dir = g_singletonDataFilePath.getTrainDir()
        self.__do_prepare_data()
        df, cv = self.res_data_dict[data_dir]
        folds = []
        for train_index, test_index in cv:
            folds.append((train_index, test_index))
        train_index = folds[self.train_validation_foldid][0]
        test_index = folds[self.train_validation_foldid][1]
        X_train = df.iloc[train_index][self.get_used_features()]
        y_train = df.iloc[train_index][self.usedLabel]
        X_test = df.iloc[test_index][self.get_used_features()]
        y_test = df.iloc[test_index][self.usedLabel]
        return X_train, y_train, X_test, y_test
    def __get_feature_label(self):
        data_dir = g_singletonDataFilePath.getTrainDir()
        self.X_y_Df = self.load_gapdf(data_dir)
        self.__engineer_feature(data_dir)

        if self.holdout_split == HoldoutSplitMethod.kFOLD_FORWARD_CHAINING:
            cv = self.get_kfold_forward_chaining(self.X_y_Df)
        elif self.holdout_split == HoldoutSplitMethod.KFOLD_BYDATE:
            cv = self.get_kfold_bydate(self.X_y_Df)
        else:
            cv = self.get_imitate_testset2(self.X_y_Df,
                                           split_method=self.holdout_split)

        self.res_data_dict[data_dir] = self.X_y_Df, cv
        return
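# A standalone sketch of what forward-chaining folds look like (fold sizes
# here are row-count based and purely illustrative; the project's actual
# get_kfold_forward_chaining splits by date).
def _demo_forward_chaining_folds(n_rows, n_folds):
    import numpy as np
    fold_size = n_rows // (n_folds + 1)
    folds = []
    for k in range(1, n_folds + 1):
        # each fold trains on everything before the validation window
        train_index = np.arange(0, k * fold_size)
        test_index = np.arange(k * fold_size, (k + 1) * fold_size)
        folds.append((train_index, test_index))
    return folds
# e.g. _demo_forward_chaining_folds(9, 2) yields
# ([0, 1, 2], [3, 4, 5]) and ([0, ..., 5], [6, 7, 8])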
    def unitTest(self):
        # test cases for find_prev_gap
        data_dir = g_singletonDataFilePath.getTrainDir()
        gap_dict = self.get_gap_dict(data_dir)
        assert [3096, 1698, 318, 33, 0,
                0] == self.find_prev_gap(pd.Series([51, '2016-01-01-5']),
                                         pre_num=6,
                                         gap_dict=gap_dict).tolist()
        assert [0, 0, 0] == self.find_prev_gap(pd.Series([45, '2016-01-16-2']),
                                               pre_num=3,
                                               gap_dict=gap_dict).tolist()
        assert [24, 26,
                37] == self.find_prev_gap(pd.Series([53, '2016-01-04-56']),
                                          pre_num=3,
                                          gap_dict=gap_dict).tolist()

        data_dir = g_singletonDataFilePath.getTest1Dir()
        gap_dict = self.get_gap_dict(data_dir)
        assert [0, 1, 0] == self.find_prev_gap(pd.Series([54,
                                                          '2016-01-24-81']),
                                               pre_num=3,
                                               gap_dict=gap_dict).tolist()
        assert [6, 4, 0] == self.find_prev_gap(pd.Series([7,
                                                          '2016-01-30-141']),
                                               pre_num=3,
                                               gap_dict=gap_dict).tolist()
        assert [0, 0] == self.find_prev_gap(pd.Series([7, '2016-01-30-138']),
                                            pre_num=2,
                                            gap_dict=gap_dict).tolist()
        assert [0, 0, 0] == self.find_prev_gap(pd.Series([7,
                                                          '2016-01-30-139']),
                                               pre_num=3,
                                               gap_dict=gap_dict).tolist()
        assert [0, 0,
                1] == self.find_prev_gap(pd.Series([50, '2016-01-30-143']),
                                         pre_num=3,
                                         gap_dict=gap_dict).tolist()
        assert [245, 282,
                0] == self.find_prev_gap(pd.Series([51, '2016-01-22-141']),
                                         pre_num=3,
                                         gap_dict=gap_dict).tolist()

        gap_meanmedian_dict = self.get_gap_meanmedian_dict()
        self.find_gap_meanmedian(pd.Series([5, 55]),
                                 gap_meanmedian_dict=gap_meanmedian_dict)

        print("unit test passed")
        return
    def get_gap_meanmedian_dict(self):
        data_dir = g_singletonDataFilePath.getTrainDir()
        filename = data_dir + 'order_data/temp/gap_meanmedian.dict.pickle'
        dumpload = DumpLoad(filename)
        if dumpload.isExisiting():
            return dumpload.load()

        resDict = {}
        df = self.load_gapdf(data_dir)
        grps = df.groupby(['start_district_id', 'time_id'])
        for name, row in grps:
            resDict[name] = row['gap'].tolist()
#             resDict[name] = [i for i in row['gap'].tolist() if i !=0]

        dumpload.dump(resDict)
        return resDict
    def __unittest(self):
        #         self.combine_all_csv(g_singletonDataFilePath.getTrainDir() + 'weather_data/temp/', 'weather_', 'weather.csv')
        #         self.save_one_csv(g_singletonDataFilePath.getTrainDir() + 'weather_data/weather_data_2016-01-02')
        #         weatherdf = self.load_weatherdf(g_singletonDataFilePath.getTrainDir())
        weather_dict = self.get_weather_dict(
            g_singletonDataFilePath.getTrainDir())
        assert 2 == self.find_prev_weather_mode('2016-01-01-1',
                                                weather_dict=weather_dict)[0]
        assert 2 == self.find_prev_weather_mode('2016-01-21-144',
                                                weather_dict=weather_dict)[0]
        #
        assert 2 == self.find_prev_weather_mode('2016-01-21-115',
                                                weather_dict=weather_dict)[0]
        assert 2 == self.find_prev_weather_mode('2016-01-21-114',
                                                weather_dict=weather_dict)[0]
        print 'passed unit test'

        return
    def __do_label_encoding(self):
        df_train, _ = self.res_data_dict[g_singletonDataFilePath.getTrainDir()]
        df_testset1 = self.res_data_dict[g_singletonDataFilePath.getTest1Dir()]
        df_testset2 = self.res_data_dict[g_singletonDataFilePath.getTest2Dir()]
        le = LabelEncoder()
        cross_feature_dict = self.__get_label_encode_dict()
        for _, new_feature_name in cross_feature_dict.iteritems():
            to_be_stacked = [
                df_train[new_feature_name], df_testset1[new_feature_name],
                df_testset2[new_feature_name]
            ]
            le.fit(pd.concat(to_be_stacked, axis=0))
            df_train[new_feature_name] = le.transform(
                df_train[new_feature_name])
            df_testset1[new_feature_name] = le.transform(
                df_testset1[new_feature_name])
            df_testset2[new_feature_name] = le.transform(
                df_testset2[new_feature_name])

        return
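# Why __do_label_encoding above fits on the stacked train + test1 + test2
# series: a LabelEncoder fitted on train alone raises on categories that only
# appear in a test set. Toy demonstration, not the project's features.
def _demo_label_encode_union():
    from sklearn.preprocessing import LabelEncoder
    le = LabelEncoder()
    le.fit(['a', 'b'])
    try:
        le.transform(['c'])          # 'c' was never seen during fit
    except ValueError:
        pass                         # fitting on the union avoids this
    le.fit(['a', 'b', 'c'])
    return le.transform(['c'])       # fine once fitted on all sources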
    def disp_gap_byweather(self):
        df = self.gapdf
        data_dir = g_singletonDataFilePath.getTrainDir()
        dumpfile_path = '../data_preprocessed/' + data_dir.split('/')[-2] + '_prevweather.df.pickle'
        dumpload = DumpLoad(dumpfile_path)
        if dumpload.isExisiting():
            temp_df = dumpload.load()
        else:
            weather_dict = self.get_weather_dict(data_dir)
            temp_df = self.X_y_Df['time_slotid'].apply(self.find_prev_weather_mode, weather_dict=weather_dict)
            dumpload.dump(temp_df)

        df = pd.concat([df, temp_df], axis=1)

        gaps_mean = df.groupby('preweather')['gap'].mean()
        gaps_mean.plot(kind='bar')
        plt.ylabel('Mean of gap')
        plt.xlabel('Weather')
        plt.title('Weather/Gap Correlation')
        return
        res.append(item)

        # training days 1-19, validation days 20-21
#         item = self.__get_train_validation_indexes(df, '2016-01-01', 19, split_method), self.__get_train_validation_indexes(df, '2016-01-20', 2)
#         res.append(item)
#
#         # training days 1-20, validation day 21
#         item = self.__get_train_validation_indexes(df, '2016-01-01', 20, split_method), self.__get_train_validation_indexes(df, '2016-01-21', 1)
#         res.append(item)
        return res
    def __get_train_validation_indexes(self, df, start_date, days_num,
                                       split_method=HoldoutSplitMethod.IMITTATE_TEST2_MIN):
        dates = self.__get_date(start_date, days_num, days_step=1)
        slots = self.__get_slots(split_method)
        dates_slots = self.__get_date_slots(dates, slots)
        indexes = self.__get_df_indexes(df, dates_slots)
        return indexes
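# A standalone sketch of the dates-x-slots expansion that
# __get_train_validation_indexes relies on. The helpers it calls are private
# to this project; the '%Y-%m-%d-<slot>' strings below follow the time_slotid
# convention visible in the unit tests (e.g. '2016-01-01-5').
def _demo_date_slots(start_date, days_num, slots):
    from datetime import datetime, timedelta
    d0 = datetime.strptime(start_date, '%Y-%m-%d')
    dates = [(d0 + timedelta(days=i)).strftime('%Y-%m-%d')
             for i in range(days_num)]
    return [d + '-' + str(s) for d in dates for s in slots]
# _demo_date_slots('2016-01-01', 2, [46, 58]) ->
# ['2016-01-01-46', '2016-01-01-58', '2016-01-02-46', '2016-01-02-58']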
    def run(self, df):
        self.__unit_test()
#         self.get_kfold_bydate(df)
#         self.get_kfold_forward_chaining(df)
        return


if __name__ == "__main__":
    obj = SplitTrainValidation()
    from preparedata import PrepareData
    from utility.datafilepath import g_singletonDataFilePath
    pre = PrepareData()
    pre.X_y_Df = pre.load_gapdf(g_singletonDataFilePath.getTrainDir())
    # __engineer_feature is name-mangled inside PrepareData, so from module
    # level it must be called via its mangled name
    pre._PrepareData__engineer_feature(g_singletonDataFilePath.getTrainDir())
    obj.run(pre.X_y_Df)