def __do_one_hot_encodings(self):
        """One-hot encode the cross features consistently across data sources.

        A single OneHotEncoder is fitted on the stacked train/test1/test2
        columns so category codes agree, columns whose one-hot expansion is
        too wide are filtered out, and each source is then transformed
        separately and stored back into self.res_data_dict.
        """
        df_train, cv = self.res_data_dict[
            g_singletonDataFilePath.getTrainDir()]
        df_testset1 = self.res_data_dict[g_singletonDataFilePath.getTest1Dir()]
        df_testset2 = self.res_data_dict[g_singletonDataFilePath.getTest2Dir()]
        encoder = OneHotEncoder(sparse=False)
        # Columns to encode are the label-encoded cross-feature names.
        encode_cols = [
            name for _, name in self.__get_label_encode_dict().iteritems()
        ]
        # Fit on all three data sources at once so the encodings line up.
        stacked = pd.concat(
            [df_train[encode_cols], df_testset1[encode_cols],
             df_testset2[encode_cols]],
            axis=0)
        encoder.fit(stacked)

        encoder, encode_cols = self.__filter_too_big_onehot_encoding(
            encoder, encode_cols, df_train, df_testset1, df_testset2)
        # Transform each data source independently.
        self.res_data_dict[g_singletonDataFilePath.getTrainDir()] = (
            self.__do_one_hot_encoding(df_train, encoder, encode_cols), cv)
        self.res_data_dict[g_singletonDataFilePath.getTest1Dir()] = (
            self.__do_one_hot_encoding(df_testset1, encoder, encode_cols))
        self.res_data_dict[g_singletonDataFilePath.getTest2Dir()] = (
            self.__do_one_hot_encoding(df_testset2, encoder, encode_cols))
        return
# Esempio n. 2
# 0
    def get_history_data_dict(self):
        """
        indexes for quick search
        key = 'start_district_id','time_id'
        value = 'gap
        its data includes those from train, test1, test2.
        """
        t0 = time()

        filename = "../data_preprocessed/" + 'traintest_history_data.dict.pickle'

        dumpload = DumpLoad(filename)
        if dumpload.isExisiting():
            return dumpload.load()

        test1data_df = ExploreOrder().load_gapdf(
            g_singletonDataFilePath.getTest1Dir())
        test2data_df = ExploreOrder().load_gapdf(
            g_singletonDataFilePath.getTest2Dir())
        traindata_df = ExploreOrder().load_gapdf(
            g_singletonDataFilePath.getTrainDir())

        df = pd.concat([traindata_df, test1data_df, test2data_df], axis=0)
        self.__fileter_earlier_date(df)
        res_dict = self.__generate_dict(df)

        dumpload.dump(res_dict)
        print "dump weather dict:", round(time() - t0, 3), "s"
        return res_dict
# Esempio n. 3
# 0
 def run(self):
     """Entry point: run self-tests, then build the weather dict for test2.

     The CSV save/combine steps are one-off preprocessing, left disabled.
     """
     self.__unittest()
     target_dir = g_singletonDataFilePath.getTest2Dir()
     #         self.save_all_csv( target_dir+ 'weather_data/')
     #         self.combine_all_csv(target_dir + 'weather_data/temp/', 'weather_', 'weather.csv')
     self.get_weather_dict(target_dir)
     return
    def get_history_data_dict(self):
        """
        indexes for quick search
        key = 'start_district_id','time_id'
        value = 'gap
        its data includes those from train, test1, test2.
        """
        t0 = time()

        filename = "../data_preprocessed/"  + 'traintest_history_data.dict.pickle'
        
        dumpload = DumpLoad( filename)
        if dumpload.isExisiting():
            return dumpload.load()
        
        test1data_df = ExploreOrder().load_gapdf(g_singletonDataFilePath.getTest1Dir())
        test2data_df = ExploreOrder().load_gapdf(g_singletonDataFilePath.getTest2Dir())
        traindata_df = ExploreOrder().load_gapdf(g_singletonDataFilePath.getTrainDir())
        
        
        df = pd.concat([traindata_df, test1data_df,test2data_df],  axis=0)
        self.__fileter_earlier_date(df)
        res_dict = self.__generate_dict(df)            
       
        dumpload.dump(res_dict)
        print "dump weather dict:", round(time()-t0, 3), "s"
        return  res_dict
    def run(self):
        """Run self-tests and build the weather dict for the test2 data set."""
        self.__unittest()
        weather_dir = g_singletonDataFilePath.getTest2Dir()
        # One-off CSV preprocessing, intentionally disabled:
        # self.save_all_csv( weather_dir+ 'weather_data/')
        # self.combine_all_csv(weather_dir + 'weather_data/temp/', 'weather_', 'weather.csv')
        self.get_weather_dict(weather_dir)
        return
 def run(self):
     """Entry point: build the traffic dict for the test2 data set."""
     # self.__unittest()  -- disabled in this variant
     traffic_dir = g_singletonDataFilePath.getTest2Dir()
     # One-off CSV preprocessing steps, kept for reference:
     #         self.save_all_csv(traffic_dir+ 'traffic_data/')
     #         self.combine_all_csv(traffic_dir + 'traffic_data/temp/', 'traffic_', 'traffic.csv')
     self.get_traffic_dict(traffic_dir)
     return
 def save_model(self):
     """Pickle this estimator under logs/ and score the test2 set.

     Does nothing unless self.save_final_model is set.
     """
     if not self.save_final_model:
         return
     pickle_path = 'logs/' + self.application_start_time + '_estimator.pickle'
     DumpLoad(pickle_path).dump(self)
     self.predictTestSet(g_singletonDataFilePath.getTest2Dir())
     return
# Esempio n. 8
# 0
 def save_model(self):
     """Persist the trained estimator and run predictions on test set 2.

     No-op when self.save_final_model is falsy.
     """
     if self.save_final_model:
         saver = DumpLoad('logs/' + self.application_start_time +
                          '_estimator.pickle')
         saver.dump(self)
         self.predictTestSet(g_singletonDataFilePath.getTest2Dir())
     return
 def __save_final_data(self):
     """Dump the fully-prepared train/test1/test2 frames to temp/ CSV files."""
     df_train, _ = self.res_data_dict[g_singletonDataFilePath.getTrainDir()]
     df_testset1 = self.res_data_dict[g_singletonDataFilePath.getTest1Dir()]
     df_testset2 = self.res_data_dict[g_singletonDataFilePath.getTest2Dir()]
     outputs = [
         (df_train, 'temp/df_train_final.csv'),
         (df_testset1, 'temp/df_testset1_final.csv'),
         (df_testset2, 'temp/df_testset2_final.csv'),
     ]
     for frame, csv_path in outputs:
         frame.to_csv(csv_path)
     return
    def run(self):
        """Prepare features for train and both test sets, then persist them."""
        self.getFeaturesLabel()
        # Test set 2 first, then test set 1 (original call order preserved).
        self.getFeaturesforTestSet(g_singletonDataFilePath.getTest2Dir())
        self.getFeaturesforTestSet(g_singletonDataFilePath.getTest1Dir())
        self.get_train_validationset()
        self.__save_final_data()
        return
 def __save_final_data(self):
     """Write the prepared train/test1/test2 data frames out as CSV files."""
     train_df, _ = self.res_data_dict[g_singletonDataFilePath.getTrainDir()]
     test1_df = self.res_data_dict[g_singletonDataFilePath.getTest1Dir()]
     test2_df = self.res_data_dict[g_singletonDataFilePath.getTest2Dir()]
     train_df.to_csv('temp/df_train_final.csv')
     test1_df.to_csv('temp/df_testset1_final.csv')
     test2_df.to_csv('temp/df_testset2_final.csv')
     return
    def run(self):
        """Drive the full feature-preparation pipeline and save the results."""
        self.getFeaturesLabel()
        for data_dir in (g_singletonDataFilePath.getTest2Dir(),
                         g_singletonDataFilePath.getTest1Dir()):
            self.getFeaturesforTestSet(data_dir)
        self.get_train_validationset()
        self.__save_final_data()
        return
 def __do_one_hot_encodings(self):
     """One-hot encode cross features consistently across all data sources.

     A single encoder is fitted on the stacked train/test1/test2 columns,
     over-wide encodings are filtered out, and each source is then
     transformed separately and written back into self.res_data_dict.
     """
     train_key = g_singletonDataFilePath.getTrainDir()
     test1_key = g_singletonDataFilePath.getTest1Dir()
     test2_key = g_singletonDataFilePath.getTest2Dir()
     df_train, cv = self.res_data_dict[train_key]
     df_testset1 = self.res_data_dict[test1_key]
     df_testset2 = self.res_data_dict[test2_key]
     enc = OneHotEncoder(sparse=False)
     feature_names = [v for _, v in self.__get_label_encode_dict().iteritems()]
     # Fit on all three sources together so category codes agree.
     enc.fit(pd.concat([df_train[feature_names],
                        df_testset1[feature_names],
                        df_testset2[feature_names]], axis=0))

     enc, feature_names = self.__filter_too_big_onehot_encoding(
         enc, feature_names, df_train, df_testset1, df_testset2)
     # Transform each data source independently.
     self.res_data_dict[train_key] = (
         self.__do_one_hot_encoding(df_train, enc, feature_names), cv)
     self.res_data_dict[test1_key] = self.__do_one_hot_encoding(
         df_testset1, enc, feature_names)
     self.res_data_dict[test2_key] = self.__do_one_hot_encoding(
         df_testset2, enc, feature_names)
     return
# Esempio n. 14
# 0
    def run(self):
        self.unitTest()
        data_dir = g_singletonDataFilePath.getTest2Dir()
#         self.get_gap_meanmedian_dict()
#         self.saveAllGapCsv(data_dir)
#         self.combine_gap_csv(data_dir)
        df = self.load_gapdf(data_dir)
        res = self.get_outlier_threshold(df['gap'])
#         self.get_gap_dict(data_dir)
        
        return
    def __do_prepare_data(self):
        """Run the full preprocessing pipeline exactly once.

        Skips all work when self.res_data_dict is already populated.
        """
        if self.res_data_dict:
            # Data has already been preprocessed.
            return
        self.__get_feature_label()
        self.__get_feature_for_test_set(g_singletonDataFilePath.getTest2Dir())
        self.__get_feature_for_test_set(g_singletonDataFilePath.getTest1Dir())
        self.__do_label_encoding()
        self.__do_one_hot_encodings()
        return
# Esempio n. 16
# 0
    def run(self):
        """Load the test2 gap data and evaluate its outlier threshold.

        CSV/dict generation steps are one-off preprocessing, left disabled.
        """
        self.unitTest()
        data_dir = g_singletonDataFilePath.getTest2Dir()
        #         self.get_gap_meanmedian_dict()
        #         self.saveAllGapCsv(data_dir)
        #         self.combine_gap_csv(data_dir)
        gap_df = self.load_gapdf(data_dir)
        # Drop the dead `res` binding; keep the call for its side effects.
        self.get_outlier_threshold(gap_df['gap'])
        #         self.get_gap_dict(data_dir)

        return
    def __do_prepare_data(self):
        """Prepare features, labels and encodings; no-op if already done."""
        if len(self.res_data_dict) != 0:
            return  # already preprocessed
        self.__get_feature_label()
        for test_dir in (g_singletonDataFilePath.getTest2Dir(),
                         g_singletonDataFilePath.getTest1Dir()):
            self.__get_feature_for_test_set(test_dir)
        self.__do_label_encoding()
        self.__do_one_hot_encodings()
        return
 def __do_label_encoding(self):
     """Label-encode each cross feature consistently across data sources.

     For every cross-feature column the encoder is fitted on the stacked
     train/test1/test2 values so the integer codes agree, then each frame's
     column is replaced in place by its encoded values.
     """
     df_train, _ = self.res_data_dict[g_singletonDataFilePath.getTrainDir()]
     df_testset1 = self.res_data_dict[g_singletonDataFilePath.getTest1Dir()]
     df_testset2 = self.res_data_dict[g_singletonDataFilePath.getTest2Dir()]
     encoder = LabelEncoder()
     for _, feature in self.__get_label_encode_dict().iteritems():
         stacked = pd.concat([df_train[feature],
                              df_testset1[feature],
                              df_testset2[feature]], axis=0)
         encoder.fit(stacked)
         for frame in (df_train, df_testset1, df_testset2):
             frame[feature] = encoder.transform(frame[feature])

     return
    def __do_label_encoding(self):
        """Fit one LabelEncoder per cross feature on all sources and apply it.

        Fitting on the concatenated train/test1/test2 column guarantees every
        source shares the same integer code for the same raw value.
        """
        df_train, _ = self.res_data_dict[g_singletonDataFilePath.getTrainDir()]
        df_testset1 = self.res_data_dict[g_singletonDataFilePath.getTest1Dir()]
        df_testset2 = self.res_data_dict[g_singletonDataFilePath.getTest2Dir()]
        le = LabelEncoder()
        all_frames = [df_train, df_testset1, df_testset2]
        for _, col in self.__get_label_encode_dict().iteritems():
            le.fit(pd.concat([frame[col] for frame in all_frames], axis=0))
            df_train[col] = le.transform(df_train[col])
            df_testset1[col] = le.transform(df_testset1[col])
            df_testset2[col] = le.transform(df_testset2[col])

        return