Beispiel #1
0
    def disp_gap_by_district_type(self):
        """Scatter the gap against every district/POI type on a 7x4 grid.

        Loads (or builds and caches) the per-district POI feature frame,
        joins it onto the gap frame, then plots gap vs. each district-type
        column on shared-axis subplots.
        """
        df = self.gapdf
        data_dir = g_singletonDataFilePath.getTrainDir()

        # POI features are cached per data directory.
        dumpfile_path = '../data_preprocessed/' + data_dir.split('/')[-2] + '_poi.df.pickle'
        dumpload = DumpLoad(dumpfile_path)
        if dumpload.isExisiting():
            temp_df = dumpload.load()
        else:
            poi_dict = self.get_district_type_dict()
            temp_df = self.X_y_Df[['start_district_id']].apply(
                self.find_poi, axis=1, poi_dict=poi_dict)
            dumpload.dump(temp_df)

        df = pd.concat([df, temp_df], axis=1)
        dt_list = self.get_district_type_list()

        row_len, col_len = 7, 4
        _, axarr = plt.subplots(row_len, col_len, sharex=True, sharey=True)
        # Only the first row_len * col_len district types fit on the grid.
        for idx, item in enumerate(dt_list[:row_len * col_len]):
            r, c = divmod(idx, col_len)
            ax = axarr[r, c]
            ax.scatter(df[item], df['gap'])
            ax.set_ylabel('Gap')
            ax.set_xlabel(item)
        return
    def get_history_data_dict(self):
        """
        indexes for quick search
        key = 'start_district_id','time_id'
        value = 'gap
        its data includes those from train, test1, test2.
        """
        t0 = time()

        filename = "../data_preprocessed/"  + 'traintest_history_data.dict.pickle'
        
        dumpload = DumpLoad( filename)
        if dumpload.isExisiting():
            return dumpload.load()
        
        test1data_df = ExploreOrder().load_gapdf(g_singletonDataFilePath.getTest1Dir())
        test2data_df = ExploreOrder().load_gapdf(g_singletonDataFilePath.getTest2Dir())
        traindata_df = ExploreOrder().load_gapdf(g_singletonDataFilePath.getTrainDir())
        
        
        df = pd.concat([traindata_df, test1data_df,test2data_df],  axis=0)
        self.__fileter_earlier_date(df)
        res_dict = self.__generate_dict(df)            
       
        dumpload.dump(res_dict)
        print "dump weather dict:", round(time()-t0, 3), "s"
        return  res_dict
 def disp_gap_bytraffic(self):
     """Scatter the mean gap against each traffic1 level."""
     df = self.gapdf
     data_dir = g_singletonDataFilePath.getTrainDir()
     dumpfile_path = '../data_preprocessed/' + data_dir.split('/')[-2] + '_prevtraffic.df.pickle'
     cache = DumpLoad(dumpfile_path)
     if cache.isExisiting():
         temp_df = cache.load()
     else:
         traffic_dict = self.get_traffic_dict(data_dir)
         temp_df = self.X_y_Df[['start_district_id', 'time_slotid']].apply(
             self.find_prev_traffic, axis=1, traffic_dict=traffic_dict,
             pre_num=3)
         cache.dump(temp_df)

     df = pd.concat([df, temp_df], axis=1)

     # Mean gap per traffic1 bucket; groupby yields keys in sorted order,
     # matching the manual (name, group) loop it replaces.
     means = df.groupby('traffic1')['gap'].mean()
     plt.scatter(list(means.index), list(means.values))
     return
    def disp_gap_bytraffic(self):
        """Plot the mean gap per traffic1 level as a scatter chart."""
        gap_df = self.gapdf
        data_dir = g_singletonDataFilePath.getTrainDir()
        dumpfile_path = ('../data_preprocessed/' +
                         data_dir.split('/')[-2] + '_prevtraffic.df.pickle')
        dumpload = DumpLoad(dumpfile_path)
        if dumpload.isExisiting():
            temp_df = dumpload.load()
        else:
            traffic_dict = self.get_traffic_dict(data_dir)
            key_cols = self.X_y_Df[['start_district_id', 'time_slotid']]
            temp_df = key_cols.apply(self.find_prev_traffic, axis=1,
                                     traffic_dict=traffic_dict, pre_num=3)
            dumpload.dump(temp_df)

        gap_df = pd.concat([gap_df, temp_df], axis=1)

        xs, ys = [], []
        for level, grp in gap_df.groupby('traffic1'):
            xs.append(level)
            ys.append(grp['gap'].mean())
        plt.scatter(xs, ys)
        return
Beispiel #5
0
    def __normalize(self):
        """Standardize train/val/test splits using train-set statistics.

        Casts every split to float32, subtracts the per-feature mean of the
        training set and divides by its standard deviation.  The (mean, std)
        pair is pickled on first use so later runs can reuse it.
        """
        self.X_train = self.X_train.astype(np.float32)
        self.X_val = self.X_val.astype(np.float32)
        self.X_test = self.X_test.astype(np.float32)

        # Statistics come from the training split only, to avoid leakage.
        mu = np.mean(self.X_train, axis=0)
        sigma = np.std(self.X_train, axis=0)

        self.X_train = (self.X_train - mu) / sigma
        self.X_val = (self.X_val - mu) / sigma
        self.X_test = (self.X_test - mu) / sigma

        cache = DumpLoad('../data/meanstdimage.pickle')
        if not cache.isExisiting():
            cache.dump((mu, sigma))
        return
Beispiel #6
0
    def get_history_data_dict(self):
        """
        indexes for quick search
        key = 'start_district_id','time_id'
        value = 'gap
        its data includes those from train, test1, test2.
        """
        t0 = time()

        filename = "../data_preprocessed/" + 'traintest_history_data.dict.pickle'

        dumpload = DumpLoad(filename)
        if dumpload.isExisiting():
            return dumpload.load()

        test1data_df = ExploreOrder().load_gapdf(
            g_singletonDataFilePath.getTest1Dir())
        test2data_df = ExploreOrder().load_gapdf(
            g_singletonDataFilePath.getTest2Dir())
        traindata_df = ExploreOrder().load_gapdf(
            g_singletonDataFilePath.getTrainDir())

        df = pd.concat([traindata_df, test1data_df, test2data_df], axis=0)
        self.__fileter_earlier_date(df)
        res_dict = self.__generate_dict(df)

        dumpload.dump(res_dict)
        print "dump weather dict:", round(time() - t0, 3), "s"
        return res_dict
 def add_history_data(self, data_dir):
     """Join cached per-(district, time_id) history-gap features onto X_y_Df."""
     dumpfile_path = ('../data_preprocessed/' +
                      data_dir.split('/')[-2] + '_history_data.df.pickle')
     cache = DumpLoad(dumpfile_path)
     if cache.isExisiting():
         df = cache.load()
     else:
         history_dict = self.get_history_data_dict()
         df = self.X_y_Df[['start_district_id', 'time_id']].apply(
             self.find_history_data, axis=1, history_dict=history_dict)
         cache.dump(df)
     self.X_y_Df = pd.concat([self.X_y_Df, df], axis=1)
     return
 def add_district_gap_sum(self):
     """Attach each row's district-level total gap as a new column."""
     dumpfile_path = ('../data_preprocessed/' +
                      'training_data_district_gap_sum.dict.pickle')
     cache = DumpLoad(dumpfile_path)
     if cache.isExisiting():
         district_gap_sum_dict = cache.load()
     else:
         district_gap_sum_dict = (
             self.X_y_Df.groupby('start_district_id')['gap'].sum().to_dict())
         cache.dump(district_gap_sum_dict)

     self.X_y_Df["district_gap_sum"] = (
         self.X_y_Df["start_district_id"].map(district_gap_sum_dict))
     return
 def add_pre_gaps(self, data_dir):
     """Join the previous-three-slot gap features onto X_y_Df."""
     dumpfile_path = ('../data_preprocessed/' +
                      data_dir.split('/')[-2] + '_prevgap.df.pickle')
     cache = DumpLoad(dumpfile_path)
     if cache.isExisiting():
         prev_gaps = cache.load()
     else:
         gap_dict = self.get_gap_dict(data_dir)
         prev_gaps = self.X_y_Df[['start_district_id', 'time_slotid']].apply(
             self.find_prev_gap, axis=1, pre_num=3, gap_dict=gap_dict)
         cache.dump(prev_gaps)
     self.X_y_Df = pd.concat([self.X_y_Df, prev_gaps], axis=1)
     return
 def __add_poi(self, data_dir):
     """Join the per-district POI feature columns onto X_y_Df."""
     dumpfile_path = ('../data_preprocessed/' +
                      data_dir.split('/')[-2] + '_poi.df.pickle')
     cache = DumpLoad(dumpfile_path)
     if cache.isExisiting():
         poi_df = cache.load()
     else:
         poi_dict = self.get_district_type_dict()
         poi_df = self.X_y_Df[['start_district_id']].apply(
             self.find_poi, axis=1, poi_dict=poi_dict)
         cache.dump(poi_df)
     self.X_y_Df = pd.concat([self.X_y_Df, poi_df], axis=1)
     return
    def add_district_gap_sum(self):
        """Map the per-district total gap onto every row of X_y_Df."""
        cache = DumpLoad('../data_preprocessed/'
                         'training_data_district_gap_sum.dict.pickle')
        if cache.isExisiting():
            gap_sums = cache.load()
        else:
            by_district = self.X_y_Df.groupby('start_district_id')['gap']
            gap_sums = by_district.sum().to_dict()
            cache.dump(gap_sums)

        district_ids = self.X_y_Df["start_district_id"]
        self.X_y_Df["district_gap_sum"] = district_ids.map(gap_sums)
        return
 def add_prev_weather(self, data_dir):
     """Join the previous-slot weather feature onto X_y_Df, then flag rain."""
     dumpfile_path = ('../data_preprocessed/' +
                      data_dir.split('/')[-2] + '_prevweather.df.pickle')
     cache = DumpLoad(dumpfile_path)
     if cache.isExisiting():
         weather_df = cache.load()
     else:
         weather_dict = self.get_weather_dict(data_dir)
         weather_df = self.X_y_Df['time_slotid'].apply(
             self.find_prev_weather_mode, weather_dict=weather_dict)
         cache.dump(weather_df)
     self.X_y_Df = pd.concat([self.X_y_Df, weather_df], axis=1)
     self.add_rain_check()
     return
 def add_history_data(self, data_dir):
     """Append cached historical-gap feature columns to X_y_Df."""
     pickle_path = ('../data_preprocessed/' + data_dir.split('/')[-2] +
                    '_history_data.df.pickle')
     store = DumpLoad(pickle_path)
     if store.isExisiting():
         history_df = store.load()
     else:
         lookup = self.get_history_data_dict()
         keys = self.X_y_Df[['start_district_id', 'time_id']]
         history_df = keys.apply(self.find_history_data, axis=1,
                                 history_dict=lookup)
         store.dump(history_df)
     self.X_y_Df = pd.concat([self.X_y_Df, history_df], axis=1)
     return
 def add_pre_gaps(self, data_dir):
     """Append previous-gap feature columns (3 prior slots) to X_y_Df."""
     pickle_path = ('../data_preprocessed/' + data_dir.split('/')[-2] +
                    '_prevgap.df.pickle')
     store = DumpLoad(pickle_path)
     if store.isExisiting():
         gaps_df = store.load()
     else:
         lookup = self.get_gap_dict(data_dir)
         keys = self.X_y_Df[['start_district_id', 'time_slotid']]
         gaps_df = keys.apply(self.find_prev_gap, axis=1, pre_num=3,
                              gap_dict=lookup)
         store.dump(gaps_df)
     self.X_y_Df = pd.concat([self.X_y_Df, gaps_df], axis=1)
     return
    def add_prev_weather(self, data_dir):
        """Append the previous-slot weather feature and a rain flag."""
        pickle_path = ('../data_preprocessed/' + data_dir.split('/')[-2] +
                       '_prevweather.df.pickle')
        store = DumpLoad(pickle_path)
        if store.isExisiting():
            weather_df = store.load()
        else:
            lookup = self.get_weather_dict(data_dir)
            weather_df = self.X_y_Df['time_slotid'].apply(
                self.find_prev_weather_mode, weather_dict=lookup)
            store.dump(weather_df)
        self.X_y_Df = pd.concat([self.X_y_Df, weather_df], axis=1)
        self.add_rain_check()
        return
    def __add_poi(self, data_dir):
        """Append POI (district-type) feature columns to X_y_Df."""
        pickle_path = ('../data_preprocessed/' + data_dir.split('/')[-2] +
                       '_poi.df.pickle')
        store = DumpLoad(pickle_path)
        if store.isExisiting():
            poi_df = store.load()
        else:
            lookup = self.get_district_type_dict()
            poi_df = self.X_y_Df[['start_district_id']].apply(
                self.find_poi, axis=1, poi_dict=lookup)
            store.dump(poi_df)
        self.X_y_Df = pd.concat([self.X_y_Df, poi_df], axis=1)
        return
    def get_traffic_dict(self, data_dir):
        t0 = time()
        filename = '../data_raw/' + data_dir.split(
            '/')[-2] + '_traffic.csv.dict.pickle'
        dumpload = DumpLoad(filename)
        if dumpload.isExisiting():
            return dumpload.load()

        resDict = {}
        df = self.load_trafficdf(data_dir)
        for _, row in df.iterrows():
            resDict[tuple(row[['start_district_id',
                               'time_slotid']].tolist())] = row['traffic']

        dumpload.dump(resDict)
        print "dump traffic dict:", round(time() - t0, 3), "s"
        return resDict
    def disp_names(self, sorted_inds, probabilities, include_background=True):
        """Print the top-5 class names with their probabilities.

        When include_background is False the label table is offset by one
        (no background entry), hence the +1 on the name index.
        """
        labels_cache = DumpLoad("../../data/imagenet/imagenet_labels_dict.pickle")
        if labels_cache.isExisiting():
            names = labels_cache.load()
        else:
            names = imagenet.create_readable_names_for_imagenet_labels()
            labels_cache.dump(names)

        # Hoist the background offset out of the loop instead of branching
        # on every iteration.
        offset = 0 if include_background else 1
        for rank in range(5):
            idx = sorted_inds[rank]
            print('Probability %0.2f%% => [%s]' %
                  (probabilities[idx], names[idx + offset]))
        return
    def get_gap_meanmedian_dict(self):
        """Build (or load) a (district, time_id) -> list-of-gap-values dict."""
        data_dir = g_singletonDataFilePath.getTrainDir()
        filename = data_dir + 'order_data/temp/gap_meanmedian.dict.pickle'
        cache = DumpLoad(filename)
        if cache.isExisiting():
            return cache.load()

        gap_df = self.load_gapdf(data_dir)
        resDict = {
            key: grp['gap'].tolist()
            for key, grp in gap_df.groupby(['start_district_id', 'time_id'])
        }

        cache.dump(resDict)
        return resDict
    def get_gap_meanmedian_dict(self):
        """Return a dict mapping (district, time_id) to its list of gaps."""
        data_dir = g_singletonDataFilePath.getTrainDir()
        filename = data_dir + 'order_data/temp/gap_meanmedian.dict.pickle'
        store = DumpLoad(filename)
        if store.isExisiting():
            return store.load()

        result = {}
        gap_df = self.load_gapdf(data_dir)
        for key, group in gap_df.groupby(['start_district_id', 'time_id']):
            result[key] = group['gap'].tolist()

        store.dump(result)
        return result
 def get_weather_dict(self,data_dir):
     t0 = time()
     filename = '../data_raw/' + data_dir.split('/')[-2] + '_weather.csv.dict.pickle'
     dumpload = DumpLoad( filename)
     if dumpload.isExisiting():
         return dumpload.load()
     
     resDict = {}
     df = self.load_weatherdf(data_dir)
     for index, row in df.iterrows():
         resDict[row['time_slotid']] = (index, row['weather'], row['temparature'], row['pm25'])
     for name, group in df.groupby('time_date'):
         resDict[name] = (-1, mode(group['weather'])[0][0], mode(group['temparature'])[0][0], mode(group['pm25'])[0][0])
         
    
     dumpload.dump(resDict)
     print "dump weather dict:", round(time()-t0, 3), "s"
     return resDict
    def disp_gap_by_district_type(self):
        """Scatter gap against every district/POI type on a 7x4 subplot grid."""
        gap_df = self.gapdf
        data_dir = g_singletonDataFilePath.getTrainDir()

        # Cached POI features, keyed by the data directory name.
        dumpfile_path = ('../data_preprocessed/' +
                         data_dir.split('/')[-2] + '_poi.df.pickle')
        store = DumpLoad(dumpfile_path)
        if store.isExisiting():
            poi_df = store.load()
        else:
            poi_dict = self.get_district_type_dict()
            poi_df = self.X_y_Df[['start_district_id']].apply(
                self.find_poi, axis=1, poi_dict=poi_dict)
            store.dump(poi_df)

        gap_df = pd.concat([gap_df, poi_df], axis=1)
        dt_list = self.get_district_type_list()

        row_len, col_len = 7, 4
        _, axarr = plt.subplots(row_len, col_len, sharex=True, sharey=True)
        # Plot at most one district type per grid cell.
        n_plots = min(len(dt_list), row_len * col_len)
        for idx in range(n_plots):
            r = idx // col_len
            c = idx % col_len
            axes = axarr[r, c]
            axes.scatter(gap_df[dt_list[idx]], gap_df['gap'])
            axes.set_ylabel('Gap')
            axes.set_xlabel(dt_list[idx])
        return
 def get_gap_dict(self, data_dir):
     """
     indexes for quick search
     key = 'start_district_id','time_slotid
     value = gap
     """
     t0 = time()
     filename = "../data_preprocessed/" + data_dir.split('/')[-2] + '_gap.csv.dict.pickle'
     dumpload = DumpLoad( filename)
     if dumpload.isExisiting():
         return dumpload.load()
     
     resDict = {}
     df = self.load_gapdf(data_dir)
     for _, row in df.iterrows():
         resDict[tuple(row[['start_district_id','time_slotid']].tolist())] = row['gap']
     
     dumpload.dump(resDict)
     print "dump gapdict:", round(time()-t0, 3), "s"
     return resDict
 def disp_gap_byweather(self):
     """Bar-plot the mean gap for each preceding-slot weather value."""
     df = self.gapdf
     data_dir = g_singletonDataFilePath.getTrainDir()
     dumpfile_path = ('../data_preprocessed/' +
                      data_dir.split('/')[-2] + '_prevweather.df.pickle')
     cache = DumpLoad(dumpfile_path)
     if cache.isExisiting():
         weather_df = cache.load()
     else:
         weather_dict = self.get_weather_dict(data_dir)
         weather_df = self.X_y_Df['time_slotid'].apply(
             self.find_prev_weather_mode, weather_dict=weather_dict)
         cache.dump(weather_df)

     df = pd.concat([df, weather_df], axis=1)

     df.groupby('preweather')['gap'].mean().plot(kind='bar')
     plt.ylabel('Mean of gap')
     plt.xlabel('Weather')
     plt.title('Weather/Gap Correlation')
     return
Beispiel #25
0
 def disp_gap_byweather(self):
     """Show how the mean gap varies with the previous slot's weather."""
     gap_df = self.gapdf
     data_dir = g_singletonDataFilePath.getTrainDir()
     pickle_path = ('../data_preprocessed/' +
                    data_dir.split('/')[-2] + '_prevweather.df.pickle')
     store = DumpLoad(pickle_path)
     if store.isExisiting():
         prev_weather = store.load()
     else:
         lookup = self.get_weather_dict(data_dir)
         prev_weather = self.X_y_Df['time_slotid'].apply(
             self.find_prev_weather_mode, weather_dict=lookup)
         store.dump(prev_weather)

     gap_df = pd.concat([gap_df, prev_weather], axis=1)

     gaps_mean = gap_df.groupby('preweather')['gap'].mean()
     gaps_mean.plot(kind='bar')
     plt.ylabel('Mean of gap')
     plt.xlabel('Weather')
     plt.title('Weather/Gap Correlation')
     return
Beispiel #26
0
    def get_weather_dict(self, data_dir):
        t0 = time()
        filename = '../data_raw/' + data_dir.split(
            '/')[-2] + '_weather.csv.dict.pickle'
        dumpload = DumpLoad(filename)
        if dumpload.isExisiting():
            return dumpload.load()

        resDict = {}
        df = self.load_weatherdf(data_dir)
        for index, row in df.iterrows():
            resDict[row['time_slotid']] = (index, row['weather'],
                                           row['temparature'], row['pm25'])
        for name, group in df.groupby('time_date'):
            resDict[name] = (-1, mode(group['weather'])[0][0],
                             mode(group['temparature'])[0][0],
                             mode(group['pm25'])[0][0])

        dumpload.dump(resDict)
        print "dump weather dict:", round(time() - t0, 3), "s"
        return resDict
    def get_gap_dict(self, data_dir):
        """
        Build (or load) an index for quick search.

        key   = ('start_district_id', 'time_slotid')
        value = gap
        """
        t0 = time()
        filename = ('../data_preprocessed/' + data_dir.split('/')[-2] +
                    '_gap.csv.dict.pickle')
        store = DumpLoad(filename)
        if store.isExisiting():
            return store.load()

        gap_df = self.load_gapdf(data_dir)
        resDict = {}
        for _, record in gap_df.iterrows():
            district_slot = tuple(
                record[['start_district_id', 'time_slotid']].tolist())
            resDict[district_slot] = record['gap']

        store.dump(resDict)
        print("dump gapdict:", round(time() - t0, 3), "s")
        return resDict
Beispiel #28
0
		dictionary[word] = len(dictionary)
	data = list()
	unk_count = 0
	for word in words:
		if word in dictionary:
			index = dictionary[word]
		else:
			index = 0	# dictionary['UNK']
			unk_count = unk_count + 1
		data.append(index)
	count[0][1] = unk_count
	reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys())) 
	return data, count, dictionary, reverse_dictionary

# Build-or-load cache for the processed corpus: the first run computes
# build_dataset(words) and pickles the 4-tuple; later runs just load it.
rawdata = DumpLoad('./data/rawdata.pickle')
if not rawdata.isExisiting():
# 	data, count, dictionary, reverse_dictionary = build_dataset(words)
	rawdata.dump(build_dataset(words))
data, count, dictionary, reverse_dictionary = rawdata.load()

print('Most common words (+UNK)', count[:5])
print('Sample data', data[:10])
del words	# Hint to reduce memory.


# Global cursor into `data`, advanced by generate_batch across calls.

data_index = 0

def generate_batch(batch_size, num_skips, skip_window):
	global data_index
	assert batch_size % num_skips == 0