Esempio n. 1
0
    def __read_data__(self):
        self.scaler = StandardScaler()
        df_raw = pd.read_csv(os.path.join(self.root_path,
                                          self.data_path))

        border1s = [0, 12*30*24 - self.seq_len, 12*30*24+4*30*24 - self.seq_len]
        border2s = [12*30*24, 12*30*24+4*30*24, 12*30*24+8*30*24]
        border1 = border1s[self.set_type]
        border2 = border2s[self.set_type]
        
        if self.features=='M' or self.features=='MS':
            cols_data = df_raw.columns[1:]
            df_data = df_raw[cols_data]
        elif self.features=='S':
            df_data = df_raw[[self.target]]

        if self.scale:
            train_data = df_data[border1s[0]:border2s[0]]
            self.scaler.fit(train_data.values)
            data = self.scaler.transform(df_data.values)
        else:
            data = df_data.values
            
        df_stamp = df_raw[['date']][border1:border2]
        df_stamp['date'] = pd.to_datetime(df_stamp.date)
        data_stamp = time_features(df_stamp, timeenc=self.timeenc, freq=self.freq)

        self.data_x = data[border1:border2]
        if self.inverse:
            self.data_y = df_data.values[border1:border2]
        else:
            self.data_y = data[border1:border2]
        self.data_stamp = data_stamp
Esempio n. 2
0
    def __read_data__(self):
        self.scaler = StandardScaler()
        df_raw = pd.read_csv(os.path.join(self.root_path,
                                          self.data_name))

        # 0,
        # num of hours in a year - num of hours in 4 days,
        # num of hours in a year + num of hours in 4 months - num of hours in 4 days
        border1s = [0, 12*30*24 - self.seq_len, 12*30*24+4*30*24 - self.seq_len]
        # num of hours in a year,
        # num of hours in a year + num of hours in 4 months,
        # num of hours in a year + num of hours in 8 months
        border2s = [12*30*24, 12*30*24+4*30*24, 12*30*24+8*30*24]
        border1 = border1s[self.set_type]
        border2 = border2s[self.set_type]


        if self.features=='M' or self.features=='MS':
            cols_data = df_raw.columns[1:]
            df_data = df_raw[cols_data]
        elif self.features=='S':
            df_data = df_raw[[self.target]]

        #  print(df_data.shape)

        if self.scale:
            train_data = df_data[border1s[0]:border2s[0]]
            self.scaler.fit(train_data.values)
            data = self.scaler.transform(df_data.values)
        else:
            data = df_data.values


        # retrieve one year's record
        df_stamp = df_raw[['date']][border1:border2]
        df_stamp['date'] = pd.to_datetime(df_stamp.date)
        if self.timeenc == 0:
            df_stamp['month'] = df_stamp.date.apply(lambda row: (row.month))
            df_stamp['day'] = df_stamp.date.apply(lambda row: row.day)
            df_stamp['weekday'] = df_stamp.date.apply(lambda row: row.weekday())
            df_stamp['hour'] = df_stamp.date.apply(lambda row: row.hour)
            data_stamp = df_stamp.drop(['date'], axis=1).values
        elif self.timeenc == 1:
            data_stamp = time_features(pd.to_datetime(df_stamp['date'].values), freq=self.freq)
            data_stamp = data_stamp.transpose(1,0)


        #  print(df_stamp)
        #  print(data_stamp)


        # x and y are identical here
        self.data_x = data[border1:border2]
        self.data_y = data[border1:border2]
        self.data_stamp = data_stamp
    def __read_data__(self):
        self.scaler = StandardScaler()
        df_raw = pd.read_csv(os.path.join(self.root_path, self.data_path))
        '''
        df_raw.columns: ['date', ...(other features), target feature]
        '''
        # cols = list(df_raw.columns);
        if self.cols:
            cols = self.cols.copy()
            cols.remove(self.target)
        else:
            cols = list(df_raw.columns)
            # print(cols)
            cols.remove(self.target)
            cols.remove('date')
        df_raw = df_raw[['date'] + cols + [self.target]]

        num_train = int(len(df_raw) * 0.7)
        num_test = int(len(df_raw) * 0.2)
        num_vali = len(df_raw) - num_train - num_test
        border1s = [
            0, num_train - self.seq_len,
            len(df_raw) - num_test - self.seq_len
        ]
        border2s = [num_train, num_train + num_vali, len(df_raw)]
        border1 = border1s[self.set_type]
        border2 = border2s[self.set_type]

        if self.features == 'M' or self.features == 'MS':
            cols_data = df_raw.columns[1:]
            df_data = df_raw[cols_data]
        elif self.features == 'S':
            df_data = df_raw[[self.target]]

        if self.scale:
            train_data = df_data[border1s[0]:border2s[0]]
            self.scaler.fit(train_data.values)
            data = self.scaler.transform(df_data.values)
        else:
            data = df_data.values

        df_stamp = df_raw[['date']][border1:border2]
        df_stamp['date'] = pd.to_datetime(df_stamp.date)
        data_stamp = time_features(df_stamp,
                                   timeenc=self.timeenc,
                                   freq=self.freq)

        self.data_x = data[border1:border2]
        if self.inverse:
            self.data_y = df_data.values[border1:border2]
        else:
            self.data_y = data[border1:border2]
        self.data_stamp = data_stamp
Esempio n. 4
0
    def __read_data__(self):
        self.scaler = StandardScaler()
        df_raw = pd.read_csv(os.path.join(self.root_path,
                                          self.data_path))
        if self.delete_column_list:
            df_raw = df_raw.drop(self.delete_column_list, axis=1)
        if self.date_column:
            df_raw.rename(columns={self.date_column: 'date'},inplace = True)
        print(df_raw.columns)
        '''
        df_raw.columns: ['date', ...(other features), target feature]
        '''
        cols = list(df_raw.columns); cols.remove(self.target)
        df_raw = df_raw[cols+[self.target]]

        num_train = int(len(df_raw)*0.7)
        num_test = int(len(df_raw)*0.2)
        num_vali = len(df_raw) - num_train - num_test
        border1s = [0, num_train-self.seq_len, len(df_raw)-num_test-self.seq_len]
        border2s = [num_train, num_train+num_vali, len(df_raw)]
        border1 = border1s[self.set_type]
        border2 = border2s[self.set_type]
        
        if self.features=='M' or self.features=='MS':
            cols_data = df_raw.columns[1:]
            df_data = df_raw[cols_data]
        elif self.features=='S':
            df_data = df_raw[[self.target]]

        if self.scale:
            train_data = df_data[border1s[0]:border2s[0]]
            self.scaler.fit(train_data.values)
            data = self.scaler.transform(df_data.values)
        else:
            data = df_data.values
            
        df_stamp = df_raw[['date']][border1:border2]
        df_stamp['date'] = pd.to_datetime(df_stamp.date)
        print(df_stamp.head())
        if self.timeenc==0:
            df_stamp['month'] = df_stamp.date.apply(lambda row:row.month,1)
            df_stamp['day'] = df_stamp.date.apply(lambda row:row.day,1)
            df_stamp['weekday'] = df_stamp.date.apply(lambda row:row.weekday(),1)
            df_stamp['hour'] = df_stamp.date.apply(lambda row:row.hour,1)
            data_stamp = df_stamp.drop(['date'],1).values
        elif self.timeenc==1:
            data_stamp = time_features(pd.to_datetime(df_stamp['date'].values), freq=self.freq)
            data_stamp = data_stamp.transpose(1,0)

        print(df_stamp.head())
        self.data_x = data[border1:border2]
        self.data_y = data[border1:border2]
        self.data_stamp = data_stamp
    def __read_data__(self):
        self.scaler = StandardScaler()
        df_raw = pd.read_csv(os.path.join(self.root_path, self.data_path))
        '''
        df_raw.columns: ['date', ...(other features), target feature]
        '''
        if self.cols:
            cols = self.cols.copy()
            cols.remove(self.target)
        else:
            cols = list(df_raw.columns)
            cols.remove(self.target)
            cols.remove('date')
        df_raw = df_raw[['date'] + cols + [self.target]]

        # 予測では最後の値を使う
        border1 = len(df_raw) - self.seq_len
        border2 = len(df_raw)

        if self.features == 'M' or self.features == 'MS':
            cols_data = df_raw.columns[1:]
            df_data = df_raw[cols_data]
        elif self.features == 'S':
            df_data = df_raw[[self.target]]

        if self.scale:
            self.scaler.fit(df_data.values)
            data = self.scaler.transform(df_data.values)
        else:
            data = df_data.values

        tmp_stamp = df_raw[['date']][border1:border2]
        tmp_stamp['date'] = pd.to_datetime(tmp_stamp.date)
        pred_dates = pd.date_range(tmp_stamp.date.values[-1],
                                   periods=self.pred_len + 1,
                                   freq=self.freq)
        # priods 個数指定

        df_stamp = pd.DataFrame(columns=['date'])
        df_stamp.date = list(tmp_stamp.date.values) + list(pred_dates[1:])
        data_stamp = time_features(df_stamp,
                                   timeenc=self.timeenc,
                                   freq=self.freq[-1:])

        self.data_x = data[border1:border2]
        if self.inverse:
            self.data_y = df_data.values[border1:border2]
        else:
            self.data_y = data[border1:border2]
        self.data_stamp = data_stamp
Esempio n. 6
0
    def __read_data__(self):
        self.scaler = StandardScaler()
        df_raw = pd.read_csv(os.path.join(self.root_path, self.data_path))

        border1s = [
            0, 12 * 30 * 24 * 4 - self.seq_len,
            12 * 30 * 24 * 4 + 4 * 30 * 24 * 4 - self.seq_len
        ]
        border2s = [
            12 * 30 * 24 * 4, 12 * 30 * 24 * 4 + 4 * 30 * 24 * 4,
            12 * 30 * 24 * 4 + 8 * 30 * 24 * 4
        ]
        border1 = border1s[self.set_type]
        border2 = border2s[self.set_type]

        if self.features == 'M' or self.features == 'MS':
            cols_data = df_raw.columns[1:]
            df_data = df_raw[cols_data]
        elif self.features == 'S':
            df_data = df_raw[[self.target]]

        if self.scale:
            train_data = df_data[border1s[0]:border2s[0]]
            self.scaler.fit(train_data.values)
            data = self.scaler.transform(df_data.values)
        else:
            data = df_data.values

        df_stamp = df_raw[['date']][border1:border2]
        df_stamp['date'] = pd.to_datetime(df_stamp.date)
        if self.timeenc == 0:
            df_stamp['month'] = df_stamp.date.apply(lambda row: row.month, 1)
            df_stamp['day'] = df_stamp.date.apply(lambda row: row.day, 1)
            df_stamp['weekday'] = df_stamp.date.apply(
                lambda row: row.weekday(), 1)
            df_stamp['hour'] = df_stamp.date.apply(lambda row: row.hour, 1)
            df_stamp['minute'] = df_stamp.date.apply(lambda row: row.minute, 1)
            df_stamp['minute'] = df_stamp.minute.map(lambda x: x // 15)
            data_stamp = df_stamp.drop(['date'], 1).values
        elif self.timeenc == 1:
            data_stamp = time_features(pd.to_datetime(df_stamp['date'].values),
                                       freq=self.freq)
            data_stamp = data_stamp.transpose(1, 0)

        self.data_x = data[border1:border2]
        self.data_y = data[border1:border2]
        self.data_stamp = data_stamp
Esempio n. 7
0
    def __read_data__(self):
        self.scaler = StandardScaler()
        df_raw = pd.read_csv(os.path.join(self.root_path,
                                          self.data_path))
        '''
        df_raw.columns: ['date', ...(other features), target feature]
        '''
        cols = list(df_raw.columns); cols.remove(self.target); cols.remove('date')
        df_raw = df_raw[['date']+cols+[self.target]]
        
        border1 = len(df_raw)-self.seq_len
        border2 = len(df_raw)
        
        if self.features=='M' or self.features=='MS':
            cols_data = df_raw.columns[1:]
            df_data = df_raw[cols_data]
        elif self.features=='S':
            df_data = df_raw[[self.target]]

        if self.scale:
            self.scaler.fit(df_data.values)
            data = self.scaler.transform(df_data.values)
        else:
            data = df_data.values
            
        tmp_stamp = df_raw[['date']][border1:border2]
        tmp_stamp['date'] = pd.to_datetime(tmp_stamp.date)
        pred_dates = pd.date_range(tmp_stamp.date.values[-1], periods=self.pred_len+1, freq=self.freq)
        
        df_stamp = pd.DataFrame(columns = ['date'])
        df_stamp.date = list(tmp_stamp.date.values) + list(pred_dates[1:])
        
        if self.timeenc==0:
            df_stamp['month'] = df_stamp.date.apply(lambda row:row.month,1)
            df_stamp['day'] = df_stamp.date.apply(lambda row:row.day,1)
            df_stamp['weekday'] = df_stamp.date.apply(lambda row:row.weekday(),1)
            df_stamp['hour'] = df_stamp.date.apply(lambda row:row.hour,1)
            data_stamp = df_stamp.drop(['date'],1).values
        elif self.timeenc==1:
            data_stamp = time_features(pd.to_datetime(df_stamp['date'].values), freq=self.freq[-1:])
            data_stamp = data_stamp.transpose(1,0)

        self.data_x = data[border1:border2]
        self.data_y = data[border1:border2]
        self.data_stamp = data_stamp
    def __read_data__(self):
        # __get_item__でデータを取得しやすくするための準備
        self.scaler = StandardScaler()

        df_raw = pd.read_csv(os.path.join(
            self.root_path, self.data_path))  # ここでは二次元?? というか次元数がない
        # 12 month 30 day 24 hour
        border1s = [
            0, 12 * 30 * 24 - self.seq_len,
            12 * 30 * 24 + 4 * 30 * 24 - self.seq_len
        ]
        # train の場合は0から一年
        # val : 1年4ヶ月の96時点
        # test : 一年4ヶ月から1年八ヶ月
        border2s = [
            12 * 30 * 24, 12 * 30 * 24 + 4 * 30 * 24,
            12 * 30 * 24 + 8 * 30 * 24
        ]
        border1 = border1s[self.set_type]  # 複数形
        border2 = border2s[self.set_type]

        # 説明変数の数 multi or single 多変量か単変量か?
        if self.features == 'M' or self.features == 'MS':
            cols_data = df_raw.columns[1:]  # 時刻の列いがい
            df_data = df_raw[cols_data]
        elif self.features == 'S':
            df_data = df_raw[[self.target]]

        if self.scale:  # default True 正規化をするかしないか
            train_data = df_data[border1s[0]:border2s[0]]  # trainなら0から2年
            self.scaler.fit(train_data.values)  # データの標準化 標準かをしないほうがいいのか まだしない
            # 正規化の実行 訓練データの平均と分散で正規化する 答えをカンニングしないということ?
            data = self.scaler.transform(df_data.values)
        else:
            data = df_data.values  # データの取得 nd array

        # これは画像であるPIL image または ndarrayのdata「Height×Width×Channel」を
        # Tensor型のdata「Channel×Height×Width」に変換するというもので,
        # transという変数がその機能を持つことを意味する.
        # なぜChannelの順が入れ替わっているかというと,機械学習をしていく上でChannelが最初のほうが
        # 都合が良いからだと思ってもらって良い.

        # 今の状態でchannelって何??

        df_stamp = df_raw[['date']][border1:border2]  # 必要な行数の時刻を取得
        df_stamp['date'] = pd.to_datetime(df_stamp.date)  # 辞書で呼び出せるように??
        # datatime型がnp.ndarray方になる
        # 二次元 時間方向 × その時刻の表現の埋め込み表現
        # ある時刻に対してその位置埋め込み表現の長さはバラバラ
        # あるデータに対しては同じ
        data_stamp = time_features(df_stamp,
                                   timeenc=self.timeenc,
                                   freq=self.freq)

        # data_xとdata_yで何が違う??

        self.data_x = data[border1:border2]  # データ 二次元
        if self.inverse:  # default false
            self.data_y = df_data.values[border1:border2]
        else:
            self.data_y = data[border1:border2]
            # transformしていた :
            # していなかった : values[border1:border2]
        self.data_stamp = data_stamp
    def __read_data__(self):
        self.scaler = StandardScaler()  # 標準化のため
        df_raw = pd.read_csv(os.path.join(self.root_path, self.data_path))
        '''
        df_raw.columns: ['date', ...(other features), target feature]
        '''
        # cols = list(df_raw.columns);
        # データの列に対して並び替えをする IndexはDatatime型にしない 列の'date'がindexになる
        # つまり自分のstepCountを使いたいなら'date'がindexになる
        if self.cols:
            cols = self.cols.copy()
            cols.remove(self.target)
        else:
            cols = list(df_raw.columns)
            cols.remove(self.target)
            cols.remove('date')
        df_raw = df_raw[['date'] + cols + [self.target]]

        num_train = int(len(df_raw) * 0.7)
        num_test = int(len(df_raw) * 0.2)
        num_vali = len(df_raw) - num_train - num_test  # つまり 1 - 0.7 - 0.2

        # 複数形のs
        # [0 訓練データまでのindex, validation 検証用データまでのindex, テストデータまでのindex]
        border1s = [
            0, num_train - self.seq_len,
            len(df_raw) - num_test - self.seq_len
        ]

        border2s = [num_train, num_train + num_vali, len(df_raw)]
        # 取り出したいデータの種類に合わせて
        border1 = border1s[self.set_type]  # どのindexから
        border2 = border2s[self.set_type]  # どのindexまでを取り出すのか

        if self.features == 'M' or self.features == 'MS':
            cols_data = df_raw.columns[1:]
            df_data = df_raw[cols_data]
        elif self.features == 'S':
            df_data = df_raw[[self.target]]

        if self.scale:  # 標準化するかどうか
            train_data = df_data[border1s[0]:border2s[0]]
            # axis = 0 で平均, 分散を計算
            self.scaler.fit(train_data.values)
            # 平均, 分散により標準化, dfのtransformではない
            data = self.scaler.transform(df_data.values)
        else:
            data = df_data.values

        df_stamp = df_raw[['date']][border1:border2]
        # dataframeからはドットで取り出すこともできる
        # indexはintで持っておいて, date列名にdatatime型で持っておく
        df_stamp['date'] = pd.to_datetime(df_stamp.date)
        # 最終的にmarkとして扱われる
        data_stamp = time_features(df_stamp,
                                   timeenc=self.timeenc,
                                   freq=self.freq)

        self.data_x = data[border1:border2]
        if self.inverse:
            self.data_y = df_data.values[border1:border2]
        else:
            self.data_y = data[border1:border2]
        self.data_stamp = data_stamp