def get_global_data_matrix_from_mongodb(self, start, end, features):
    """Assemble a [feature, stock, date] price panel from MongoDB.

    :param start: first date (inclusive) used to slice each stock's history
    :param end: last date (inclusive)
    :param features: list of field names to pull from each stored document
    :return: tuple (panel, dates) where panel is a pd.Panel indexed as
             [feature, stock, date] with gaps filled by panel_fillna, and
             dates is the sorted union of all dates seen in the range.
    """
    db = client[self._dbName]

    # Pass 1: collect the union of every date any stock traded in [start, end].
    date_union = set()
    for name in self.stockList:
        history = pd.DataFrame(list(db[name].find().sort('date')))
        history = history.set_index('date').loc[start:end]
        date_union.update(history.index)
    all_dates = sorted(date_union)

    panel = pd.Panel(items=features,
                     major_axis=self.stockList,
                     minor_axis=all_dates,
                     dtype=np.float32)

    # Pass 2: copy each requested feature column into its panel slice.
    for name in self.stockList:
        history = pd.DataFrame(list(db[name].find().sort('date')))[features + ['date']]
        history = history.set_index('date').loc[start:end]
        for feature in features:
            panel.loc[feature, name, history.index] = history[feature].squeeze()

    panel = panel_fillna(panel, 'both')
    return panel, all_dates
def get_global_dataframe(self, start, end, period=300, features=('close', )):
    """Build a time-indexed DataFrame of market history for selected coins.

    :param start: linux timestamp in seconds; rounded down to a multiple of `period`
    :param end: linux timestamp in seconds; rounded down likewise
    :param period: time interval of each data access point, in seconds
    :param features: tuple or list of feature names ("close", "open",
                     "volume", "high", "low")
    :return: a DataFrame indexed by time with (coin, feature) MultiIndex
             columns (the pd.Panel replacement), NaNs filled by panel_fillna
    :raises ValueError: if the selected coin count differs from
                        self._coin_number, or a feature is unsupported
    """
    start = int(start - (start % period))
    end = int(end - (end % period))
    # Coin selection ranks by volume over a window ending `volume_forward`
    # seconds before `end`.
    coins = self.select_coins(
        start=end - self.__volume_forward - self.__volume_average_days * DAY,
        end=end - self.__volume_forward)
    self.__coins = coins
    for coin in coins:
        self.update_data(start, end, coin)

    if len(coins) != self._coin_number:
        raise ValueError(
            "the length of selected coins %d is not equal to expected %d"
            % (len(coins), self._coin_number))

    logging.info("feature type list is %s" % str(features))
    self.__checkperiod(period)

    time_index = pd.to_datetime(list(range(start, end + 1, period)), unit='s')
    # DataFrame with (coin, feature) MultiIndex columns replaces the
    # deprecated pd.Panel.
    panel = pd.DataFrame(columns=pd.MultiIndex.from_product([coins, features]),
                         index=time_index, dtype="float64")

    connection = sqlite3.connect(DATABASE_DIR)
    try:
        for coin in coins:
            for feature in features:
                # NOTE: each candle is re-keyed from its start date to its
                # end date (date_norm), so a row at time T describes the
                # interval ending at T.  The coin name is bound with a `?`
                # placeholder instead of string interpolation.
                if feature == "close":
                    # BUGFIX: was hard-coded "date+300"; use {period} so the
                    # shift stays aligned with the date_norm%{period}=0
                    # filter for periods other than 300s (matches "open").
                    sql = ("SELECT date+{period} AS date_norm, close FROM History WHERE"
                           " date_norm>={start} and date_norm<={end}"
                           " and date_norm%{period}=0 and coin=?".format(
                               start=start, end=end, period=period))
                elif feature == "open":
                    sql = ("SELECT date+{period} AS date_norm, open FROM History WHERE"
                           " date_norm>={start} and date_norm<={end}"
                           " and date_norm%{period}=0 and coin=?".format(
                               start=start, end=end, period=period))
                elif feature == "volume":
                    # Sum raw volume rows into period-aligned buckets.
                    sql = ("SELECT date_norm, SUM(volume)"
                           " FROM (SELECT date+{period}-(date%{period}) "
                           "AS date_norm, volume, coin FROM History)"
                           " WHERE date_norm>={start} and date_norm<={end} and coin=?"
                           " GROUP BY date_norm".format(
                               period=period, start=start, end=end))
                elif feature == "high":
                    sql = ("SELECT date_norm, MAX(high)"
                           " FROM (SELECT date+{period}-(date%{period})"
                           " AS date_norm, high, coin FROM History)"
                           " WHERE date_norm>={start} and date_norm<={end} and coin=?"
                           " GROUP BY date_norm".format(
                               period=period, start=start, end=end))
                elif feature == "low":
                    sql = ("SELECT date_norm, MIN(low)"
                           " FROM (SELECT date+{period}-(date%{period})"
                           " AS date_norm, low, coin FROM History)"
                           " WHERE date_norm>={start} and date_norm<={end} and coin=?"
                           " GROUP BY date_norm".format(
                               period=period, start=start, end=end))
                else:
                    msg = ("The feature %s is not supported" % feature)
                    logging.error(msg)
                    raise ValueError(msg)
                serial_data = pd.read_sql_query(sql, con=connection,
                                                params=(coin,),
                                                parse_dates=["date_norm"],
                                                index_col="date_norm")
                panel.loc[serial_data.index, (coin, feature)] = serial_data.squeeze()
    finally:
        # Read-only queries: no commit needed, just release the connection.
        connection.close()

    panel = panel_fillna(panel, "both")
    # BUGFIX: leftover debug print replaced with a debug-level log.
    logging.debug(panel)
    return panel
def get_global_panel(self, start, end, period=300, features=('close', )):
    """Build a [feature, asset, time] pd.Panel of Korean-market history.

    :param start: linux timestamp in seconds; rounded down to a multiple of `period`
    :param end: linux timestamp in seconds; rounded down likewise
    :param period: time interval of each data access point, in seconds
    :param features: tuple or list of the feature names
    :return: a panel, [feature, asset, time]
    :raises ValueError: if the selected asset count differs from
                        self._asset_number, or a feature is unsupported
    """
    start = int(start - (start % period))
    end = int(end - (end % period))
    # Assets are selected over a volume window ending `volume_forward`
    # seconds before `end`.
    assets = self.select_assets(start=end - self.__volume_forward -
                                self.__volume_average_days * DAY,
                                end=end - self.__volume_forward)
    self.__assets = assets
    if len(assets) != self._asset_number:
        raise ValueError(
            "the length of selected assets %d is not equal to expected %d"
            % (len(assets), self._asset_number))
    logging.info("feature type list is %s" % str(features))
    self.__checkperiod(period)
    # Use the helper below so time_index lines up exactly with Korean
    # stock-market trading hours.  (translated from the original Korean note)
    time_index = pd.to_datetime(
        self._gen_time_index(self._unix_to_yyyymmdd(start),
                             self._unix_to_yyyymmdd(end)))
    panel = pd.Panel(items=features, major_axis=assets,
                     minor_axis=time_index, dtype=np.float32)
    connection = sqlite3.connect(DATABASE_DIR)
    try:
        for row_number, asset in enumerate(assets):
            for feature in features:
                # NOTE: transform the start date to end date
                if feature == "close":
                    sql = (
                        "SELECT date AS date_norm, close FROM History WHERE"
                        " date_norm>={start} and date_norm<={end}"
                        " and date_norm%{period}=0 and asset=\"{asset}\"".
                        format(start=start,
                               end=end,
                               period=period,
                               asset=asset))
                elif feature == "open":
                    sql = (
                        "SELECT date AS date_norm, open FROM History WHERE"
                        " date_norm>={start} and date_norm<={end}"
                        " and date_norm%{period}=0 and asset=\"{asset}\"".
                        format(start=start,
                               end=end,
                               period=period,
                               asset=asset))
                elif feature == "volume":
                    # Sum raw volume rows into period-aligned buckets.
                    sql = (
                        "SELECT date_norm, SUM(volume)" +
                        " FROM (SELECT date-(date%{period}) "
                        "AS date_norm, volume, asset FROM History)"
                        " WHERE date_norm>={start} and date_norm<={end} and asset=\"{asset}\""
                        " GROUP BY date_norm".format(period=period,
                                                     start=start,
                                                     end=end,
                                                     asset=asset))
                elif feature == "high":
                    sql = (
                        "SELECT date_norm, MAX(high)" +
                        " FROM (SELECT date-(date%{period})"
                        " AS date_norm, high, asset FROM History)"
                        " WHERE date_norm>={start} and date_norm<={end} and asset=\"{asset}\""
                        " GROUP BY date_norm".format(period=period,
                                                     start=start,
                                                     end=end,
                                                     asset=asset))
                elif feature == "low":
                    sql = (
                        "SELECT date_norm, MIN(low)" +
                        " FROM (SELECT date-(date%{period})"
                        " AS date_norm, low, asset FROM History)"
                        " WHERE date_norm>={start} and date_norm<={end} and asset=\"{asset}\""
                        " GROUP BY date_norm".format(period=period,
                                                     start=start,
                                                     end=end,
                                                     asset=asset))
                else:
                    msg = ("The feature %s is not supported" % feature)
                    logging.error(msg)
                    raise ValueError(msg)
                serial_data = pd.read_sql_query(sql,
                                                con=connection,
                                                parse_dates=["date_norm"],
                                                index_col="date_norm")
                # Shift DB timestamps to Korean time (GMT+9).
                # (translated from the original Korean note)
                serial_data.set_index(serial_data.index + dt.timedelta(hours=9),
                                      inplace=True)
                panel.loc[feature, asset,
                          serial_data.index] = serial_data.squeeze()
                # NOTE(review): panel_fillna runs inside the inner loop here
                # (mirrors the upstream PGPortfolio layout); the collapsed
                # source is ambiguous about the exact nesting — confirm.
                panel = panel_fillna(panel, "both")
    finally:
        connection.commit()
        connection.close()
    return panel
def get_global_panel(self, start, end, period=300, features=('close',)):
    """Build a [feature, coin, time] pd.Panel of market history from sqlite.

    :param start: linux timestamp in seconds; rounded down to a multiple of `period`
    :param end: linux timestamp in seconds; rounded down likewise
    :param period: time interval of each data access point, in seconds
    :param features: tuple or list of the feature names
    :return: a panel, [feature, coin, time], NaNs filled by panel_fillna
    :raises ValueError: if the selected coin count differs from
                        self._coin_number, or a feature is unsupported
    """
    start = int(start - (start % period))
    end = int(end - (end % period))
    # Coin selection ranks by volume over a window ending `volume_forward`
    # seconds before `end`.
    coins = self.select_coins(
        start=end - self.__volume_forward - self.__volume_average_days * DAY,
        end=end - self.__volume_forward)
    self.__coins = coins
    for coin in coins:
        self.update_data(start, end, coin)

    if len(coins) != self._coin_number:
        raise ValueError(
            "the length of selected coins %d is not equal to expected %d"
            % (len(coins), self._coin_number))

    logging.info("feature type list is %s" % str(features))
    self.__checkperiod(period)

    time_index = pd.to_datetime(list(range(start, end + 1, period)), unit='s')
    panel = pd.Panel(items=features, major_axis=coins,
                     minor_axis=time_index, dtype=np.float32)

    connection = sqlite3.connect(DATABASE_DIR)
    try:
        for coin in coins:
            for feature in features:
                # NOTE: each candle is re-keyed from its start date to its
                # end date (date_norm).  The coin name is bound with a `?`
                # placeholder instead of string interpolation.
                if feature == "close":
                    # BUGFIX: was hard-coded "date+300"; use {period} so the
                    # shift stays aligned with the date_norm%{period}=0
                    # filter for periods other than 300s (matches "open").
                    sql = ("SELECT date+{period} AS date_norm, close FROM History WHERE"
                           " date_norm>={start} and date_norm<={end}"
                           " and date_norm%{period}=0 and coin=?".format(
                               start=start, end=end, period=period))
                elif feature == "open":
                    sql = ("SELECT date+{period} AS date_norm, open FROM History WHERE"
                           " date_norm>={start} and date_norm<={end}"
                           " and date_norm%{period}=0 and coin=?".format(
                               start=start, end=end, period=period))
                elif feature == "volume":
                    # Sum raw volume rows into period-aligned buckets.
                    sql = ("SELECT date_norm, SUM(volume)"
                           " FROM (SELECT date+{period}-(date%{period}) "
                           "AS date_norm, volume, coin FROM History)"
                           " WHERE date_norm>={start} and date_norm<={end} and coin=?"
                           " GROUP BY date_norm".format(
                               period=period, start=start, end=end))
                elif feature == "high":
                    sql = ("SELECT date_norm, MAX(high)"
                           " FROM (SELECT date+{period}-(date%{period})"
                           " AS date_norm, high, coin FROM History)"
                           " WHERE date_norm>={start} and date_norm<={end} and coin=?"
                           " GROUP BY date_norm".format(
                               period=period, start=start, end=end))
                elif feature == "low":
                    sql = ("SELECT date_norm, MIN(low)"
                           " FROM (SELECT date+{period}-(date%{period})"
                           " AS date_norm, low, coin FROM History)"
                           " WHERE date_norm>={start} and date_norm<={end} and coin=?"
                           " GROUP BY date_norm".format(
                               period=period, start=start, end=end))
                else:
                    msg = ("The feature %s is not supported" % feature)
                    logging.error(msg)
                    raise ValueError(msg)
                serial_data = pd.read_sql_query(sql, con=connection,
                                                params=(coin,),
                                                parse_dates=["date_norm"],
                                                index_col="date_norm")
                panel.loc[feature, coin, serial_data.index] = serial_data.squeeze()
        # PERF: fillna once after all slices are written instead of inside
        # the inner loop.  panel_fillna fills along the time axis per
        # (feature, coin) slice, so the final contents are unchanged.
        panel = panel_fillna(panel, "both")
    finally:
        connection.commit()
        connection.close()
    return panel
def get_global_panel(self, start, end, period, features):
    """Build a [feature, coin, time] pd.Panel of market history.

    :param start: linux timestamp in seconds (NOT rounded to `period`
                  in this variant)
    :param end: linux timestamp in seconds
    :param period: time interval of each data access point, in seconds
    :param features: tuple or list of the feature names ("volume" is not
                     supported by this variant)
    :return: a panel, [feature, coin, time]; the time axis is normalized
             to midnight, so this variant effectively works at daily
             granularity
    :raises ValueError: if the selected coin count differs from
                        self._coin_number, or a feature is unsupported
    """
    start = int(start)
    end = int(end)
    coins = self.select_coins(start=start, end=end)
    self.__coins = coins
    for coin in coins:
        self.update_data(start, end, coin)
    if len(coins) != self._coin_number:
        raise ValueError(
            "the length of selected coins %d is not equal to expected %d"
            % (len(coins), self._coin_number))
    logging.info("feature type list is %s" % str(features))
    self.__checkperiod(period)
    connection = sqlite3.connect(DATABASE_DIR)
    time_index = pd.to_datetime(list(range(start, end + 1, period)),
                                unit='s')
    # Truncate every timestamp to midnight (daily buckets).
    time_index = time_index.normalize()
    panel = pd.Panel(items=features, major_axis=coins,
                     minor_axis=time_index, dtype=np.float32)
    try:
        for row_number, coin in enumerate(coins):
            for feature in features:
                # NOTE: transform the start date to end date
                if feature == "close":
                    # NOTE(review): unlike the sibling implementations,
                    # there is no date_norm%period filter here — presumably
                    # stored rows are already at the desired granularity;
                    # confirm against the writer of the History table.
                    sql = (
                        "SELECT date as date_norm, close FROM History WHERE"
                        " date_norm>={start} and date_norm<={end}"
                        " and name=\"{coin}\"".format(start=start,
                                                      end=end,
                                                      coin=coin))
                elif feature == "open":
                    sql = (
                        "SELECT date+{period} AS date_norm, open FROM History WHERE"
                        " date_norm>={start} and date_norm<={end}"
                        " and name=\"{coin}\"".format(start=start,
                                                      end=end,
                                                      period=period,
                                                      coin=coin))
                elif feature == "high":
                    # NOTE(review): buckets are keyed by date+period with no
                    # -(date%period) alignment as in sibling variants —
                    # confirm this is intended for this schema.
                    sql = (
                        "SELECT date_norm, MAX(high)" +
                        " FROM (SELECT date+{period}"
                        " AS date_norm, high, name FROM History)"
                        " WHERE date_norm>={start} and date_norm<={end} and name=\"{coin}\""
                        " GROUP BY date_norm".format(
                            period=period, start=start, end=end, coin=coin))
                elif feature == "low":
                    sql = (
                        "SELECT date_norm, MIN(low)" +
                        " FROM (SELECT date+{period}"
                        " AS date_norm, low, name FROM History)"
                        " WHERE date_norm>={start} and date_norm<={end} and name=\"{coin}\""
                        " GROUP BY date_norm".format(
                            period=period, start=start, end=end, coin=coin))
                else:
                    msg = ("The feature %s is not supported" % feature)
                    logging.error(msg)
                    raise ValueError(msg)
                serial_data = pd.read_sql_query(sql, con=connection,
                                                parse_dates=["date_norm"])
                # Re-index by the date column truncated to midnight so it
                # matches the normalized panel time axis.
                temp = serial_data["date_norm"].dt.normalize()
                del serial_data['date_norm']
                serial_data.index = temp
                squeezed_data = serial_data.squeeze()
                panel.loc[feature, coin, serial_data.index] = squeezed_data
                # NOTE(review): panel_fillna inside the inner loop mirrors
                # the upstream layout; collapsed source is ambiguous on the
                # exact nesting — confirm.
                panel = panel_fillna(panel, "both")
    finally:
        connection.commit()
        connection.close()
    return panel