    def get_global_data_matrix_from_mongodb(self, start, end, features):
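        """
        Collect the records of every stock in self.stockList from MongoDB.

        :param start/end: bounds used to slice each stock's 'date' index
        :param features: tuple or list of the feature names
        :return: a panel indexed [feature, stock, date] and the sorted list of dates
        """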
        db = client[self._dbName]
        dateSet = set()
        for stockName in self.stockList:
            df = pd.DataFrame(list(db[stockName].find().sort('date')))
            df = df.set_index('date').loc[start:end]
            dateSet = dateSet.union(list(df.index))
        dateSet = sorted(list(dateSet))

        panel = pd.Panel(items=features,
                         major_axis=self.stockList,
                         minor_axis=dateSet,
                         dtype=np.float32)
        for row_number, stockName in enumerate(self.stockList):
            df = pd.DataFrame(list(
                db[stockName].find().sort('date')))[features + ['date']]
            df = df.set_index('date')
            df = df.loc[start:end]
            for feature in features:
                panel.loc[feature, stockName, df.index] = df[feature].squeeze()
                panel = panel_fillna(panel, 'both')
        return panel, dateSet
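The snippets in this collection call a panel_fillna helper that is not shown. A minimal sketch of what such a helper might look like for the pd.Panel-based examples, assuming it simply back-fills and then forward-fills each feature slice along the time axis (illustration only, not the repositories' actual implementation; the DataFrame-based example below would need an analogous variant):

import pandas as pd

def panel_fillna(panel, type="bfill"):
    """Fill NaNs in a pd.Panel along the minor (time) axis.

    type is 'bfill', 'ffill', or 'both' (bfill followed by ffill).
    Sketch only; the original repositories ship their own implementation.
    """
    frames = {}
    for item in panel.items:
        frame = panel.loc[item]  # rows = assets, columns = time
        if type == "both":
            frame = frame.fillna(axis=1, method="bfill").fillna(axis=1, method="ffill")
        else:
            frame = frame.fillna(axis=1, method=type)
        frames[item] = frame
    return pd.Panel(frames)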
Example #2
    def get_global_dataframe(self,
                             start,
                             end,
                             period=300,
                             features=('close', )):
        """
        :param start/end: linux timestamp in seconds
        :param period: time interval of each data access point
        :param features: tuple or list of the feature names
        :return a panel, [feature, coin, time]
        """
        start = int(start - (start % period))
        end = int(end - (end % period))
        coins = self.select_coins(start=end - self.__volume_forward -
                                  self.__volume_average_days * DAY,
                                  end=end - self.__volume_forward)
        self.__coins = coins
        for coin in coins:
            self.update_data(start, end, coin)

        if len(coins) != self._coin_number:
            raise ValueError(
                "the length of selected coins %d is not equal to expected %d" %
                (len(coins), self._coin_number))

        logging.info("feature type list is %s" % str(features))
        self.__checkperiod(period)

        time_index = pd.to_datetime(list(range(start, end + 1, period)),
                                    unit='s')
        panel = pd.DataFrame(columns=pd.MultiIndex.from_product(
            [coins, features]),
                             index=time_index,
                             dtype="float64")

        # Switch from Panel to DataFrame with MultiIndex
        connection = sqlite3.connect(DATABASE_DIR)
        try:
            for row_number, coin in enumerate(coins):
                for feature in features:
                    # NOTE: date_norm shifts each raw timestamp forward to the end of its period
                    if feature == "close":
                        sql = (
                            "SELECT date+300 AS date_norm, close FROM History WHERE"
                            " date_norm>={start} and date_norm<={end}"
                            " and date_norm%{period}=0 and coin=\"{coin}\"".
                            format(start=start,
                                   end=end,
                                   period=period,
                                   coin=coin))
                    elif feature == "open":
                        sql = (
                            "SELECT date+{period} AS date_norm, open FROM History WHERE"
                            " date_norm>={start} and date_norm<={end}"
                            " and date_norm%{period}=0 and coin=\"{coin}\"".
                            format(start=start,
                                   end=end,
                                   period=period,
                                   coin=coin))
                    elif feature == "volume":
                        sql = (
                            "SELECT date_norm, SUM(volume)" +
                            " FROM (SELECT date+{period}-(date%{period}) "
                            "AS date_norm, volume, coin FROM History)"
                            " WHERE date_norm>={start} and date_norm<={end} and coin=\"{coin}\""
                            " GROUP BY date_norm".format(
                                period=period, start=start, end=end,
                                coin=coin))
                    elif feature == "high":
                        sql = (
                            "SELECT date_norm, MAX(high)" +
                            " FROM (SELECT date+{period}-(date%{period})"
                            " AS date_norm, high, coin FROM History)"
                            " WHERE date_norm>={start} and date_norm<={end} and coin=\"{coin}\""
                            " GROUP BY date_norm".format(
                                period=period, start=start, end=end,
                                coin=coin))
                    elif feature == "low":
                        sql = (
                            "SELECT date_norm, MIN(low)" +
                            " FROM (SELECT date+{period}-(date%{period})"
                            " AS date_norm, low, coin FROM History)"
                            " WHERE date_norm>={start} and date_norm<={end} and coin=\"{coin}\""
                            " GROUP BY date_norm".format(
                                period=period, start=start, end=end,
                                coin=coin))
                    else:
                        msg = ("The feature %s is not supported" % feature)
                        logging.error(msg)
                        raise ValueError(msg)
                    serial_data = pd.read_sql_query(
                        sql,
                        con=connection,
                        parse_dates=["date_norm"],
                        index_col="date_norm")
                    panel.loc[serial_data.index,
                              (coin, feature)] = serial_data.squeeze()

        finally:
            connection.commit()
            connection.close()

        panel = panel_fillna(panel, "both")
        print(panel)
        return panel
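For orientation, the [coin, feature] MultiIndex frame returned above can be sliced per coin or per feature. A hypothetical usage sketch; data_manager, the timestamps, and the coin name "BTC" are placeholders, not part of the original code:

# Hypothetical usage of get_global_dataframe; the names below are placeholders.
history = data_manager.get_global_dataframe(start_ts, end_ts,
                                             period=300,
                                             features=("close", "volume"))

btc_close = history[("BTC", "close")]             # one coin, one feature -> Series
all_close = history.xs("close", axis=1, level=1)  # close prices of every coin
latest = history.iloc[-1]                         # last time step across all columns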
    def get_global_panel(self, start, end, period=300, features=('close', )):
        """
        :param start/end: linux timestamp in seconds
        :param period: time interval of each data access point
        :param features: tuple or list of the feature names
        :return a panel, [feature, asset, time]
        """
        start = int(start - (start % period))
        end = int(end - (end % period))
        assets = self.select_assets(start=end - self.__volume_forward -
                                    self.__volume_average_days * DAY,
                                    end=end - self.__volume_forward)
        self.__assets = assets

        if len(assets) != self._asset_number:
            raise ValueError(
                "the length of selected assets %d is not equal to expected %d"
                % (len(assets), self._asset_number))

        logging.info("feature type list is %s" % str(features))
        self.__checkperiod(period)

        # Use the helper below so that time_index lines up exactly with Korean market hours
        time_index = pd.to_datetime(
            self._gen_time_index(self._unix_to_yyyymmdd(start),
                                 self._unix_to_yyyymmdd(end)))
        panel = pd.Panel(items=features,
                         major_axis=assets,
                         minor_axis=time_index,
                         dtype=np.float32)

        connection = sqlite3.connect(DATABASE_DIR)
        try:
            for row_number, asset in enumerate(assets):
                for feature in features:
                    # NOTE: close/open keep the raw timestamp; aggregated features are bucketed to the start of each period
                    if feature == "close":
                        sql = (
                            "SELECT date AS date_norm, close FROM History WHERE"
                            " date_norm>={start} and date_norm<={end}"
                            " and date_norm%{period}=0 and asset=\"{asset}\"".
                            format(start=start,
                                   end=end,
                                   period=period,
                                   asset=asset))
                    elif feature == "open":
                        sql = (
                            "SELECT date AS date_norm, open FROM History WHERE"
                            " date_norm>={start} and date_norm<={end}"
                            " and date_norm%{period}=0 and asset=\"{asset}\"".
                            format(start=start,
                                   end=end,
                                   period=period,
                                   asset=asset))
                    elif feature == "volume":
                        sql = (
                            "SELECT date_norm, SUM(volume)" +
                            " FROM (SELECT date-(date%{period}) "
                            "AS date_norm, volume, asset FROM History)"
                            " WHERE date_norm>={start} and date_norm<={end} and asset=\"{asset}\""
                            " GROUP BY date_norm".format(period=period,
                                                         start=start,
                                                         end=end,
                                                         asset=asset))
                    elif feature == "high":
                        sql = (
                            "SELECT date_norm, MAX(high)" +
                            " FROM (SELECT date-(date%{period})"
                            " AS date_norm, high, asset FROM History)"
                            " WHERE date_norm>={start} and date_norm<={end} and asset=\"{asset}\""
                            " GROUP BY date_norm".format(period=period,
                                                         start=start,
                                                         end=end,
                                                         asset=asset))
                    elif feature == "low":
                        sql = (
                            "SELECT date_norm, MIN(low)" +
                            " FROM (SELECT date-(date%{period})"
                            " AS date_norm, low, asset FROM History)"
                            " WHERE date_norm>={start} and date_norm<={end} and asset=\"{asset}\""
                            " GROUP BY date_norm".format(period=period,
                                                         start=start,
                                                         end=end,
                                                         asset=asset))
                    else:
                        msg = ("The feature %s is not supported" % feature)
                        logging.error(msg)
                        raise ValueError(msg)
                    serial_data = pd.read_sql_query(sql,
                                                    con=connection,
                                                    parse_dates=["date_norm"],
                                                    index_col="date_norm")
                    # Shift the index to Korean time (GMT+9)
                    serial_data.set_index(serial_data.index +
                                          dt.timedelta(hours=9),
                                          inplace=True)
                    panel.loc[feature, asset,
                              serial_data.index] = serial_data.squeeze()
                    panel = panel_fillna(panel, "both")
        finally:
            connection.commit()
            connection.close()
        return panel
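The Korean-market variant above returns a pd.Panel laid out as [feature, asset, time]. A short, assumed follow-up showing how such a panel is typically consumed; here panel refers to the return value, and the rest is illustrative:

import numpy as np

# Illustrative only: turn the [feature, asset, time] panel into a NumPy tensor.
tensor = panel.values  # shape: (n_features, n_assets, n_time_steps)
assert tensor.shape == (len(panel.items),
                        len(panel.major_axis),
                        len(panel.minor_axis))

closes = panel.loc["close"]       # DataFrame: rows = assets, columns = time
last_prices = closes.iloc[:, -1]  # most recent close of every asset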
    def get_global_panel(self, start, end, period=300, features=('close',)):
        """
        :param start/end: linux timestamp in seconds
        :param period: time interval of each data access point
        :param features: tuple or list of the feature names
        :return a panel, [feature, coin, time]
        """
        start = int(start - (start%period))
        end = int(end - (end%period))
        coins = self.select_coins(start=end - self.__volume_forward - self.__volume_average_days * DAY,
                                  end=end-self.__volume_forward)
        self.__coins = coins
        for coin in coins:
            self.update_data(start, end, coin)

        if len(coins)!=self._coin_number:
            raise ValueError("the length of selected coins %d is not equal to expected %d"
                             % (len(coins), self._coin_number))

        logging.info("feature type list is %s" % str(features))
        self.__checkperiod(period)

        time_index = pd.to_datetime(list(range(start, end+1, period)),unit='s')
        panel = pd.Panel(items=features, major_axis=coins, minor_axis=time_index, dtype=np.float32)

        connection = sqlite3.connect(DATABASE_DIR)
        try:
            for row_number, coin in enumerate(coins):
                for feature in features:
                    # NOTE: date_norm shifts each raw timestamp forward to the end of its period
                    if feature == "close":
                        sql = ("SELECT date+300 AS date_norm, close FROM History WHERE"
                               " date_norm>={start} and date_norm<={end}" 
                               " and date_norm%{period}=0 and coin=\"{coin}\"".format(
                               start=start, end=end, period=period, coin=coin))
                    elif feature == "open":
                        sql = ("SELECT date+{period} AS date_norm, open FROM History WHERE"
                               " date_norm>={start} and date_norm<={end}" 
                               " and date_norm%{period}=0 and coin=\"{coin}\"".format(
                               start=start, end=end, period=period, coin=coin))
                    elif feature == "volume":
                        sql = ("SELECT date_norm, SUM(volume)"+
                               " FROM (SELECT date+{period}-(date%{period}) "
                               "AS date_norm, volume, coin FROM History)"
                               " WHERE date_norm>={start} and date_norm<={end} and coin=\"{coin}\""
                               " GROUP BY date_norm".format(
                                    period=period,start=start,end=end,coin=coin))
                    elif feature == "high":
                        sql = ("SELECT date_norm, MAX(high)" +
                               " FROM (SELECT date+{period}-(date%{period})"
                               " AS date_norm, high, coin FROM History)"
                               " WHERE date_norm>={start} and date_norm<={end} and coin=\"{coin}\""
                               " GROUP BY date_norm".format(
                                    period=period,start=start,end=end,coin=coin))
                    elif feature == "low":
                        sql = ("SELECT date_norm, MIN(low)" +
                                " FROM (SELECT date+{period}-(date%{period})"
                                " AS date_norm, low, coin FROM History)"
                                " WHERE date_norm>={start} and date_norm<={end} and coin=\"{coin}\""
                                " GROUP BY date_norm".format(
                                    period=period,start=start,end=end,coin=coin))
                    else:
                        msg = ("The feature %s is not supported" % feature)
                        logging.error(msg)
                        raise ValueError(msg)
                    serial_data = pd.read_sql_query(sql, con=connection,
                                                    parse_dates=["date_norm"],
                                                    index_col="date_norm")
                    panel.loc[feature, coin, serial_data.index] = serial_data.squeeze()
                    panel = panel_fillna(panel, "both")
        finally:
            connection.commit()
            connection.close()
        return panel
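The SUM/MAX/MIN queries in the example above bucket raw History rows into period-aligned bars by mapping each timestamp to the end of its period via date + period - (date % period). A small standalone illustration of that arithmetic:

# Standalone illustration of the bucketing used in the volume/high/low queries.
period = 300
for date in (1500000000, 1500000001, 1500000299, 1500000300):
    date_norm = date + period - (date % period)
    print(date, "->", date_norm)
# 1500000000 -> 1500000300  (a timestamp on the boundary counts toward the next bar end)
# 1500000001 -> 1500000300
# 1500000299 -> 1500000300
# 1500000300 -> 1500000600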
Example #5
    def get_global_panel(self, start, end, period, features):
        """
        :param start/end: linux timestamp in seconds
        :param period: time interval of each data access point
        :param features: tuple or list of the feature names
        :return a panel, [feature, coin, time]
        """
        start = int(start)
        end = int(end)
        coins = self.select_coins(start=start, end=end)
        self.__coins = coins
        for coin in coins:
            self.update_data(start, end, coin)

        if len(coins) != self._coin_number:
            raise ValueError(
                "the length of selected coins %d is not equal to expected %d" %
                (len(coins), self._coin_number))

        logging.info("feature type list is %s" % str(features))
        self.__checkperiod(period)
        connection = sqlite3.connect(DATABASE_DIR)

        time_index = pd.to_datetime(list(range(start, end + 1, period)),
                                    unit='s')
        time_index = time_index.normalize()
        panel = pd.Panel(items=features,
                         major_axis=coins,
                         minor_axis=time_index,
                         dtype=np.float32)

        try:
            for row_number, coin in enumerate(coins):
                for feature in features:
                    # NOTE: close keeps the raw timestamp; open/high/low are shifted forward by one period
                    if feature == "close":
                        sql = (
                            "SELECT date as date_norm, close FROM History WHERE"
                            " date_norm>={start} and date_norm<={end}"
                            " and name=\"{coin}\"".format(start=start,
                                                          end=end,
                                                          coin=coin))
                    elif feature == "open":
                        sql = (
                            "SELECT date+{period} AS date_norm, open FROM History WHERE"
                            " date_norm>={start} and date_norm<={end}"
                            " and name=\"{coin}\"".format(start=start,
                                                          end=end,
                                                          period=period,
                                                          coin=coin))
                    elif feature == "high":
                        sql = (
                            "SELECT date_norm, MAX(high)" +
                            " FROM (SELECT date+{period}"
                            " AS date_norm, high, name FROM History)"
                            " WHERE date_norm>={start} and date_norm<={end} and name=\"{coin}\""
                            " GROUP BY date_norm".format(
                                period=period, start=start, end=end,
                                coin=coin))

                    elif feature == "low":
                        sql = (
                            "SELECT date_norm, MIN(low)" +
                            " FROM (SELECT date+{period}"
                            " AS date_norm, low, name FROM History)"
                            " WHERE date_norm>={start} and date_norm<={end} and name=\"{coin}\""
                            " GROUP BY date_norm".format(
                                period=period, start=start, end=end,
                                coin=coin))
                    else:
                        msg = ("The feature %s is not supported" % feature)
                        logging.error(msg)
                        raise ValueError(msg)
                    serial_data = pd.read_sql_query(sql,
                                                    con=connection,
                                                    parse_dates=["date_norm"])

                    temp = serial_data["date_norm"].dt.normalize()
                    del serial_data['date_norm']
                    serial_data.index = temp
                    squeezed_data = serial_data.squeeze()
                    panel.loc[feature, coin, serial_data.index] = squeezed_data
                    panel = panel_fillna(panel, "both")
        finally:
            connection.commit()
            connection.close()
        return panel
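pd.Panel was removed in pandas 1.0, and the get_global_dataframe example above already shows the usual migration path. For completeness, a minimal sketch of building the same [feature, coin, time] container with a MultiIndex-column DataFrame on modern pandas; all names and values below are illustrative:

import numpy as np
import pandas as pd

# Illustrative replacement for pd.Panel(items=features, major_axis=coins,
# minor_axis=time_index): a DataFrame with [coin, feature] MultiIndex columns.
features = ("close", "volume")  # placeholder feature list
coins = ("BTC", "ETH")          # placeholder coin list
time_index = pd.to_datetime(list(range(1500000000, 1500003001, 300)), unit="s")

frame = pd.DataFrame(columns=pd.MultiIndex.from_product([coins, features]),
                     index=time_index,
                     dtype="float64")

# Equivalent of panel.loc[feature, coin, times] = values in the Panel-based code.
frame.loc[time_index, ("BTC", "close")] = np.random.rand(len(time_index))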