コード例 #1
0
    def _merge_data(dfs, index_name='trade_date'):
        """
        Merge data from different APIs into one DataFrame.

        Parameters
        ----------
        dfs : list of pd.DataFrame
            Frames to combine. They are handed to ``quick_concat`` together
            with the level names ['symbol', 'field'].
        index_name : str, optional
            Name assigned to the index of the merged frame.
            'trade_date' by default.

        Returns
        -------
        merge : pd.DataFrame or None
            If dfs is empty (or None), return None

        Notes
        -----
        Align on date index, concatenate on columns (symbol and fields).
        When the same column label occurs more than once, only the first
        occurrence is kept.

        """
        # Honour the documented contract: nothing to merge -> None, instead
        # of forwarding an empty list to quick_concat.
        if not dfs:
            return None

        merge = quick_concat(dfs, ['symbol', 'field'])

        # Index.duplicated() flags every repeat after the first occurrence,
        # so the inverted mask keeps exactly one column per label.
        mask_duplicated = merge.columns.duplicated()
        if np.any(mask_duplicated):
            merge = merge.loc[:, ~mask_duplicated]

        merge = merge.sort_index(axis=1, level=['symbol', 'field'])
        merge.index.name = index_name

        return merge
コード例 #2
0
    def append_df(self, df, field_name, overwrite=True):
        """
        Append DataFrame to existing multi-index DataFrame and add corresponding field name.

        Parameters
        ----------
        df : pd.DataFrame or pd.Series
            Data to append. A Series is converted to a one-column DataFrame.
            The input is copied, so the caller's object is not modified.
        field_name : str or unicode
            Label used on the second column level for the appended data.
        overwrite : bool, optional
            Whether overwrite existing field. True by default.

        Raises
        ------
        ValueError
            If df is neither a pd.DataFrame nor a pd.Series. The check runs
            before any existing field is removed, so a rejected call leaves
            self.data untouched.

        Notes
        -----
        If field_name already exists and overwrite is True, the old field is
        dropped through self.remove_field() before the new data is appended.
        If overwrite is False, a message is printed and nothing is changed.

        """
        # Validate the payload before mutating anything: previously a bad
        # argument was only detected after the existing field had already
        # been removed.
        if isinstance(df, pd.DataFrame):
            df = df.copy()
        elif isinstance(df, pd.Series):
            df = pd.DataFrame(df.copy())
        else:
            raise ValueError(
                "Data to be appended must be pandas format. But we have {}".
                format(type(df)))

        # remove_unused_levels() so that stale level entries are not
        # reported as existing fields.
        exist_fields = self.data.columns.remove_unused_levels().levels[1]

        if field_name in exist_fields:
            if not overwrite:
                print("Append df failed: name [{:s}] exist. Try another name.".
                      format(field_name))
                return
            self.remove_field(field_name)
            print("Field [{:s}] is overwritten.".format(field_name))

        # Read self.data only now; remove_field() is defined elsewhere and
        # may have replaced the stored frame.
        the_data = self.data
        exist_symbols = the_data.columns.levels[0]

        # Give df exactly one column per existing symbol.
        # NOTE(review): the branch is chosen by column count only; when the
        # counts match, columns are relabelled positionally below -- confirm
        # callers pass columns in the same order as exist_symbols.
        if len(df.columns) < len(exist_symbols):
            # Fewer columns: start from an all-NaN frame and fill in the
            # values that df provides.
            df2 = pd.DataFrame(index=df.index,
                               columns=exist_symbols,
                               data=np.nan)
            df2.update(df)
            df = df2
        elif len(df.columns) > len(exist_symbols):
            # More columns: keep only the symbols already present.
            df = df.loc[:, exist_symbols]
        multi_idx = pd.MultiIndex.from_product([exist_symbols, [field_name]])
        df.columns = multi_idx

        # Reindex df to the existing rows before concatenating so both
        # inputs share the same index.
        the_data = quick_concat(
            [the_data, df.reindex(the_data.index)], ["symbol", "field"],
            how="inner")
        the_data = the_data.sort_index(axis=1)

        self.data = the_data
        self._add_field(field_name)
コード例 #3
0
    def _merge_data(dfs, index_name='trade_date'):
        """
        Merge data from different APIs into one DataFrame.

        Parameters
        ----------
        dfs : list of pd.DataFrame
            Frames to combine. They are handed to ``quick_concat`` together
            with the level names ['symbol', 'field'].
        index_name : str, optional
            Name assigned to the index of the merged frame.
            'trade_date' by default.

        Returns
        -------
        merge : pd.DataFrame or None
            If dfs is empty (or None), return None

        Notes
        -----
        Align on date index, concatenate on columns (symbol and fields).
        Duplicated column labels are dropped, keeping the first occurrence.
        NaN values in the merged result are left as they are (no fill).

        """
        # The docstring promises None for empty input; return early rather
        # than passing an empty list on to quick_concat.
        if not dfs:
            return None

        # quick_concat is an optimized replacement for the native pandas
        # concat; it is noticeably faster when there are many columns.
        merge = quick_concat(dfs, ['symbol', 'field'])

        # Drop duplicated columns with a single boolean mask.
        mask_duplicated = merge.columns.duplicated()
        if np.any(mask_duplicated):
            merge = merge.loc[:, ~mask_duplicated]

        merge = merge.sort_index(axis=1, level=['symbol', 'field'])
        merge.index.name = index_name

        return merge
コード例 #4
0
    def append_df(self, df, field_name, is_quarterly=False, overwrite=True):
        """
        Append DataFrame to existing multi-index DataFrame and add corresponding field name.

        Parameters
        ----------
        df : pd.DataFrame or pd.Series
            Data to append. A Series is converted to a one-column DataFrame.
            The input is copied, so the caller's object is not modified.
        field_name : str or unicode
            Label used on the second column level for the appended data.
        is_quarterly : bool
            Whether df is quarterly data (like quarterly financial statement) or daily data.
            Quarterly data is appended to self.data_q, daily data to self.data_d.
        overwrite : bool, optional
            Whether overwrite existing field. True by default.

        Raises
        ------
        ValueError
            If the target data set (data_q or data_d) is None, or if df is
            neither a pd.DataFrame nor a pd.Series. Both checks run before
            any existing field is removed, so a rejected call leaves the
            stored data untouched.

        Notes
        -----
        If field_name already exists and overwrite is True, the old field is
        dropped through self.remove_field() before the new data is appended.
        If overwrite is False, a message is printed and nothing is changed.

        """
        # The target data set must exist before anything can be appended.
        if is_quarterly:
            if self.data_q is None:
                raise ValueError("append_df前需要先确保季度数据集data_q不为空!")
            exist_fields = self.data_q.columns.remove_unused_levels().levels[1]
        else:
            if self.data_d is None:
                raise ValueError("append_df前需要先确保日度数据集data_d不为空!")
            exist_fields = self.data_d.columns.remove_unused_levels().levels[1]

        # Validate the payload before mutating anything: previously a bad
        # argument was only detected after the existing field had already
        # been removed.
        if isinstance(df, pd.DataFrame):
            df = df.copy()
        elif isinstance(df, pd.Series):
            df = pd.DataFrame(df.copy())
        else:
            raise ValueError(
                "Data to be appended must be pandas format. But we have {}".
                format(type(df)))

        if field_name in exist_fields:
            if not overwrite:
                print("Append df failed: name [{:s}] exist. Try another name.".
                      format(field_name))
                return
            self.remove_field(field_name)
            print("Field [{:s}] is overwritten.".format(field_name))

        # Quarterly data goes to data_q, daily data to data_d. The attribute
        # is read only now because remove_field() is defined elsewhere and
        # may have replaced the stored frame.
        if is_quarterly:
            the_data = self.data_q
        else:
            the_data = self.data_d

        exist_symbols = the_data.columns.levels[0]

        # Give df exactly one column per existing symbol.
        # NOTE(review): the branch is chosen by column count only; when the
        # counts match, columns are relabelled positionally below -- confirm
        # callers pass columns in the same order as exist_symbols.
        if len(df.columns) < len(exist_symbols):
            # Fewer columns: start from an all-NaN frame and fill in the
            # values that df provides.
            df2 = pd.DataFrame(index=df.index,
                               columns=exist_symbols,
                               data=np.nan)
            df2.update(df)
            df = df2
        elif len(df.columns) > len(exist_symbols):
            # More columns: keep only the symbols already present.
            df = df.loc[:, exist_symbols]
        multi_idx = pd.MultiIndex.from_product([exist_symbols, [field_name]])
        df.columns = multi_idx

        # Reindex df to the existing rows before concatenating so both
        # inputs share the same index.
        the_data = quick_concat(
            [the_data, df.reindex(the_data.index)], ["symbol", "field"],
            how="inner")
        the_data = the_data.sort_index(axis=1)

        if is_quarterly:
            self.data_q = the_data
        else:
            self.data_d = the_data
        self._add_field(field_name, is_quarterly)