def _merge_data(dfs, index_name='trade_date'):
    """
    Merge data from different APIs into one DataFrame.

    Parameters
    ----------
    dfs : list of pd.DataFrame
        Frames to combine. They are handed to ``quick_concat`` together
        with the column level names ``['symbol', 'field']``.
    index_name : str, optional
        Name assigned to the index of the merged frame.
        'trade_date' by default.

    Returns
    -------
    merge : pd.DataFrame or None
        If dfs is empty, return None

    Notes
    -----
    Align on date index, concatenate on columns (symbol and fields)

    """
    # Honour the documented contract: nothing to merge -> None, instead of
    # passing an empty list on to quick_concat.
    if not dfs:
        return None

    merge = quick_concat(dfs, ['symbol', 'field'])

    # Drop repeated (symbol, field) columns; duplicated() flags every
    # occurrence after the first, so the first one is kept.
    mask_duplicated = merge.columns.duplicated()
    if np.any(mask_duplicated):
        merge = merge.loc[:, ~mask_duplicated]

    merge = merge.sort_index(axis=1, level=['symbol', 'field'])
    merge.index.name = index_name

    return merge
def append_df(self, df, field_name, overwrite=True):
    """
    Append DataFrame to existing multi-index DataFrame and add
    corresponding field name.

    Parameters
    ----------
    df : pd.DataFrame or pd.Series
        Data to append. The caller's object is not modified; a copy is used.
    field_name : str or unicode
        Name of the new field (second level of the column MultiIndex).
    overwrite : bool, optional
        Whether overwrite existing field. True by default.

    Raises
    ------
    ValueError
        If df is neither a pd.DataFrame nor a pd.Series.

    Notes
    -----
    If field_name already exists and overwrite is True, the existing field
    is removed with self.remove_field() before the new data is appended.
    If overwrite is False, a message is printed and nothing is changed.

    """
    # Validate the input BEFORE touching existing data: otherwise an invalid
    # df would first remove the existing field and only then fail.
    if not isinstance(df, (pd.DataFrame, pd.Series)):
        raise ValueError(
            "Data to be appended must be pandas format. But we have {}".
            format(type(df)))

    exist_fields = self.data.columns.remove_unused_levels().levels[1]
    if field_name in exist_fields:
        if overwrite:
            self.remove_field(field_name)
            print("Field [{:s}] is overwritten.".format(field_name))
        else:
            print("Append df failed: name [{:s}] exist. Try another name.".
                  format(field_name))
            return

    # Work on a copy so the caller's object is never mutated.
    df = df.copy()
    if isinstance(df, pd.Series):
        df = pd.DataFrame(df)

    the_data = self.data
    exist_symbols = the_data.columns.levels[0]
    n_cols = len(df.columns)
    n_symbols = len(exist_symbols)
    if n_cols < n_symbols:
        # Fewer columns than symbols: start from an all-NaN frame over every
        # symbol and fill in the values that df provides.
        df2 = pd.DataFrame(index=df.index, columns=exist_symbols, data=np.nan)
        df2.update(df)
        df = df2
    elif n_cols > n_symbols:
        # More columns than symbols: keep only the existing symbols.
        df = df.loc[:, exist_symbols]
    elif set(df.columns) == set(exist_symbols):
        # Same symbols, possibly in a different order: align by label so the
        # positional relabelling below cannot attach data to the wrong symbol.
        df = df.loc[:, exist_symbols]

    multi_idx = pd.MultiIndex.from_product([exist_symbols, [field_name]])
    df.columns = multi_idx

    # Reindex to the existing index so the index of the stored data is
    # left unchanged by the concatenation.
    the_data = quick_concat(
        [the_data, df.reindex(the_data.index)],
        ["symbol", "field"],
        how="inner")
    the_data = the_data.sort_index(axis=1)

    self.data = the_data
    self._add_field(field_name)
def _merge_data(dfs, index_name='trade_date'):
    """
    Merge data from different APIs into one DataFrame.

    Parameters
    ----------
    dfs : list of pd.DataFrame
        Frames to combine. They are handed to ``quick_concat`` together
        with the column level names ``['symbol', 'field']``.
    index_name : str, optional
        Name assigned to the index of the merged frame.
        'trade_date' by default.

    Returns
    -------
    merge : pd.DataFrame or None
        If dfs is empty, return None

    Notes
    -----
    Align on date index, concatenate on columns (symbol and fields)

    """
    # Honour the documented contract: nothing to merge -> None, instead of
    # passing an empty list on to quick_concat.
    if not dfs:
        return None

    # The optimized quick_concat replaces the native pandas concat here; it
    # is noticeably faster when there are many columns.
    merge = quick_concat(dfs, ['symbol', 'field'])

    # Drop duplicated columns; duplicated() flags every occurrence after
    # the first, so the first one is kept.
    mask_duplicated = merge.columns.duplicated()
    if np.any(mask_duplicated):
        merge = merge.loc[:, ~mask_duplicated]

    # NaN values in the merged data are intentionally left unfilled.
    merge = merge.sort_index(axis=1, level=['symbol', 'field'])
    merge.index.name = index_name

    return merge
def append_df(self, df, field_name, is_quarterly=False, overwrite=True):
    """
    Append DataFrame to existing multi-index DataFrame and add
    corresponding field name.

    Parameters
    ----------
    df : pd.DataFrame or pd.Series
        Data to append. The caller's object is not modified; a copy is used.
    field_name : str or unicode
        Name of the new field (second level of the column MultiIndex).
    is_quarterly : bool
        Whether df is quarterly data (like quarterly financial statement)
        or daily data. Quarterly data is appended to self.data_q, daily
        data to self.data_d.
    overwrite : bool, optional
        Whether overwrite existing field. True by default.

    Raises
    ------
    ValueError
        If df is neither a pd.DataFrame nor a pd.Series, or if the target
        data set (self.data_q / self.data_d) is None.

    Notes
    -----
    If field_name already exists and overwrite is True, the existing field
    is removed with self.remove_field() before the new data is appended.
    If overwrite is False, a message is printed and nothing is changed.

    """
    # Validate the input BEFORE touching existing data: otherwise an invalid
    # df would first remove the existing field and only then fail.
    if not isinstance(df, (pd.DataFrame, pd.Series)):
        raise ValueError(
            "Data to be appended must be pandas format. But we have {}".
            format(type(df)))

    if is_quarterly:
        if self.data_q is None:
            raise ValueError("append_df前需要先确保季度数据集data_q不为空!")
        the_data = self.data_q
    else:
        if self.data_d is None:
            raise ValueError("append_df前需要先确保日度数据集data_d不为空!")
        the_data = self.data_d

    exist_fields = the_data.columns.remove_unused_levels().levels[1]
    if field_name in exist_fields:
        if overwrite:
            self.remove_field(field_name)
            print("Field [{:s}] is overwritten.".format(field_name))
        else:
            print("Append df failed: name [{:s}] exist. Try another name.".
                  format(field_name))
            return

    # Work on a copy so the caller's object is never mutated.
    df = df.copy()
    if isinstance(df, pd.Series):
        df = pd.DataFrame(df)

    # Re-read the target data set: remove_field() above may have replaced it.
    # Quarterly data goes to data_q, daily data to data_d.
    if is_quarterly:
        the_data = self.data_q
    else:
        the_data = self.data_d

    exist_symbols = the_data.columns.levels[0]
    n_cols = len(df.columns)
    n_symbols = len(exist_symbols)
    if n_cols < n_symbols:
        # Fewer columns than symbols: start from an all-NaN frame over every
        # symbol and fill in the values that df provides.
        df2 = pd.DataFrame(index=df.index, columns=exist_symbols, data=np.nan)
        df2.update(df)
        df = df2
    elif n_cols > n_symbols:
        # More columns than symbols: keep only the existing symbols.
        df = df.loc[:, exist_symbols]
    elif set(df.columns) == set(exist_symbols):
        # Same symbols, possibly in a different order: align by label so the
        # positional relabelling below cannot attach data to the wrong symbol.
        df = df.loc[:, exist_symbols]

    multi_idx = pd.MultiIndex.from_product([exist_symbols, [field_name]])
    df.columns = multi_idx

    # Reindex to the existing index so the index of the stored data is
    # left unchanged by the concatenation.
    the_data = quick_concat(
        [the_data, df.reindex(the_data.index)],
        ["symbol", "field"],
        how="inner")
    the_data = the_data.sort_index(axis=1)

    if is_quarterly:
        self.data_q = the_data
    else:
        self.data_d = the_data
    self._add_field(field_name, is_quarterly)