def test_with_duplicates(self):

    q = pd.concat([self.quotes, self.quotes]).sort_values(
        ['time', 'ticker']).reset_index(drop=True)
    result = merge_asof(self.trades, q,
                        on='time',
                        by='ticker')
    expected = self.read_data('asof.csv')
    assert_frame_equal(result, expected)

    result = merge_asof(self.trades, q,
                        on='time',
                        by='ticker',
                        check_duplicates=False)
    expected = self.read_data('asof.csv')
    expected = pd.concat([expected, expected]).sort_values(
        ['time', 'ticker']).reset_index(drop=True)

    # the results are not ordered in a meaningful way
    # nor are the exact matches duplicated, so comparisons
    # are pretty tricky here, however the uniques are the same

    def aligner(x, ticker):
        return (x[x.ticker == ticker]
                .sort_values(['time', 'ticker', 'quantity', 'price',
                              'marketCenter', 'bid', 'ask'])
                .drop_duplicates(keep='last')
                .reset_index(drop=True)
                )

    for ticker in expected.ticker.unique():
        r = aligner(result, ticker)
        e = aligner(expected, ticker)
        assert_frame_equal(r, e)
def test_on_specialized_type(self):
    # GH13936
    for dtype in [np.uint8, np.uint16, np.uint32, np.uint64,
                  np.int8, np.int16, np.int32, np.int64,
                  np.float16, np.float32, np.float64]:
        df1 = pd.DataFrame({
            'value': [5, 2, 25, 100, 78, 120, 79],
            'symbol': list("ABCDEFG")},
            columns=['symbol', 'value'])
        df1.value = dtype(df1.value)

        df2 = pd.DataFrame({
            'value': [0, 80, 120, 125],
            'result': list('xyzw')},
            columns=['value', 'result'])
        df2.value = dtype(df2.value)

        df1 = df1.sort_values('value').reset_index(drop=True)

        if dtype == np.float16:
            with self.assertRaises(MergeError):
                pd.merge_asof(df1, df2, on='value')
            continue

        result = pd.merge_asof(df1, df2, on='value')

        expected = pd.DataFrame(
            {'symbol': list("BACEGDF"),
             'value': [2, 5, 25, 78, 79, 100, 120],
             'result': list('xxxxxyz')
             },
            columns=['symbol', 'value', 'result'])
        expected.value = dtype(expected.value)

        assert_frame_equal(result, expected)
def test_allow_exact_matches_and_tolerance2(self):
    # GH 13695
    df1 = pd.DataFrame({
        'time': pd.to_datetime(['2016-07-15 13:30:00.030']),
        'username': ['bob']})
    df2 = pd.DataFrame({
        'time': pd.to_datetime(['2016-07-15 13:30:00.000',
                                '2016-07-15 13:30:00.030']),
        'version': [1, 2]})

    result = pd.merge_asof(df1, df2, on='time')
    expected = pd.DataFrame({
        'time': pd.to_datetime(['2016-07-15 13:30:00.030']),
        'username': ['bob'],
        'version': [2]})
    assert_frame_equal(result, expected)

    result = pd.merge_asof(df1, df2, on='time', allow_exact_matches=False)
    expected = pd.DataFrame({
        'time': pd.to_datetime(['2016-07-15 13:30:00.030']),
        'username': ['bob'],
        'version': [1]})
    assert_frame_equal(result, expected)

    result = pd.merge_asof(df1, df2, on='time',
                           allow_exact_matches=False,
                           tolerance=pd.Timedelta('10ms'))
    expected = pd.DataFrame({
        'time': pd.to_datetime(['2016-07-15 13:30:00.030']),
        'username': ['bob'],
        'version': [np.nan]})
    assert_frame_equal(result, expected)
def test_multiby_indexed(self):
    # GH15676
    left = pd.DataFrame([
        [pd.to_datetime('20160602'), 1, 'a'],
        [pd.to_datetime('20160602'), 2, 'a'],
        [pd.to_datetime('20160603'), 1, 'b'],
        [pd.to_datetime('20160603'), 2, 'b']],
        columns=['time', 'k1', 'k2']).set_index('time')

    right = pd.DataFrame([
        [pd.to_datetime('20160502'), 1, 'a', 1.0],
        [pd.to_datetime('20160502'), 2, 'a', 2.0],
        [pd.to_datetime('20160503'), 1, 'b', 3.0],
        [pd.to_datetime('20160503'), 2, 'b', 4.0]],
        columns=['time', 'k1', 'k2', 'value']).set_index('time')

    expected = pd.DataFrame([
        [pd.to_datetime('20160602'), 1, 'a', 1.0],
        [pd.to_datetime('20160602'), 2, 'a', 2.0],
        [pd.to_datetime('20160603'), 1, 'b', 3.0],
        [pd.to_datetime('20160603'), 2, 'b', 4.0]],
        columns=['time', 'k1', 'k2', 'value']).set_index('time')

    result = pd.merge_asof(left, right,
                           left_index=True,
                           right_index=True,
                           by=['k1', 'k2'])

    assert_frame_equal(expected, result)

    with self.assertRaises(MergeError):
        pd.merge_asof(left, right, left_index=True, right_index=True,
                      left_by=['k1', 'k2'], right_by=['k1'])
def test_examples1(self):
    """ doc-string examples """

    left = pd.DataFrame({'a': [1, 5, 10],
                         'left_val': ['a', 'b', 'c']})
    right = pd.DataFrame({'a': [1, 2, 3, 6, 7],
                          'right_val': [1, 2, 3, 6, 7]})

    pd.merge_asof(left, right, on='a')
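# For reference, a minimal sketch of what the doc-string example above produces
# under merge_asof's default backward direction (each left key takes the last
# right row whose key is less than or equal to it). The test itself does not
# assert on the result, and the assert-helper import shown here is an assumption
# about the environment, not part of the test file above.
import pandas as pd
from pandas.testing import assert_frame_equal  # assumed import; older trees use pandas.util.testing

left = pd.DataFrame({'a': [1, 5, 10], 'left_val': ['a', 'b', 'c']})
right = pd.DataFrame({'a': [1, 2, 3, 6, 7], 'right_val': [1, 2, 3, 6, 7]})

# backward matches: a=1 -> 1, a=5 -> 3, a=10 -> 7
expected = pd.DataFrame({'a': [1, 5, 10],
                         'left_val': ['a', 'b', 'c'],
                         'right_val': [1, 3, 7]})
assert_frame_equal(pd.merge_asof(left, right, on='a'), expected)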
def test_valid_allow_exact_matches(self):
    trades = self.trades
    quotes = self.quotes

    with self.assertRaises(MergeError):
        merge_asof(trades, quotes,
                   on='time',
                   by='ticker',
                   allow_exact_matches='foo')
def test_merge_datatype_error(self):
    """ Tests merge datatype mismatch error """
    msg = r'merge keys \[0\] object and int64, must be the same type'

    left = pd.DataFrame({'left_val': [1, 5, 10],
                         'a': ['a', 'b', 'c']})
    right = pd.DataFrame({'right_val': [1, 2, 3, 6, 7],
                          'a': [1, 2, 3, 6, 7]})

    with pytest.raises(MergeError, match=msg):
        merge_asof(left, right, on='a')
def test_merge_on_nans(self, func, side):
    # GH 23189
    msg = "Merge keys contain null values on {} side".format(side)
    nulls = func([1.0, 5.0, np.nan])
    non_nulls = func([1.0, 5.0, 10.])
    df_null = pd.DataFrame({'a': nulls, 'left_val': ['a', 'b', 'c']})
    df = pd.DataFrame({'a': non_nulls, 'right_val': [1, 6, 11]})

    with pytest.raises(ValueError, match=msg):
        if side == 'left':
            merge_asof(df_null, df, on='a')
        else:
            merge_asof(df, df_null, on='a')
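# The test above is parametrized over func and side, whose decorators are not
# shown in this excerpt. A self-contained sketch of how such a parametrization
# might look; the decorator arguments and the standalone function name are
# assumptions for illustration, not necessarily the source file's exact setup.
import numpy as np
import pandas as pd
import pytest
from pandas import merge_asof


@pytest.mark.parametrize("func", [lambda x: x, lambda x: pd.to_datetime(x)],
                         ids=["numeric", "datetime"])
@pytest.mark.parametrize("side", ["left", "right"])
def test_merge_on_nans_sketch(func, side):
    # mirrors the test above: null merge keys should raise ValueError
    msg = "Merge keys contain null values on {} side".format(side)
    nulls = func([1.0, 5.0, np.nan])
    non_nulls = func([1.0, 5.0, 10.0])
    df_null = pd.DataFrame({"a": nulls, "left_val": ["a", "b", "c"]})
    df = pd.DataFrame({"a": non_nulls, "right_val": [1, 6, 11]})

    with pytest.raises(ValueError, match=msg):
        if side == "left":
            merge_asof(df_null, df, on="a")
        else:
            merge_asof(df, df_null, on="a")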
def test_asof_join_predicate(time_left, time_right, time_df1, time_df2):
    expr = time_left.asof_join(time_right, time_left.time == time_right.time)[
        time_left, time_right.other_value
    ]
    result = expr.execute()
    expected = pd.merge_asof(time_df1, time_df2, on='time')
    tm.assert_frame_equal(result[expected.columns], expected)
def test_on_specialized_type_by_int(self, any_real_dtype):
    # see gh-13936
    dtype = np.dtype(any_real_dtype).type

    df1 = pd.DataFrame({
        "value": [5, 2, 25, 100, 78, 120, 79],
        "key": [1, 2, 3, 2, 3, 1, 2],
        "symbol": list("ABCDEFG")},
        columns=["symbol", "key", "value"])
    df1.value = dtype(df1.value)

    df2 = pd.DataFrame({
        "value": [0, 80, 120, 125],
        "key": [1, 2, 2, 3],
        "result": list("xyzw")},
        columns=["value", "key", "result"])
    df2.value = dtype(df2.value)

    df1 = df1.sort_values("value").reset_index(drop=True)
    result = pd.merge_asof(df1, df2, on="value", by="key")

    expected = pd.DataFrame({
        "symbol": list("BACEGDF"),
        "key": [2, 1, 3, 3, 2, 2, 1],
        "value": [2, 5, 25, 78, 79, 100, 120],
        "result": [np.nan, "x", np.nan, np.nan, np.nan, "y", "x"]},
        columns=["symbol", "key", "value", "result"])
    expected.value = dtype(expected.value)

    assert_frame_equal(result, expected)
def test_on_float_by_int(self):
    # type specialize both "by" and "on" parameters
    df1 = pd.DataFrame({
        'symbol': list("AAABBBCCC"),
        'exch': [1, 2, 3, 1, 2, 3, 1, 2, 3],
        'price': [3.26, 3.2599, 3.2598, 12.58, 12.59, 12.5,
                  378.15, 378.2, 378.25]},
        columns=['symbol', 'exch', 'price'])

    df2 = pd.DataFrame({
        'exch': [1, 1, 1, 2, 2, 2, 3, 3, 3],
        'price': [0.0, 1.0, 100.0, 0.0, 5.0, 100.0, 0.0, 5.0, 1000.0],
        'mpv': [0.0001, 0.01, 0.05, 0.0001, 0.01, 0.1, 0.0001, 0.25, 1.0]},
        columns=['exch', 'price', 'mpv'])

    df1 = df1.sort_values('price').reset_index(drop=True)
    df2 = df2.sort_values('price').reset_index(drop=True)

    result = pd.merge_asof(df1, df2, on='price', by='exch')

    expected = pd.DataFrame({
        'symbol': list("AAABBBCCC"),
        'exch': [3, 2, 1, 3, 1, 2, 1, 2, 3],
        'price': [3.2598, 3.2599, 3.26, 12.5, 12.58, 12.59,
                  378.15, 378.2, 378.25],
        'mpv': [0.0001, 0.0001, 0.01, 0.25, 0.01, 0.01, 0.05, 0.1, 0.25]},
        columns=['symbol', 'exch', 'price', 'mpv'])

    assert_frame_equal(result, expected)
def test_multi_index(self):
    # MultiIndex is prohibited
    trades = self.trades.set_index(['time', 'price'])
    quotes = self.quotes.set_index('time')
    with self.assertRaises(MergeError):
        merge_asof(trades, quotes,
                   left_index=True,
                   right_index=True)

    trades = self.trades.set_index('time')
    quotes = self.quotes.set_index(['time', 'bid'])
    with self.assertRaises(MergeError):
        merge_asof(trades, quotes,
                   left_index=True,
                   right_index=True)
def test_allow_exact_matches(self):
    result = merge_asof(self.trades, self.quotes,
                        on='time',
                        by='ticker',
                        allow_exact_matches=False)
    expected = self.allow_exact_matches
    assert_frame_equal(result, expected)
def test_with_duplicates(self):
    q = pd.concat([self.quotes, self.quotes]).sort_values(
        ['time', 'ticker']).reset_index(drop=True)
    result = merge_asof(self.trades, q,
                        on='time',
                        by='ticker')
    expected = self.read_data('asof.csv')
    assert_frame_equal(result, expected)
def test_allow_exact_matches_and_tolerance(self):
    result = merge_asof(self.trades, self.quotes,
                        on='time',
                        by='ticker',
                        tolerance=Timedelta('100ms'),
                        allow_exact_matches=False)
    expected = self.allow_exact_matches_and_tolerance
    assert_frame_equal(result, expected)
def test_on_and_index(self):
    # 'on' parameter and index together is prohibited
    trades = self.trades.set_index('time')
    quotes = self.quotes.set_index('time')
    with self.assertRaises(MergeError):
        merge_asof(trades, quotes,
                   left_on='price',
                   left_index=True,
                   right_index=True)

    trades = self.trades.set_index('time')
    quotes = self.quotes.set_index('time')
    with self.assertRaises(MergeError):
        merge_asof(trades, quotes,
                   right_on='bid',
                   left_index=True,
                   right_index=True)
def test_multiby(self):
    # GH13936
    trades = pd.DataFrame({
        'time': pd.to_datetime(['20160525 13:30:00.023',
                                '20160525 13:30:00.023',
                                '20160525 13:30:00.046',
                                '20160525 13:30:00.048',
                                '20160525 13:30:00.050']),
        'ticker': ['MSFT', 'MSFT', 'GOOG', 'GOOG', 'AAPL'],
        'exch': ['ARCA', 'NSDQ', 'NSDQ', 'BATS', 'NSDQ'],
        'price': [51.95, 51.95, 720.77, 720.92, 98.00],
        'quantity': [75, 155, 100, 100, 100]},
        columns=['time', 'ticker', 'exch', 'price', 'quantity'])

    quotes = pd.DataFrame({
        'time': pd.to_datetime(['20160525 13:30:00.023',
                                '20160525 13:30:00.023',
                                '20160525 13:30:00.030',
                                '20160525 13:30:00.041',
                                '20160525 13:30:00.045',
                                '20160525 13:30:00.049']),
        'ticker': ['GOOG', 'MSFT', 'MSFT', 'MSFT', 'GOOG', 'AAPL'],
        'exch': ['BATS', 'NSDQ', 'ARCA', 'ARCA', 'NSDQ', 'ARCA'],
        'bid': [720.51, 51.95, 51.97, 51.99, 720.50, 97.99],
        'ask': [720.92, 51.96, 51.98, 52.00, 720.93, 98.01]},
        columns=['time', 'ticker', 'exch', 'bid', 'ask'])

    expected = pd.DataFrame({
        'time': pd.to_datetime(['20160525 13:30:00.023',
                                '20160525 13:30:00.023',
                                '20160525 13:30:00.046',
                                '20160525 13:30:00.048',
                                '20160525 13:30:00.050']),
        'ticker': ['MSFT', 'MSFT', 'GOOG', 'GOOG', 'AAPL'],
        'exch': ['ARCA', 'NSDQ', 'NSDQ', 'BATS', 'NSDQ'],
        'price': [51.95, 51.95, 720.77, 720.92, 98.00],
        'quantity': [75, 155, 100, 100, 100],
        'bid': [np.nan, 51.95, 720.50, 720.51, np.nan],
        'ask': [np.nan, 51.96, 720.93, 720.92, np.nan]},
        columns=['time', 'ticker', 'exch', 'price', 'quantity',
                 'bid', 'ask'])

    result = pd.merge_asof(trades, quotes,
                           on='time',
                           by=['ticker', 'exch'])
    assert_frame_equal(result, expected)
def test_basic(self):
    expected = self.asof
    trades = self.trades
    quotes = self.quotes

    result = merge_asof(trades, quotes,
                        on='time',
                        by='ticker')
    assert_frame_equal(result, expected)
def test_basic2(self):
    expected = self.read_data('asof2.csv')
    trades = self.read_data('trades2.csv')
    quotes = self.read_data('quotes2.csv', dedupe=True)

    result = merge_asof(trades, quotes,
                        on='time',
                        by='ticker')
    assert_frame_equal(result, expected)
def test_keyed_asof_join(
    time_keyed_left, time_keyed_right, time_keyed_df1, time_keyed_df2
):
    expr = time_keyed_left.asof_join(time_keyed_right, 'time', by='key')[
        time_keyed_left, time_keyed_right.other_value
    ]
    result = expr.execute()
    expected = pd.merge_asof(
        time_keyed_df1, time_keyed_df2, on='time', by='key'
    )
    tm.assert_frame_equal(result[expected.columns], expected)
def test_tolerance(self):
    trades = self.trades
    quotes = self.quotes

    result = merge_asof(trades, quotes,
                        on='time',
                        by='ticker',
                        tolerance=Timedelta('1day'))
    expected = self.tolerance
    assert_frame_equal(result, expected)
def test_basic_left_index_right_index(self):
    expected = self.asof.set_index('time')
    trades = self.trades.set_index('time')
    quotes = self.quotes.set_index('time')

    result = merge_asof(trades, quotes,
                        left_index=True,
                        right_index=True,
                        by='ticker')
    assert_frame_equal(result, expected)
def test_with_duplicates_no_on(self):
    df1 = pd.DataFrame({'key': [1, 1, 3],
                        'left_val': [1, 2, 3]})
    df2 = pd.DataFrame({'key': [1, 2, 2],
                        'right_val': [1, 2, 3]})
    result = merge_asof(df1, df2, on='key')
    expected = pd.DataFrame({'key': [1, 1, 3],
                             'left_val': [1, 2, 3],
                             'right_val': [1, 1, 3]})
    assert_frame_equal(result, expected)
def test_basic_categorical(self):
    expected = self.asof
    trades = self.trades.copy()
    trades.ticker = trades.ticker.astype('category')
    quotes = self.quotes.copy()
    quotes.ticker = quotes.ticker.astype('category')

    result = merge_asof(trades, quotes,
                        on='time',
                        by='ticker')
    assert_frame_equal(result, expected)
def test_index_tolerance(self):
    # GH 15135
    expected = self.tolerance.set_index('time')
    trades = self.trades.set_index('time')
    quotes = self.quotes.set_index('time')

    result = pd.merge_asof(trades, quotes,
                           left_index=True,
                           right_index=True,
                           by='ticker',
                           tolerance=pd.Timedelta('1day'))
    assert_frame_equal(result, expected)
def test_missing_right_by(self):
    expected = self.asof
    trades = self.trades
    quotes = self.quotes

    q = quotes[quotes.ticker != 'MSFT']
    result = merge_asof(trades, q,
                        on='time',
                        by='ticker')
    expected.loc[expected.ticker == 'MSFT', ['bid', 'ask']] = np.nan
    assert_frame_equal(result, expected)
def test_basic_no_by(self):
    f = lambda x: x[x.ticker == 'MSFT'].drop('ticker', axis=1) \
        .reset_index(drop=True)

    # just use a single ticker
    expected = f(self.asof)
    trades = f(self.trades)
    quotes = f(self.quotes)

    result = merge_asof(trades, quotes,
                        on='time')
    assert_frame_equal(result, expected)
def test_basic_left_by_right_by(self):
    # GH14253
    expected = self.asof
    trades = self.trades
    quotes = self.quotes

    result = merge_asof(trades, quotes,
                        on='time',
                        left_by='ticker',
                        right_by='ticker')
    assert_frame_equal(result, expected)
def test_non_sorted(self):
    trades = self.trades.sort_values('time', ascending=False)
    quotes = self.quotes.sort_values('time', ascending=False)

    # we require that we are already sorted on time & quotes
    assert not trades.time.is_monotonic
    assert not quotes.time.is_monotonic
    with pytest.raises(ValueError):
        merge_asof(trades, quotes,
                   on='time',
                   by='ticker')

    trades = self.trades.sort_values('time')
    assert trades.time.is_monotonic
    assert not quotes.time.is_monotonic
    with pytest.raises(ValueError):
        merge_asof(trades, quotes,
                   on='time',
                   by='ticker')

    quotes = self.quotes.sort_values('time')
    assert trades.time.is_monotonic
    assert quotes.time.is_monotonic

    # ok, though has dupes
    merge_asof(trades, self.quotes,
               on='time',
               by='ticker')
def test_non_sorted(self):
    trades = self.trades.sort_values('time', ascending=False)
    quotes = self.quotes.sort_values('time', ascending=False)

    # we require that we are already sorted on time & quotes
    self.assertFalse(trades.time.is_monotonic)
    self.assertFalse(quotes.time.is_monotonic)
    with self.assertRaises(ValueError):
        merge_asof(trades, quotes,
                   on='time',
                   by='ticker')

    trades = self.trades.sort_values('time')
    self.assertTrue(trades.time.is_monotonic)
    self.assertFalse(quotes.time.is_monotonic)
    with self.assertRaises(ValueError):
        merge_asof(trades, quotes,
                   on='time',
                   by='ticker')

    quotes = self.quotes.sort_values('time')
    self.assertTrue(trades.time.is_monotonic)
    self.assertTrue(quotes.time.is_monotonic)

    # ok, though has dupes
    merge_asof(trades, self.quotes,
               on='time',
               by='ticker')
def time_on_uint64(self, direction, tolerance):
    merge_asof(self.df1f, self.df2f, on="timeu64",
               direction=direction, tolerance=tolerance)
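# The benchmark method above relies on self.df1f and self.df2f, which are built
# in the suite's setup step (not shown in this excerpt). A minimal sketch of
# what an asv-style setup for it might look like, assuming "timeu64" is simply
# a sorted uint64 key; the class name, parameter values, sizes, and extra column
# names here are illustrative assumptions, not the benchmark's actual values.
import numpy as np
import pandas as pd
from pandas import merge_asof


class MergeAsofSketch:
    # illustrative asv-style parameters; the real suite may use different ones
    params = [["backward", "forward", "nearest"], [None, 5]]
    param_names = ["direction", "tolerance"]

    def setup(self, direction, tolerance):
        one_count = 200000
        two_count = 1000000
        # sorted uint64 keys, since merge_asof requires sorted 'on' columns
        self.df1f = pd.DataFrame(
            {"timeu64": np.sort(
                np.random.randint(0, 10 ** 9, one_count)).astype("uint64"),
             "value1": np.random.randn(one_count)})
        self.df2f = pd.DataFrame(
            {"timeu64": np.sort(
                np.random.randint(0, 10 ** 9, two_count)).astype("uint64"),
             "value2": np.random.randn(two_count)})

    def time_on_uint64(self, direction, tolerance):
        merge_asof(self.df1f, self.df2f, on="timeu64",
                   direction=direction, tolerance=tolerance)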
def parse_gpsapilog_to_df_v2(filename):
    """Parse GPS API log to Pandas dataframes, by using merge_asof.

    Args:
      filename: full log file name. Type, String.

    Returns:
      timestamp_df: Timestamp Data Frame. Type, Pandas DataFrame.
      sv_info_df: GNSS SV info Data Frame. Type, Pandas DataFrame.
      sv_stat_df: GNSS SV statistic Data Frame. Type, Pandas DataFrame
      loc_info_df: Location Information Data Frame. Type, Pandas DataFrame.
        includes Provider, Latitude, Longitude, Altitude, GNSSTime, Speed,
        Bearing
    """
    # Get parsed dataframe list
    parsed_data = parse_log_to_df(
        filename=filename,
        configs=CONFIG_GPSAPILOG,
    )

    # get DUT Timestamp
    timestamp_df = parsed_data['phone_time']
    timestamp_df['phone_time'] = timestamp_df.apply(
        lambda row: datetime.datetime.strptime(row.date + '-' + row.time,
                                               '%Y/%m/%d-%H:%M:%S'),
        axis=1)
    # drop logsize, date, time
    parsed_data['phone_time'] = timestamp_df.drop(['logsize', 'date', 'time'],
                                                  axis=1)

    # Add phone_time from timestamp dataframe by row number
    for key in parsed_data:
        if (key != 'phone_time') and (not parsed_data[key].empty):
            parsed_data[key] = pds.merge_asof(parsed_data[key],
                                              parsed_data['phone_time'],
                                              left_index=True,
                                              right_index=True)

    # Get space vehicle info dataframe
    # Handle the pre GPSTool 2.12.24 case
    if not parsed_data['SpaceVehicle'].empty:
        sv_info_df = parsed_data['SpaceVehicle']
    # Handle the post GPSTool 2.12.24 case with baseband CNo
    elif not parsed_data['SpaceVehicle_wBB'].empty:
        sv_info_df = parsed_data['SpaceVehicle_wBB']

    # Get space vehicle statistics dataframe
    # Handle the pre GPSTool 2.12.24 case
    if not parsed_data['HistoryAvgTop4CNo'].empty:
        # First merge all dataframes from LIST_SVSTAT[1:]
        sv_stat_df = fts.reduce(
            lambda item1, item2: pds.merge(item1, item2, on='phone_time'),
            [parsed_data[key] for key in LIST_SVSTAT[1:]])
        # Then merge with LIST_SVSTAT[0]
        sv_stat_df = pds.merge(sv_stat_df, parsed_data[LIST_SVSTAT[0]],
                               on='phone_time')
    # Handle the post GPSTool 2.12.24 case with baseband CNo
    elif not parsed_data['AntennaHistoryAvgTop4CNo'].empty:
        # First merge all dataframes from LIST_SVSTAT_WBB[1:]
        sv_stat_df = fts.reduce(
            lambda item1, item2: pds.merge(item1, item2, on='phone_time'),
            [parsed_data[key] for key in LIST_SVSTAT_WBB[1:]])
        # Then merge with LIST_SVSTAT_WBB[0]
        sv_stat_df = pds.merge(sv_stat_df, parsed_data[LIST_SVSTAT_WBB[0]],
                               on='phone_time')

    # Get location fix information dataframe
    # First merge all dataframes from LIST_LOCINFO[1:]
    loc_info_df = fts.reduce(
        lambda item1, item2: pds.merge(item1, item2, on='phone_time'),
        [parsed_data[key] for key in LIST_LOCINFO[1:]])
    # Then merge with LIST_LOCINFO[0]
    loc_info_df = pds.merge(loc_info_df, parsed_data[LIST_LOCINFO[0]],
                            on='phone_time')
    # Convert GNSS Time
    loc_info_df['gnsstime'] = loc_info_df.apply(
        lambda row: datetime.datetime.strptime(row.Date + '-' + row.Time,
                                               '%Y/%m/%d-%H:%M:%S'),
        axis=1)

    # Data Conversion
    timestamp_df['logsize'] = timestamp_df['logsize'].astype(int)

    sv_info_df['SV'] = sv_info_df['SV'].astype(int)
    sv_info_df['Elevation'] = sv_info_df['Elevation'].astype(float)
    sv_info_df['Azimuth'] = sv_info_df['Azimuth'].astype(float)
    sv_info_df['Frequency'] = sv_info_df['Frequency'].astype(float)
    if 'CNo' in list(sv_info_df.columns):
        sv_info_df['CNo'] = sv_info_df['CNo'].astype(float)
        sv_info_df['AntCNo'] = sv_info_df['CNo']
    elif 'AntCNo' in list(sv_info_df.columns):
        sv_info_df['AntCNo'] = sv_info_df['AntCNo'].astype(float)
        sv_info_df['BbCNo'] = sv_info_df['BbCNo'].astype(float)

    if 'CurrentAvgTop4CNo' in list(sv_stat_df.columns):
        sv_stat_df['CurrentAvgTop4CNo'] = sv_stat_df[
            'CurrentAvgTop4CNo'].astype(float)
        sv_stat_df['CurrentAvgCNo'] = sv_stat_df['CurrentAvgCNo'].astype(float)
        sv_stat_df['HistoryAvgTop4CNo'] = sv_stat_df[
            'HistoryAvgTop4CNo'].astype(float)
        sv_stat_df['HistoryAvgCNo'] = sv_stat_df['HistoryAvgCNo'].astype(float)
        sv_stat_df['AntennaCurrentAvgTop4CNo'] = sv_stat_df[
            'CurrentAvgTop4CNo']
        sv_stat_df['AntennaCurrentAvgCNo'] = sv_stat_df['CurrentAvgCNo']
        sv_stat_df['AntennaHistoryAvgTop4CNo'] = sv_stat_df[
            'HistoryAvgTop4CNo']
        sv_stat_df['AntennaHistoryAvgCNo'] = sv_stat_df['HistoryAvgCNo']
        sv_stat_df['BasebandCurrentAvgTop4CNo'] = npy.nan
        sv_stat_df['BasebandCurrentAvgCNo'] = npy.nan
        sv_stat_df['BasebandHistoryAvgTop4CNo'] = npy.nan
        sv_stat_df['BasebandHistoryAvgCNo'] = npy.nan
    elif 'AntennaCurrentAvgTop4CNo' in list(sv_stat_df.columns):
        sv_stat_df['AntennaCurrentAvgTop4CNo'] = sv_stat_df[
            'AntennaCurrentAvgTop4CNo'].astype(float)
        sv_stat_df['AntennaCurrentAvgCNo'] = sv_stat_df[
            'AntennaCurrentAvgCNo'].astype(float)
        sv_stat_df['AntennaHistoryAvgTop4CNo'] = sv_stat_df[
            'AntennaHistoryAvgTop4CNo'].astype(float)
        sv_stat_df['AntennaHistoryAvgCNo'] = sv_stat_df[
            'AntennaHistoryAvgCNo'].astype(float)
        sv_stat_df['BasebandCurrentAvgTop4CNo'] = sv_stat_df[
            'BasebandCurrentAvgTop4CNo'].astype(float)
        sv_stat_df['BasebandCurrentAvgCNo'] = sv_stat_df[
            'BasebandCurrentAvgCNo'].astype(float)
        sv_stat_df['BasebandHistoryAvgTop4CNo'] = sv_stat_df[
            'BasebandHistoryAvgTop4CNo'].astype(float)
        sv_stat_df['BasebandHistoryAvgCNo'] = sv_stat_df[
            'BasebandHistoryAvgCNo'].astype(float)
    sv_stat_df['L5EngagingRate'] = sv_stat_df['L5EngagingRate'].astype(float)

    loc_info_df['Latitude'] = loc_info_df['Latitude'].astype(float)
    loc_info_df['Longitude'] = loc_info_df['Longitude'].astype(float)
    loc_info_df['Altitude'] = loc_info_df['Altitude'].astype(float)
    loc_info_df['Speed'] = loc_info_df['Speed'].astype(float)
    loc_info_df['Bearing'] = loc_info_df['Bearing'].astype(float)

    return timestamp_df, sv_info_df, sv_stat_df, loc_info_df
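# A brief usage sketch of the parser above; the log file path is a placeholder,
# not a file from this repo, and the printed columns are taken from the
# function's own docstring and conversions.
timestamp_df, sv_info_df, sv_stat_df, loc_info_df = parse_gpsapilog_to_df_v2(
    'gps_api.log')
print(loc_info_df[['gnsstime', 'Latitude', 'Longitude', 'Altitude']].head())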
def calc_estimates():
    # calculate signals from earnings estimate data (IBES), including earnings
    # surprises, fwd P/E, change in analyst coverage, fwd EPS growth
    ibes = get_ibes_summary()

    # keep estimates for all fwd financial periods which will be used later to
    # calculate fwd P/E ratios
    all_fpi = ibes.loc[ibes.fpi.isin(
        ['0', '1', '2', '3', '4', '6', '7', '8', '9'])]
    # lag by a month to align with t+1 return
    all_fpi['date'] = pd.to_datetime(
        all_fpi['statpers']) + DateOffset(months=1)
    # reshape to create columns for each fwd financial period,
    # indexed by company/date
    all_fpi = all_fpi[['ticker', 'date', 'fpi',
                       'meanest']].pivot_table(index=['ticker', 'date'],
                                               columns=['fpi'],
                                               values='meanest',
                                               aggfunc=np.sum)
    all_fpi = all_fpi.sort_values(by=['date', 'ticker'])
    # rename columns
    replace_dict = dict(
        zip([str(i) for i in range(5)] + [str(i) for i in range(6, 10)],
            ['ltg'] + ['ann' + str(i) for i in range(1, 5)] +
            ['qtr' + str(i) for i in range(1, 5)]))
    all_fpi.columns = [replace_dict[x] for x in all_fpi.columns]

    # subset ibes data to 1 qtr fwd EPS estimate and select the most recent
    # estimate before the earnings date
    ibes = ibes.loc[(ibes.measure == 'EPS') & (ibes.fpi == '6')]
    ibes['fpedats'] = pd.to_datetime(ibes.fpedats)
    ibes = ibes.sort_values(by=['statpers', 'ticker'])
    ibes = ibes.groupby(['fpedats', 'ticker'])[[
        'statpers', 'meanest', 'medest', 'stdev', 'numest'
    ]].last().reset_index()

    # merge on the actual earnings release figures
    actuals = get_ibes_actual()
    actuals = actuals.rename(columns={'value': 'actual'})
    actuals = actuals.loc[(actuals.measure == 'EPS')
                          & (actuals.pdicity == 'QTR')]
    actuals['fpedats'] = pd.to_datetime(actuals.pends)
    ibes = ibes.merge(actuals[['fpedats', 'ticker', 'actual', 'anndats']],
                      how='left',
                      on=['fpedats', 'ticker'])

    # set the date index to the earnings release date and lag by
    # 2 business days + 1 month to align with t+1 return
    ibes['anndats'] = pd.to_datetime(ibes['anndats'])
    ibes['date'] = ibes['anndats'] + BDay(2) + MonthBegin(1)
    ibes = ibes.loc[pd.notnull(ibes.anndats)]
    ibes = ibes.sort_values(by=['date', 'ticker']).drop_duplicates(
        subset=['ticker', 'date'])

    # calculate standardized unexpected earnings (SUE) and earnings surprise
    # z-score
    ibes['sue'] = (ibes['actual'] - ibes['meanest']) / ibes['stdev']
    ibes['sue'] = ibes['sue'].where(ibes.stdev != 0, np.nan)
    ibes['surprise'] = ibes['actual'] - ibes['meanest']
    ibes['surprise_z'] = ibes.groupby('ticker')['surprise'].rolling(
        8, min_periods=6).apply(lambda x: (x[-1] - x.mean()) / x.std(),
                                raw=True).reset_index(0, drop=True)

    # calculate change in analyst coverage
    ibes['numest_lag'] = ibes.groupby('ticker')['numest'].shift(1)
    ibes['chanalyst'] = ibes['numest'] - ibes['numest_lag']
    ibes['pchanalyst'] = ibes['numest'] / ibes['numest_lag'] - 1

    # characteristics to output
    cols = ['sue', 'surprise_z', 'chanalyst', 'pchanalyst']

    # get crsp file, merge with IBES linking table, and merge characteristics
    crsp = get_crsp_m()
    crsp = crsp.sort_values(by=['date', 'permno'])
    link = get_crspibes_link()
    crsp = crsp.merge(link[['permno', 'ticker', 'sdate', 'edate']],
                      how='left',
                      on='permno')
    crsp['date'] = pd.to_datetime(crsp.date)
    crsp['ticker'] = crsp.ticker.where(
        ~((crsp.date < crsp.sdate) | (crsp.date > crsp.edate)), np.nan)
    crsp['date'] = pd.to_datetime(crsp.date) - MonthBegin(1)
    crsp = crsp.dropna(subset=['ticker']).drop_duplicates(
        subset=['permno', 'date'])
    crsp = crsp.merge(ibes[['ticker', 'date'] + cols],
                      how='left',
                      on=['ticker', 'date'])

    # merge all fwd earnings period estimates onto crsp data
    crsp = pd.merge_asof(crsp, all_fpi, on='date', by=['ticker'])

    # fwd earnings yield at various horizons
    crsp['prc_lag'] = crsp.groupby('permno')['prc'].shift(1)
    crsp['prc_lag'] = crsp.prc_lag.abs()
    crsp['pe0'] = crsp['ann1'] / crsp['prc_lag']
    crsp['pe1'] = crsp['ann2'] / crsp['prc_lag']
    crsp['pe2'] = crsp['ann3'] / crsp['prc_lag']
    crsp['pe3'] = crsp['ann4'] / crsp['prc_lag']
    crsp['pe4'] = crsp['qtr1'] / crsp['prc_lag']
    crsp['pe5'] = crsp[['qtr1', 'qtr2', 'qtr3',
                        'qtr4']].sum(axis=1) / crsp['prc_lag']

    # add to list of characteristics to output
    cols += ['pe' + str(x) for x in range(6)] + ['ltg']

    output_characteristics(crsp, cols)

    with open(data_output_path + 'characteristics_output_log.txt', 'a') as f:
        f.write('estimates\n')