def test_join_multiindex(self): index1 = MultiIndex.from_arrays([['a', 'a', 'a', 'b', 'b', 'b'], [1, 2, 3, 1, 2, 3]], names=['first', 'second']) index2 = MultiIndex.from_arrays([['b', 'b', 'b', 'c', 'c', 'c'], [1, 2, 3, 1, 2, 3]], names=['first', 'second']) df1 = DataFrame(data=np.random.randn(6), index=index1, columns=['var X']) df2 = DataFrame(data=np.random.randn(6), index=index2, columns=['var Y']) df1 = df1.sort_index(level=0) df2 = df2.sort_index(level=0) joined = df1.join(df2, how='outer') ex_index = Index(index1.values).union(Index(index2.values)) expected = df1.reindex(ex_index).join(df2.reindex(ex_index)) expected.index.names = index1.names assert_frame_equal(joined, expected) assert joined.index.names == index1.names df1 = df1.sort_index(level=1) df2 = df2.sort_index(level=1) joined = df1.join(df2, how='outer').sort_index(level=0) ex_index = Index(index1.values).union(Index(index2.values)) expected = df1.reindex(ex_index).join(df2.reindex(ex_index)) expected.index.names = index1.names assert_frame_equal(joined, expected) assert joined.index.names == index1.names
def test_join_multi_to_multi(self, join_type):
    """Join a three-level frame to a two-level frame on shared levels.

    GH 20475. The result must match a merge on the reset indexes, and a
    mismatch between ``on`` and the number of levels of the right index
    must raise ``ValueError``.
    """
    left_levels = [list('abc'), list('xy'), [1, 2]]
    left = DataFrame(
        {'v1': range(12)},
        index=MultiIndex.from_product(left_levels,
                                      names=['abc', 'xy', 'num']))

    right_levels = [list('abc'), list('xy')]
    right = DataFrame(
        {'v2': [100 * i for i in range(1, 7)]},
        index=MultiIndex.from_product(right_levels, names=['abc', 'xy']))

    result = left.join(right, on=['abc', 'xy'], how=join_type)

    merged = left.reset_index().merge(right.reset_index(),
                                      on=['abc', 'xy'], how=join_type)
    expected = merged.set_index(['abc', 'xy', 'num'])
    assert_frame_equal(expected, result)

    msg = (r'len\(left_on\) must equal the number of levels in the index'
           ' of "right"')

    # Too few keys for the two-level right index.
    with pytest.raises(ValueError, match=msg):
        left.join(right, on='xy', how=join_type)

    # Too few keys for the three-level index when the roles are swapped.
    with pytest.raises(ValueError, match=msg):
        right.join(left, on=['abc', 'xy'], how=join_type)
def plots_workingTrends():
    """Plot the average hourly 'count' for working vs. non-working days.

    Reads the ``bike_data`` frame from the enclosing scope (columns
    ``workingday``, ``time`` and ``count`` are used) and shows a line plot
    with one curve per day type. Returns nothing.
    """
    # holiday = 0 and workday = 0 => weekend
    # let's see if holidays and weekends give the same trends

    # Day trends -- working vs. non-working day
    hours = np.linspace(0, 23, 24)
    days_average = DataFrame({'Hour': hours})

    # (column label, value of the 'workingday' flag)
    day_types = (('Working day', 1),       # workdays
                 ('Non-working day', 0))   # holidays or weekends
    for label, flag in day_types:
        is_day_type = bike_data["workingday"] == flag
        # Select the 'count' column *before* averaging: averaging the whole
        # frame and then picking 'count' does needless work and fails on
        # recent pandas when the frame holds non-numeric columns.
        mean_vec = [
            bike_data.loc[is_day_type & (bike_data["time"] == hour),
                          'count'].mean()
            for hour in hours
        ]
        days_average = days_average.join(DataFrame({label: mean_vec}))

    days_average.drop('Hour', axis=1).plot(
        figsize=(12, 6), linewidth=3, fontsize=16)
    plt.xlabel('Hour', fontsize=16)
    plt.ylabel('Average counts', fontsize=16)
    plt.legend(loc='best', fontsize=16)
    plt.show()
def test_join_on_singlekey_list(self): df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']}) df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c']) # corner cases joined = df.join(df2, on=['key']) expected = df.join(df2, on='key') assert_frame_equal(joined, expected)
def test_join_segfault(self): # 1532 df1 = DataFrame({'a': [1, 1], 'b': [1, 2], 'x': [1, 2]}) df2 = DataFrame({'a': [2, 2], 'b': [1, 2], 'y': [1, 2]}) df1 = df1.set_index(['a', 'b']) df2 = df2.set_index(['a', 'b']) # it works! for how in ['left', 'right', 'outer']: df1.join(df2, how=how)
def test_join_unconsolidated(self): # GH #331 a = DataFrame(randn(30, 2), columns=['a', 'b']) c = Series(randn(30)) a['c'] = c d = DataFrame(randn(30, 1), columns=['q']) # it works! a.join(d) d.join(a)
class JoinIndex(object):
    """Benchmark: left-join a frame's column against another frame's index."""

    def setup(self):
        """Create the two random integer frames used by the timing method."""
        N = 50000
        # Floor division keeps the exclusive upper bound an int under
        # Python 3 (true division would hand randint the float 100.0).
        upper = N // 500
        self.left = DataFrame(np.random.randint(1, upper, (N, 2)),
                              columns=['jim', 'joe'])
        self.right = DataFrame(np.random.randint(1, upper, (N, 2)),
                               columns=['jolie', 'jolia']).set_index('jolie')

    def time_left_outer_join_index(self):
        """Time ``left.join(right, on='jim')``; the result is discarded."""
        self.left.join(self.right, on='jim')
def test_join_on_inner(self): df = DataFrame({'key': ['a', 'a', 'd', 'b', 'b', 'c']}) df2 = DataFrame({'value': [0, 1]}, index=['a', 'b']) joined = df.join(df2, on='key', how='inner') expected = df.join(df2, on='key') expected = expected[expected['value'].notna()] tm.assert_series_equal(joined['key'], expected['key'], check_dtype=False) tm.assert_series_equal(joined['value'], expected['value'], check_dtype=False) tm.assert_index_equal(joined.index, expected.index)
def merge_with_technicals(currency_list, returns_table, fundamentals_table,
                          RSI, MACD, Stochastics, beg_date, stoch_date):
    """Build one frame per currency from returns, fundamentals and indicators.

    For every currency the returns column is left-joined with the full
    fundamentals table and then with that currency's RSI and MACD columns
    (suffixes ``_RSI`` / ``_MACD`` on name clashes). The Stochastics column
    (suffix ``_Stoch``) is added only when ``beg_date > stoch_date``.

    Returns a list of frames in the order of ``currency_list``.
    """
    always_joined = ((RSI, '_RSI'), (MACD, '_MACD'))

    frames = []
    for currency in currency_list:
        frame = DataFrame(returns_table[currency])
        frame = frame.join(fundamentals_table, how='left', rsuffix='')
        for indicator, suffix in always_joined:
            frame = frame.join(indicator[currency], how='left',
                               rsuffix=suffix)
        if beg_date > stoch_date:
            frame = frame.join(Stochastics[currency], how='left',
                               rsuffix='_Stoch')
        frames.append(frame)
    return frames
def read_data(test = False):
    """Load 'train.csv' (or 'test.csv') and expand its 'datetime' column.

    The 'datetime' column is replaced by the columns 'year', 'month',
    'dayMonth', 'day' (weekday name) and 'time' (integer hour), which are
    placed in front of the remaining CSV columns.

    :param test: read 'test.csv' instead of 'train.csv' when truthy
    :returns: pandas.DataFrame
    """
    filename = 'test.csv' if test else 'train.csv'

    # read data; output: dataframe
    data = pd.read_csv(filename)

    # split datetime into date and time (hour only)
    stamps = [stamp.split() for stamp in data['datetime']]
    date_and_time = DataFrame({
        'date': [parts[0] for parts in stamps],
        'time': [int(parts[1].split(':')[0]) for parts in stamps],
    })
    del data['datetime']
    data = date_and_time.join(data)

    # add day of the week
    # https://docs.python.org/2/library/datetime.html
    # .strftime('%A') -- full weekday name
    day = [
        datetime.datetime.strptime(text, '%Y-%m-%d').strftime('%A')
        for text in data['date']
    ]
    data = DataFrame({'day': day}).join(data)

    # split date into year | month | dayMonth
    date_parts = [text.split('-') for text in data['date']]
    year_month_day = DataFrame({
        'year': [int(parts[0]) for parts in date_parts],
        'month': [int(parts[1]) for parts in date_parts],
        'dayMonth': [int(parts[2]) for parts in date_parts],
    })
    del data['date']

    return year_month_day.join(data)
def test_join_many_non_unique_index(self): df1 = DataFrame({"a": [1, 1], "b": [1, 1], "c": [10, 20]}) df2 = DataFrame({"a": [1, 1], "b": [1, 2], "d": [100, 200]}) df3 = DataFrame({"a": [1, 1], "b": [1, 2], "e": [1000, 2000]}) idf1 = df1.set_index(["a", "b"]) idf2 = df2.set_index(["a", "b"]) idf3 = df3.set_index(["a", "b"]) result = idf1.join([idf2, idf3], how='outer') df_partially_merged = merge(df1, df2, on=['a', 'b'], how='outer') expected = merge(df_partially_merged, df3, on=['a', 'b'], how='outer') result = result.reset_index() expected = expected[result.columns] expected['a'] = expected.a.astype('int64') expected['b'] = expected.b.astype('int64') assert_frame_equal(result, expected) df1 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 1], "c": [10, 20, 30]}) df2 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "d": [100, 200, 300]}) df3 = DataFrame( {"a": [1, 1, 1], "b": [1, 1, 2], "e": [1000, 2000, 3000]}) idf1 = df1.set_index(["a", "b"]) idf2 = df2.set_index(["a", "b"]) idf3 = df3.set_index(["a", "b"]) result = idf1.join([idf2, idf3], how='inner') df_partially_merged = merge(df1, df2, on=['a', 'b'], how='inner') expected = merge(df_partially_merged, df3, on=['a', 'b'], how='inner') result = result.reset_index() assert_frame_equal(result, expected.loc[:, result.columns]) # GH 11519 df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], 'C': np.random.randn(8), 'D': np.random.randn(8)}) s = Series(np.repeat(np.arange(8), 2), index=np.repeat(np.arange(8), 2), name='TEST') inner = df.join(s, how='inner') outer = df.join(s, how='outer') left = df.join(s, how='left') right = df.join(s, how='right') assert_frame_equal(inner, outer) assert_frame_equal(inner, left) assert_frame_equal(inner, right)
def parse_GDS_columns(lines, subsets):
    """Parse list of line with columns description from SOFT file
    of GDS (GEO Dataset)

    :param lines: iterable -- iterator over lines
    :param subsets: dict -- subset name -> subset object; each object must
        provide ``metadata`` (with "sample_id" and "description" entries)
        and ``get_type()``
    :returns: pandas.DataFrame -- columns description, joined with the
        "disease_state" and "individual" descriptions per sample id

    """
    data = []
    index = []
    for line in lines:
        line = line.rstrip()
        if line.startswith("#"):
            tmp = __parse_entry(line)
            data.append(tmp[1])
            index.append(tmp[0])

    df = DataFrame(data, index=index, columns=['description'])

    # Subset type as reported by get_type() -> column it fills.
    column_for_type = {"disease state": "disease_state",
                       "individual": "individual"}
    subset_ids = {"disease_state": {}, "individual": {}}
    # items() works on Python 2 and 3; iteritems() exists on Python 2 only.
    for subsetname, subset in subsets.items():
        for expid in subset.metadata["sample_id"][0].split(","):
            subset_type = subset.get_type()
            column = column_for_type.get(subset_type)
            if column is not None:
                subset_ids[column][expid] = subset.metadata["description"][0]
            else:
                # NOTE(review): `stderr` is called like a function here; if
                # it is `sys.stderr` this needs to be `stderr.write(...)`.
                stderr("Unknown subset type: %s for subset %s\n" %
                       (subset_type, subsetname))

    return df.join(DataFrame(subset_ids))
def test_join_sort(self):
    """``sort=True`` orders the join result by key; ``sort=False`` does not."""
    left = DataFrame({'key': ['foo', 'bar', 'baz', 'foo'],
                      'value': [1, 2, 3, 4]})
    right = DataFrame({'value2': ['a', 'b', 'c']},
                      index=['bar', 'baz', 'foo'])

    sorted_join = left.join(right, on='key', sort=True)
    expected = DataFrame({'key': ['bar', 'baz', 'foo', 'foo'],
                          'value': [2, 3, 1, 4],
                          'value2': ['a', 'b', 'c', 'c']},
                         index=[1, 2, 0, 3])
    assert_frame_equal(sorted_join, expected)

    # smoke test: without sorting the left index comes back unchanged
    unsorted_join = left.join(right, on='key', sort=False)
    tm.assert_index_equal(unsorted_join.index, pd.Index(lrange(4)))
def test_join_mixed_non_unique_index(self): # GH 12814, unorderable types in py3 with a non-unique index df1 = DataFrame({'a': [1, 2, 3, 4]}, index=[1, 2, 3, 'a']) df2 = DataFrame({'b': [5, 6, 7, 8]}, index=[1, 3, 3, 4]) result = df1.join(df2) expected = DataFrame({'a': [1, 2, 3, 3, 4], 'b': [5, np.nan, 6, 7, np.nan]}, index=[1, 2, 3, 3, 'a']) tm.assert_frame_equal(result, expected) df3 = DataFrame({'a': [1, 2, 3, 4]}, index=[1, 2, 2, 'a']) df4 = DataFrame({'b': [5, 6, 7, 8]}, index=[1, 2, 3, 4]) result = df3.join(df4) expected = DataFrame({'a': [1, 2, 3, 4], 'b': [5, 6, 6, np.nan]}, index=[1, 2, 2, 'a']) tm.assert_frame_equal(result, expected)
def to_dataframe(self, selected_fields=None, excluded_fields=None):
    """Export this queryset as a pandas DataFrame.

    ``excluded_fields`` (or ``DEFAULT_EXCLUDED_FIELDS`` when none are given)
    are excluded from the query; if ``selected_fields`` is given, the query
    is instead restricted to those fields. An empty result is returned
    as-is. Otherwise missing form fields are added as all-NaN columns,
    every column in ``SUBDOCUMENT_FIELDS`` is expanded into its own columns,
    and a ``registered_voters`` column is looked up from each row's location.
    """
    from ..services import locations

    excluded = excluded_fields if excluded_fields \
        else self.DEFAULT_EXCLUDED_FIELDS
    qs = self.exclude(*excluded)
    if selected_fields:
        # A selection replaces the exclusion query entirely.
        qs = self.only(*selected_fields)

    records = list(qs.as_pymongo())
    df = DataFrame(records).convert_objects(convert_numeric=True)
    if df.empty:
        return df

    # add fields with no values
    form_fields = [field for group in self.first().form.groups
                   for field in group.fields]
    for form_field in form_fields:
        name = form_field.name
        if name not in df.columns:
            df[name] = Series(np.nan, index=df.index)

    # do cleanup of subdocument fields
    for subdoc_name in self.SUBDOCUMENT_FIELDS:
        raw_values = df.pop(subdoc_name).tolist()
        # Null entries become empty dicts so they expand to no columns.
        cleaned = [{} if isnull(value) else value for value in raw_values]
        df = df.join(DataFrame(cleaned))

    rv_map = locations.registered_voters_map()
    df["registered_voters"] = df.location.apply(
        lambda loc: rv_map.get(loc, 0))

    return df
def foreach_dataframe(self, func, force_dict=False, *args, **kwargs):
    """
    Apply ``func`` to every frame held by this panel-like object.

    ``func`` is called as ``func(df, *args, **kwargs)`` for each
    ``(key, df)`` pair from ``self.items()``. With ``force_dict`` the raw
    results are returned wrapped in a ``PanelDict``; otherwise they are
    joined column-wise onto the union of their indexes and sorted.
    """
    results = {}
    for key, df in self.items():
        results[key] = func(df, *args, **kwargs)

    # Choose a container type from the first Series/DataFrame result.
    container = PanelDict
    for result in list(results.values()):
        if isinstance(result, Series):
            container = DataFrame
            break
        if isinstance(result, DataFrame):
            container = Panel
            break

    # Name each pandas result after its key and collect the index union.
    index = []
    for key, result in list(results.items()):
        if isinstance(result, (DataFrame, Series)):
            result.name = key
            index = set(index).union(result.index)

    if force_dict:
        return PanelDict(results)

    res = DataFrame(None, index=index)
    for result in list(results.values()):
        res = res.join(result)

    return res.sort()
def test_join_aware(self):
    """Timezone handling when combining time-indexed objects.

    * adding a naive and a UTC-localized series raises, in either order;
    * an outer join of two 'US/Central' frames yields the union index and
      keeps that zone;
    * the union of a 'US/Central' and a 'US/Eastern' range is in UTC.
    """
    rng = date_range('1/1/2011', periods=10, freq='H')
    ts = Series(np.random.randn(len(rng)), index=rng)
    ts_utc = ts.tz_localize('utc')

    for lhs, rhs in ((ts, ts_utc), (ts_utc, ts)):
        self.assertRaises(Exception, lhs.__add__, rhs)

    central_fine = date_range("2012-11-15 00:00:00", periods=6,
                              freq="100L", tz="US/Central")
    central_coarse = date_range("2012-11-15 00:00:00", periods=3,
                                freq="250L", tz="US/Central")
    test1 = DataFrame(np.zeros((6, 3)), index=central_fine)
    test2 = DataFrame(np.zeros((3, 3)), index=central_coarse,
                      columns=range(3, 6))

    result = test1.join(test2, how='outer')
    ex_index = test1.index.union(test2.index)

    self.assertTrue(result.index.equals(ex_index))
    self.assertTrue(result.index.tz.zone == 'US/Central')

    # non-overlapping
    rng = date_range("2012-11-15 00:00:00", periods=6,
                     freq="H", tz="US/Central")
    rng2 = date_range("2012-11-15 12:00:00", periods=6,
                      freq="H", tz="US/Eastern")

    result = rng.union(rng2)
    self.assertTrue(result.tz.zone == 'UTC')
def saveGrid(self,output):
    """Write the grid results to ``<output>.txt`` and ``<output>.csv``.

    The text file receives ``self.output`` verbatim. The CSV file (';'
    separated) holds the Coulomb matrix joined with the LJ matrix, one row
    per entry of ``self.molecules``.

    :param output: path prefix; '.txt' and '.csv' are appended to it
    """
    # The context manager closes the file even if write() raises.
    with open(output + '.txt', "w") as arq:
        arq.write(self.output)

    dfCoulomb = DataFrame(self.coulombMatrix, columns=self.cCoulomb,
                          index=self.molecules)
    dfLj = DataFrame(self.ljMatrix, columns=self.cLJ,
                     index=self.molecules)
    df = dfCoulomb.join(dfLj)
    df.to_csv(output + '.csv', sep=';')
def test_join_index_mixed(self, join_type):
    """Index joins of frames with disjoint column dtypes, in both directions.

    Each result is compared with the hand-built expectation from
    ``_join_by_hand``.
    """
    # no overlapping blocks
    flags_and_text = DataFrame(index=np.arange(10))
    flags_and_text['bool'] = True
    flags_and_text['string'] = 'foo'

    numbers = DataFrame(index=np.arange(5, 15))
    numbers['int'] = 1
    numbers['float'] = 1.

    for left, right in ((flags_and_text, numbers),
                        (numbers, flags_and_text)):
        joined = left.join(right, how=join_type)
        expected = _join_by_hand(left, right, how=join_type)
        assert_frame_equal(joined, expected)
def test_left_join_index_preserve_order(self): on_cols = ['k1', 'k2'] left = DataFrame({'k1': [0, 1, 2] * 8, 'k2': ['foo', 'bar'] * 12, 'v': np.array(np.arange(24), dtype=np.int64)}) index = MultiIndex.from_tuples([(2, 'bar'), (1, 'foo')]) right = DataFrame({'v2': [5, 7]}, index=index) result = left.join(right, on=on_cols) expected = left.copy() expected['v2'] = np.nan expected.loc[(expected.k1 == 2) & (expected.k2 == 'bar'), 'v2'] = 5 expected.loc[(expected.k1 == 1) & (expected.k2 == 'foo'), 'v2'] = 7 tm.assert_frame_equal(result, expected) result.sort_values(on_cols, kind='mergesort', inplace=True) expected = left.join(right, on=on_cols, sort=True) tm.assert_frame_equal(result, expected) # test join with multi dtypes blocks left = DataFrame({'k1': [0, 1, 2] * 8, 'k2': ['foo', 'bar'] * 12, 'k3': np.array([0, 1, 2] * 8, dtype=np.float32), 'v': np.array(np.arange(24), dtype=np.int32)}) index = MultiIndex.from_tuples([(2, 'bar'), (1, 'foo')]) right = DataFrame({'v2': [5, 7]}, index=index) result = left.join(right, on=on_cols) expected = left.copy() expected['v2'] = np.nan expected.loc[(expected.k1 == 2) & (expected.k2 == 'bar'), 'v2'] = 5 expected.loc[(expected.k1 == 1) & (expected.k2 == 'foo'), 'v2'] = 7 tm.assert_frame_equal(result, expected) result = result.sort_values(on_cols, kind='mergesort') expected = left.join(right, on=on_cols, sort=True) tm.assert_frame_equal(result, expected)
def dataframe(self):
    """Outer-join every visible, non-string item of ``self.eval()``.

    Items are paired with the flags in ``self.hidden``; an item is skipped
    when its flag is truthy or when the item is a plain string.
    """
    evaluated = self.eval()
    frame = DataFrame()
    # FIXME: should do something about potential for dupe names
    for item, is_hidden in zip(evaluated, self.hidden):
        if is_hidden or type(item) == type(''):
            continue
        frame = frame.join(item, how='outer')
    return frame
def test_left_join_index_multi_match(self): left = DataFrame([ ['c', 0], ['b', 1], ['a', 2], ['b', 3]], columns=['tag', 'val'], index=[2, 0, 1, 3]) right = (DataFrame([ ['a', 'v'], ['c', 'w'], ['c', 'x'], ['d', 'y'], ['a', 'z'], ['c', 'r'], ['e', 'q'], ['c', 's']], columns=['tag', 'char']) .set_index('tag')) result = left.join(right, on='tag', how='left') expected = DataFrame([ ['c', 0, 'w'], ['c', 0, 'x'], ['c', 0, 'r'], ['c', 0, 's'], ['b', 1, nan], ['a', 2, 'v'], ['a', 2, 'z'], ['b', 3, nan]], columns=['tag', 'val', 'char'], index=[2, 2, 2, 2, 0, 1, 1, 3]) tm.assert_frame_equal(result, expected) result = left.join(right, on='tag', how='left', sort=True) expected2 = expected.sort_values('tag', kind='mergesort') tm.assert_frame_equal(result, expected2) # GH7331 - maintain left frame order in left merge result = merge(left, right.reset_index(), how='left', on='tag') expected.index = np.arange(len(expected)) tm.assert_frame_equal(result, expected)
def test_join_on_series_buglet(self): # GH #638 df = DataFrame({'a': [1, 1]}) ds = Series([2], index=[1], name='b') result = df.join(ds, on='a') expected = DataFrame({'a': [1, 1], 'b': [2, 2]}, index=df.index) tm.assert_frame_equal(result, expected)
def encode_onehot(df: pd.DataFrame, cols) -> pd.DataFrame:
    """Replace ``cols`` of ``df`` with their DictVectorizer encoding.

    The selected columns are converted to one record dict per row, run
    through ``DictVectorizer``, and the resulting feature columns are joined
    back onto ``df`` (aligned on ``df``'s index) in place of ``cols``.

    :param df: input frame (not modified)
    :param cols: column label(s) to encode, as accepted by ``df[cols]``
    :returns: new frame without ``cols`` plus the encoded feature columns
    """
    vec = DictVectorizer()
    # `orient` is the supported keyword; the old `outtype` spelling was
    # removed from DataFrame.to_dict.
    records = df[cols].to_dict(orient='records')
    vec_data = pd.DataFrame(vec.fit_transform(records).toarray())

    # Newer scikit-learn only offers get_feature_names_out(); older
    # releases only get_feature_names(). Use whichever is available.
    feature_names = (getattr(vec, 'get_feature_names_out', None)
                     or vec.get_feature_names)
    vec_data.columns = feature_names()
    vec_data.index = df.index

    df = df.drop(cols, axis=1)
    df = df.join(vec_data)
    return df
def test_join_str_datetime(self):
    """Join on a string key against a frame whose columns are datetimes."""
    str_dates = ['20120209', '20120222']
    dt_dates = [datetime(2012, 2, 9), datetime(2012, 2, 22)]

    keyed = DataFrame(str_dates, index=lrange(2), columns=['aa'])
    by_date = DataFrame([[1, 2], [3, 4]], index=str_dates, columns=dt_dates)

    joined = keyed.join(by_date, on='aa')

    # One key column plus the two datetime-labelled columns.
    assert len(joined.columns) == 3
def get_results_df(db, rev):
    """Takes a git commit hash and returns a Dataframe of benchmark results

    :param db: benchmark database object; ``get_benchmarks()``,
        ``get_rev_results(rev)`` and ``_results.c.keys()`` are used
    :param rev: git commit hash to fetch results for
    :returns: DataFrame indexed by "checksum", with the benchmark "name"
        joined in
    """
    bench = DataFrame(db.get_benchmarks())

    # Build a real list of rows: on Python 3, map() would hand DataFrame a
    # one-shot lazy iterator instead of a sequence.
    rows = [list(row) for row in db.get_rev_results(rev).values()]
    results = DataFrame(rows)

    # Since vbench.db._reg_rev_results returns an unlabeled dict,
    # we have to break encapsulation a bit.
    results.columns = db._results.c.keys()
    results = results.join(bench['name'], on='checksum').set_index("checksum")
    return results
def runnig_check():
    """Fetch frames 0, 1 and 2 and join them column-wise.

    Frame 0 from ``make_keti_data_to_df`` is the base; frames 1 and 2 are
    joined onto it in turn. The function sleeps two seconds after each
    fetch.
    """
    result = DataFrame()
    for step in range(0, 3):
        if step == 0:
            result = make_keti_data_to_df(step)
        else:
            previous = result
            result = previous.join(make_keti_data_to_df(step))
        time.sleep(2)
    return result
def _read_tsq(self, event_name): """Read the metadata (TSQ) file of a TDT Tank. Returns ------- b : pandas.DataFrame Recording metadata """ # create the path name tsq_name = self.path + os.extsep + self.header_ext # read in the raw data as a numpy rec array and convert to DataFrame b = DataFrame(np.fromfile(tsq_name, dtype=self.tsq_dtype)) # zero based indexing b.channel -= 1 b.channel = b.channel.astype(f8) # -1s are invalid b.channel[b.channel == -1] = np.nan b.type = EventTypes[b.type].reset_index(drop=True) b.format = DataTypes[b.format].reset_index(drop=True) b.timestamp[np.logical_not(b.timestamp)] = np.nan b.fs[np.logical_not(b.fs)] = np.nan # fragile subtraction (i.e., what if TDT changes this value?) b.size -= 10 # create some new indices based on the electrode array srt = Indexer.sort('channel').reset_index(drop=True) shank = srt.shank[b.channel].reset_index(drop=True) tsq = b.join(shank) # convert the event_name to a number name = name2num(event_name) # get the row of the metadata where its value equals the name-number row = tsq.name == name # make sure there's at least one event assert row.any(), 'no event named %s in tank: %s' % (event_name, self.path) # get all the metadata for those events tsq = tsq[row] # convert to integer where possible tsq.channel = tsq.channel.astype(int) tsq.shank = tsq.shank.astype(int) return tsq, row
def plots_casRegTrends():
    """Plot the average hourly 'casual' and 'registered' values.

    Reads the ``bike_data`` frame from the enclosing scope (columns
    ``time``, ``casual`` and ``registered`` are used) and shows a line plot
    with one curve per column. Returns nothing.
    """
    hours = np.linspace(0, 23, 24)
    days_average = DataFrame({'Hour': hours})

    # (curve label, source column in bike_data)
    series_specs = (('Casual', 'casual'), ('Registered', 'registered'))
    for label, column in series_specs:
        # Select the column *before* averaging: averaging the whole frame
        # and then picking one entry does needless work and fails on recent
        # pandas when the frame holds non-numeric columns.
        mean_vec = [
            bike_data.loc[bike_data["time"] == hour, column].mean()
            for hour in hours
        ]
        days_average = days_average.join(DataFrame({label: mean_vec}))

    days_average.drop('Hour', axis=1).plot(
        figsize=(12, 6), linewidth=3, fontsize=16)
    plt.xlabel('Hour', fontsize=16)
    plt.ylabel('Average counts', fontsize=16)
    plt.legend(loc='best', fontsize=16)
    plt.show()
def test_join_non_unique_period_index(self): # GH #16871 index = pd.period_range('2016-01-01', periods=16, freq='M') df = DataFrame([i for i in range(len(index))], index=index, columns=['pnum']) df2 = concat([df, df]) result = df.join(df2, how='inner', rsuffix='_df2') expected = DataFrame( np.tile(np.arange(16, dtype=np.int64).repeat(2).reshape(-1, 1), 2), columns=['pnum', 'pnum_df2'], index=df2.sort_index().index) tm.assert_frame_equal(result, expected)
# Scale the feature and label columns of `df`, add cyclic month features,
# and slice the scaled data into train / validation / test parts.
# `df`, `scaler`, `n_val` and `n_test` must already be defined.
import copy

#scaler = MinMaxScaler(feature_range=(0, 1))
#scaler2 = MinMaxScaler(feature_range=(0, 1))
scale_X = df.loc[:, ["Daily_data", "Hourly_data", "Monthly_data", "Pre_year_data"]]
scale_Y = df.loc[:, ["Label_year_data"]]

# fit() returns the scaler itself, so fitting `scaler` twice made scalerX
# and scalery the *same* object, fitted on Y only, and transforming X with
# it is wrong. Fit an independent copy for X; `scaler`/`scalery` keep the
# Y fit exactly as before.
scalerX = copy.deepcopy(scaler).fit(scale_X)
scalery = scaler.fit(scale_Y)

scaled_X = scalerX.transform(scale_X)
scaled_X = DataFrame(scaled_X)
scaled_X.columns = ["Daily_data", "Hourly_data", "Monthly_data", "Pre_year_data"]

scaled_Y = scalery.transform(scale_Y)
scaled_Y = DataFrame(scaled_Y)
scaled_Y.columns = ["Label_year_data"]

###adding time sig and cos
x = scaled_X.join(df.loc[:, ["Hour", "Month"]])


def encode(data, col, max_val):
    """Add '<col>_sin' and '<col>_cos' columns to `data` and return it.

    Both are computed from ``2 * pi * data[col] / max_val``; `data` is
    modified in place.
    """
    data[col + '_sin'] = np.sin(2 * np.pi * data[col]/max_val)
    data[col + '_cos'] = np.cos(2 * np.pi * data[col]/max_val)
    return data


#x = encode(x, 'Hour', 23)
x = encode(x, "Month", 12)
x = x.drop(["Hour"], axis=1)
x = x.drop(["Month"], axis=1)

# NOTE: the split below uses scaled_X, not the month-encoded `x`; the
# commented-out line is the variant that would use `x`.
#train_x, test_x ,X_val= x[:(len(scaled_X)-n_val-n_test)], x[-n_test:], x[n_val:(n_test+n_val)]
train_x, test_x, X_val = scaled_X[:(len(scaled_X)-n_val-n_test)], scaled_X[-n_test:], scaled_X[n_val:(n_test+n_val)]
train_y, test_y, y_val = scaled_Y[:(len(scaled_X)-n_val-n_test)], scaled_Y[-n_test:], scaled_Y[n_val:(n_test+n_val)]
print(train_x.shape)
def _extracting_coordinates(dataframe: pd.DataFrame) -> pd.DataFrame: expanded_cols = pd.DataFrame(dataframe['coordenadas'].values.tolist(), columns=['latitude', 'longitude']) return dataframe.join(expanded_cols).drop('coordenadas', axis=1)
def generate_onehot_encoding(data: pd.DataFrame, column_name: str, drop=True):
    """Append one-hot (dummy) columns for ``column_name`` to ``data``.

    :param data: input frame (not modified)
    :param column_name: column to encode with ``pd.get_dummies``
    :param drop: when true (default) the source column is removed from the
        result; when false it is kept next to its dummy columns
    :returns: new frame with the dummy columns joined on
    """
    onehot_repr = pd.get_dummies(data[column_name])
    data = data.join(onehot_repr)
    # Honour `drop`: previously the column was removed unconditionally and
    # the parameter had no effect.
    if drop:
        data = data.drop(column_name, axis=1)
    return data
columns=['event1', 'event2'])
# Show both frames, then merge on two key columns of `lefth` against the
# index of `righth`.
lefth
righth
pd.merge(lefth, righth, left_on=['key1', 'key2'], right_index=True)

# Two frames whose indexes overlap only partly ('c' and 'e' are shared).
left2 = DataFrame([[1, 2], [3, 4], [5, 6]], index=['a', 'c', 'e'], columns=['Ohio', 'Nevada'])
right2 = DataFrame([[7, 8], [9, 10], [11, 12], [13, 14]], index=['b', 'c', 'd', 'e'], columns=['Missouri', 'Alabma'])
right2
left2
# Index-on-index outer merge, and the equivalent join call.
pd.merge(left2, right2, how='outer', left_index=True, right_index=True)
left2.join(right2, how='outer')
# The join method combines objects whose columns do not overlap and whose
# index structures are identical or similar.
left1.join(right1, on='key')
another = DataFrame([[7, 8], [9, 10], [11, 12], [16, 17]], index=['a', 'c', 'e', 'f'], columns=['New York', 'Oregon'])
another
# Joining a list of frames in one call: default `how`, then an outer join.
left2.join([right2, another])
right2
left2
left2.join([right2, another], how='outer')
# English translation of the note kept in the string below:
# Things to consider before concatenating
# 1. If the indexes of the two objects differ: intersection or union?
# 2. Can the data of the original objects be told apart in the result?
# 3. Along which axis should the objects be concatenated?
''' 합치기전에 고려해야 할 사항 1. 만약 연결하려는 두객체의 색인이 서로 다르다면, 교집합? 합집합 ? 2. 합쳐진 결과에서 합쳐지기전 객체의 데이터를 고려할 수 있음 ? 3. 어떤 축으로 연결할거임? '''
def test_join_inner_multiindex(self): key1 = [ "bar", "bar", "bar", "foo", "foo", "baz", "baz", "qux", "qux", "snap" ] key2 = [ "two", "one", "three", "one", "two", "one", "two", "two", "three", "one", ] data = np.random.randn(len(key1)) data = DataFrame({"key1": key1, "key2": key2, "data": data}) index = MultiIndex( levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], names=["first", "second"], ) to_join = DataFrame(np.random.randn(10, 3), index=index, columns=["j_one", "j_two", "j_three"]) joined = data.join(to_join, on=["key1", "key2"], how="inner") expected = merge( data, to_join.reset_index(), left_on=["key1", "key2"], right_on=["first", "second"], how="inner", sort=False, ) expected2 = merge( to_join, data, right_on=["key1", "key2"], left_index=True, how="inner", sort=False, ) tm.assert_frame_equal(joined, expected2.reindex_like(joined)) expected2 = merge( to_join, data, right_on=["key1", "key2"], left_index=True, how="inner", sort=False, ) expected = expected.drop(["first", "second"], axis=1) expected.index = joined.index assert joined.index.is_monotonic tm.assert_frame_equal(joined, expected)
# Drain the sqft cursor into a list, then release it.
sqft_get_data = list(sqft_cursor)
sqft_cursor.close()

# City data as dataframes (one copy for joining, one kept as-is)
city_data_for_join_df = DataFrame(city_get_data)
city_data_for_join_df.columns = city_field_names

city_df = DataFrame(city_get_data)
city_df.columns = city_field_names

# City sqft data as a dataframe
sqft_df = DataFrame(sqft_get_data)
sqft_df.columns = sqft_field_names

# Join city rows with their sqft rows on the city code
sqft_by_city_code = sqft_df.set_index('CityCode')
joined_city_sqft = city_data_for_join_df.join(sqft_by_city_code, on='CityCode')

# Reshape wide -> long, keeping the city attributes as identifiers
master_melted_dataset_df = pandas.melt(
    joined_city_sqft,
    id_vars=["CityCode", "CityName", "Metro", "County", "State",
             "PopulationRank"])

#Question3
print("Question 3")
full_average = master_melted_dataset_df["value"].mean()

print("Average of Price Sqft Dataset")
print(full_average, "\n")

print("Maximum of Price Sqft Dataset")
print(master_melted_dataset_df["value"].max(), "\n")

print("Minimum of Price Sqft Dataset")
print(master_melted_dataset_df["value"].min(), "\n")

#Question4
def _read_one_data(self, url, params):
    """Read the price history (and optionally actions) for one symbol.

    ``params['symbol']`` is formatted into ``url``; the response text is
    searched for an embedded JSON document, whose 'HistoricalPriceStore'
    entry supplies the data.

    Returns a DataFrame indexed by 'Date' with the columns High, Low, Open,
    Close, Volume and 'Adj Close'; 'Ret_Index', 'Dividends' and 'Splits'
    are added depending on ``self.ret_index``, ``self.get_actions`` and the
    events present in the data.

    Raises RemoteDataError when an expected key is missing from the JSON.
    """

    symbol = params['symbol']
    # NOTE: this removes 'symbol' from the caller's dict, not from a copy.
    del params['symbol']
    url = url.format(symbol)

    resp = self._get_response(url, params=params)
    # Pattern for the JSON blob assigned to `root.App.main` in the page.
    ptrn = r'root\.App\.main = (.*?);\n}\(this\)\);'
    try:
        # NOTE(review): if the pattern does not match, re.search returns
        # None and `.group` raises AttributeError, which the KeyError
        # handler below does not catch.
        j = json.loads(re.search(ptrn, resp.text, re.DOTALL).group(1))
        data = j['context']['dispatcher']['stores']['HistoricalPriceStore']
    except KeyError:
        msg = 'No data fetched for symbol {} using {}'
        raise RemoteDataError(msg.format(symbol, self.__class__.__name__))

    # price data
    prices = DataFrame(data['prices'])
    prices.columns = [col.capitalize() for col in prices.columns]
    # Epoch seconds -> calendar date -> datetime (drops the time of day).
    prices['Date'] = to_datetime(
        to_datetime(prices['Date'], unit='s').dt.date)

    # Keep only rows whose 'Data' entry is null, when that column exists.
    if 'Data' in prices.columns:
        prices = prices[prices['Data'].isnull()]
    prices = prices[[
        'Date', 'High', 'Low', 'Open', 'Close', 'Volume', 'Adjclose'
    ]]
    prices = prices.rename(columns={'Adjclose': 'Adj Close'})

    prices = prices.set_index('Date')
    prices = prices.sort_index().dropna(how='all')

    if self.ret_index:
        prices['Ret_Index'] = \
            _calc_return_index(prices['Adj Close'])
    if self.adjust_price:
        prices = _adjust_prices(prices)

    # dividends & splits data
    if self.get_actions and data['eventsData']:

        actions = DataFrame(data['eventsData'])
        actions.columns = [col.capitalize() for col in actions.columns]
        actions['Date'] = to_datetime(
            to_datetime(actions['Date'], unit='s').dt.date)

        types = actions['Type'].unique()
        if 'DIVIDEND' in types:
            divs = actions[actions.Type == 'DIVIDEND'].copy()
            divs = divs[['Date', 'Amount']].reset_index(drop=True)
            divs = divs.set_index('Date')
            divs = divs.rename(columns={'Amount': 'Dividends'})
            prices = prices.join(divs, how='outer')

        if 'SPLIT' in types:
            splits = actions[actions.Type == 'SPLIT'].copy()
            # SECURITY NOTE(review): eval() runs text taken from the remote
            # response; consider parsing the ratio explicitly instead.
            splits['SplitRatio'] = splits['Splitratio'].apply(
                lambda x: eval(x))
            splits = splits.reset_index(drop=True)
            splits = splits.set_index('Date')
            splits['Splits'] = 1.0 / splits['SplitRatio']
            prices = prices.join(splits['Splits'], how='outer')

            if 'DIVIDEND' in types and not self.adjust_dividends:
                # Adjust dividends to deal with splits
                # Cumulative product of the split factors, taken from the
                # latest date backwards; missing factors count as 1.
                adj = prices['Splits'].sort_index(
                    ascending=False).fillna(1).cumprod()
                adj = 1.0 / adj
                prices['Dividends'] = prices['Dividends'] * adj

    return prices
def options_to_rates(options, t_min=1. / 12., n_min=6):
    """
    Extract implied risk-free rates and dividend yield from
    standard European option quote file.

    For every expiry date, the mid call premium minus the mid put premium
    is fitted linearly against the strike; the rate is derived from the
    slope and the dividend yield from the intercept.

    ignore data:
    - with time to maturity < tMin (in fraction of years)
    - with fewer than nMin quotes per maturity date

    Parameters
    ----------
    options: pandas.DataFrame
        Option quotes, one row per quote
    t_min: float (default: 1 month)
        Minimum time to maturity in fraction of years
    n_min: int (default: 6)
        minimum number of quotes per maturity date
    """
    expiry_dates = []
    implied_interest_rates = []
    implied_dividend_yields = []

    for _, quotes in options.groupby(nm.EXPIRY_DATE):
        # Trade date, expiry and spot are read from the group's first row.
        first_label = quotes.index[0]
        trade_date = quotes[nm.TRADE_DATE][first_label]
        expiry_date = quotes[nm.EXPIRY_DATE][first_label]
        spot = quotes[nm.SPOT][first_label]

        time_to_maturity = (expiry_date - trade_date).days / 365.0

        # exclude groups with too short time to maturity
        if time_to_maturity < t_min:
            continue

        # extract the put and call quotes
        calls = quotes[quotes[nm.OPTION_TYPE] == nm.CALL_OPTION]
        puts = quotes[quotes[nm.OPTION_TYPE] == nm.PUT_OPTION]

        # exclude groups with too few data points
        if len(calls) < n_min or len(puts) < n_min:
            continue

        # mid premiums, indexed by strike
        call_premium = DataFrame(
            (calls[nm.PRICE_BID] + calls[nm.PRICE_ASK]) / 2.,
            columns=[CALL_PREMIUM])
        call_premium.index = np.array(calls[nm.STRIKE])

        put_premium = DataFrame(
            (puts[nm.PRICE_BID] + puts[nm.PRICE_ASK]) / 2.,
            columns=[PUT_PREMIUM])
        put_premium.index = np.array(puts[nm.STRIKE])

        # use 'inner' join because some strikes are not quoted for C and P
        all_quotes = call_premium.join(put_premium, how='inner')
        all_quotes[nm.STRIKE] = all_quotes.index
        all_quotes['C-P'] = all_quotes[CALL_PREMIUM] - all_quotes[PUT_PREMIUM]

        # least-squares line through (strike, C-P)
        strikes = np.array(all_quotes[nm.STRIKE])
        call_minus_put = np.array(all_quotes['C-P'])
        design = np.vstack([strikes, np.ones(len(strikes))]).T
        slope, intercept = np.linalg.lstsq(design, call_minus_put)[0]

        # calculate implied interest rate and implied div. yield
        implied_interest_rates.append(-np.log(-slope) / time_to_maturity)
        implied_dividend_yields.append(
            np.log(spot / intercept) / time_to_maturity)
        expiry_dates.append(expiry_date)

    rates = ds.riskfree_dividend_template().reindex(index=expiry_dates)
    rates[nm.INTEREST_RATE] = implied_interest_rates
    rates[nm.DIVIDEND_YIELD] = implied_dividend_yields

    return rates
def convert_amenities(df: pd.DataFrame) -> pd.DataFrame:
    """Replace the "amenities" column with its one-hot encoded columns.

    The encoding comes from ``one_hot_encode_amenities(df)`` and is joined
    onto ``df`` by index before the original column is dropped.
    """
    encoded = one_hot_encode_amenities(df)
    with_encoding = df.join(encoded)
    return with_encoding.drop(columns="amenities")
XT, BarycenterPredictor, EMDLoss, RndMarginalPredictor, Simulator, X, Y, )

# Minimal one-row fixtures shared by the tests below.
# One sensor with its position.
positions = DataFrame({"sensor_id": [0], "x": [10.0], "y": [10.0], "z": [10.0]})
# One hit belonging to event 0.
hits = DataFrame({"event_id": [0], "x": [1.0], "y": [1.0], "z": [1.0], "energy": [1.0]})
# One waveform linking sensor 0 and event 0.
waveforms = DataFrame({"sensor_id": [0], "event_id": [0], "charge": [20.0]})
# Waveforms with the position of their sensor joined in.
ext_waveforms = waveforms.join(positions.set_index("sensor_id"), on="sensor_id")


class Test(unittest.TestCase):
    """Smoke tests that build and print the imported objects."""

    def test_constructors(self):
        """Construct XT, Y and RndMarginalPredictor from the fixtures."""
        print(XT(hits))
        print(Y(ext_waveforms))
        print(RndMarginalPredictor(hits))

    def test_simulator(self):
        """Build a Simulator from the fixtures and draw one sample."""
        sim = Simulator(positions, hits, waveforms)
        xt, y = sim.sample()
        print(xt, y)

    def test_emd_loss(self):
        """Construct an EMDLoss."""
        loss = EMDLoss()
class TestJoin(object):
    """Tests for ``DataFrame.join``, ``merge`` and the ``libjoin`` helpers."""

    def setup_method(self, method):
        """Build the shared fixtures used by the tests below."""
        # aggregate multiple columns
        self.df = DataFrame({
            'key1': get_test_data(),
            'key2': get_test_data(),
            'data1': np.random.randn(N),
            'data2': np.random.randn(N)
        })

        # exclude a couple keys for fun
        self.df = self.df[self.df['key2'] > 1]

        self.df2 = DataFrame({
            'key1': get_test_data(n=N // 5),
            'key2': get_test_data(ngroups=NGROUPS // 2, n=N // 5),
            'value': np.random.randn(N // 5)
        })

        index, data = tm.getMixedTypeDict()
        self.target = DataFrame(data, index=index)

        # Join on string value
        self.source = DataFrame({
            'MergedA': data['A'],
            'MergedD': data['D']
        }, index=data['C'])

    def test_cython_left_outer_join(self):
        """Check ``libjoin.left_outer_join`` against hand-built indexers."""
        left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
        right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64)
        max_group = 5

        ls, rs = libjoin.left_outer_join(left, right, max_group)

        exp_ls = left.argsort(kind='mergesort')
        exp_rs = right.argsort(kind='mergesort')

        exp_li = a_(
            [0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 8, 8, 9, 10])
        exp_ri = a_(
            [0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 4, 5, 4, 5, -1, -1])

        # -1 in the expected positions marks "no match" and must survive
        # the take() below
        exp_ls = exp_ls.take(exp_li)
        exp_ls[exp_li == -1] = -1

        exp_rs = exp_rs.take(exp_ri)
        exp_rs[exp_ri == -1] = -1

        tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False)
        tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False)

    def test_cython_right_outer_join(self):
        """Check a right outer join built from ``left_outer_join``.

        The right join is obtained by calling ``libjoin.left_outer_join``
        with the two operands (and the two results) swapped.
        """
        left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
        right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64)
        max_group = 5

        rs, ls = libjoin.left_outer_join(right, left, max_group)

        exp_ls = left.argsort(kind='mergesort')
        exp_rs = right.argsort(kind='mergesort')

        #            0        1        1        1
        exp_li = a_([
            0, 1, 2, 3, 4, 5, 3, 4, 5, 3, 4, 5,
            #            2        2        4
            6, 7, 8, 6, 7, 8, -1
        ])
        exp_ri = a_([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6])

        exp_ls = exp_ls.take(exp_li)
        exp_ls[exp_li == -1] = -1

        exp_rs = exp_rs.take(exp_ri)
        exp_rs[exp_ri == -1] = -1

        tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False)
        tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False)

    def test_cython_inner_join(self):
        """Check ``libjoin.inner_join`` against hand-built indexers."""
        left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
        right = a_([1, 1, 0, 4, 2, 2, 1, 4], dtype=np.int64)
        max_group = 5

        ls, rs = libjoin.inner_join(left, right, max_group)

        exp_ls = left.argsort(kind='mergesort')
        exp_rs = right.argsort(kind='mergesort')

        exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 8, 8])
        exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 4, 5, 4, 5])

        exp_ls = exp_ls.take(exp_li)
        exp_ls[exp_li == -1] = -1

        exp_rs = exp_rs.take(exp_ri)
        exp_rs[exp_ri == -1] = -1

        tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False)
        tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False)

    def test_left_outer_join(self):
        """``merge`` results (default ``how``) checked as ``how='left'``."""
        joined_key2 = merge(self.df, self.df2, on='key2')
        _check_join(self.df, self.df2, joined_key2, ['key2'], how='left')

        joined_both = merge(self.df, self.df2)
        _check_join(self.df, self.df2, joined_both, ['key1', 'key2'],
                    how='left')

    def test_right_outer_join(self):
        """``merge(..., how='right')`` on one key and on both keys."""
        joined_key2 = merge(self.df, self.df2, on='key2', how='right')
        _check_join(self.df, self.df2, joined_key2, ['key2'], how='right')

        joined_both = merge(self.df, self.df2, how='right')
        _check_join(self.df, self.df2, joined_both, ['key1', 'key2'],
                    how='right')

    def test_full_outer_join(self):
        """``merge(..., how='outer')`` on one key and on both keys."""
        joined_key2 = merge(self.df, self.df2, on='key2', how='outer')
        _check_join(self.df, self.df2, joined_key2, ['key2'], how='outer')

        joined_both = merge(self.df, self.df2, how='outer')
        _check_join(self.df, self.df2, joined_both, ['key1', 'key2'],
                    how='outer')

    def test_inner_join(self):
        """``merge(..., how='inner')`` on one key and on both keys."""
        joined_key2 = merge(self.df, self.df2, on='key2', how='inner')
        _check_join(self.df, self.df2, joined_key2, ['key2'], how='inner')

        joined_both = merge(self.df, self.df2, how='inner')
        _check_join(self.df, self.df2, joined_both, ['key1', 'key2'],
                    how='inner')

    def test_handle_overlap(self):
        """Overlapping non-key columns receive the requested suffixes."""
        joined = merge(self.df, self.df2, on='key2',
                       suffixes=['.foo', '.bar'])

        assert 'key1.foo' in joined
        assert 'key1.bar' in joined

    def test_handle_overlap_arbitrary_key(self):
        """Suffixes are applied when merging on differently named keys."""
        joined = merge(self.df, self.df2,
                       left_on='key2', right_on='key1',
                       suffixes=['.foo', '.bar'])
        assert 'key1.foo' in joined
        assert 'key2.bar' in joined

    def test_join_on(self):
        """``DataFrame.join(..., on=col)`` in several scenarios."""
        target = self.target
        source = self.source

        merged = target.join(source, on='C')
        tm.assert_series_equal(merged['MergedA'], target['A'],
                               check_names=False)
        tm.assert_series_equal(merged['MergedD'], target['D'],
                               check_names=False)

        # join with duplicates (fix regression from DataFrame/Matrix merge)
        df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']})
        df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c'])
        joined = df.join(df2, on='key')
        expected = DataFrame({
            'key': ['a', 'a', 'b', 'b', 'c'],
            'value': [0, 0, 1, 1, 2]
        })
        assert_frame_equal(joined, expected)

        # Test when some are missing
        df_a = DataFrame([[1], [2], [3]], index=['a', 'b', 'c'],
                         columns=['one'])
        df_b = DataFrame([['foo'], ['bar']], index=[1, 2],
                         columns=['two'])
        df_c = DataFrame([[1], [2]], index=[1, 2],
                         columns=['three'])
        joined = df_a.join(df_b, on='one')
        joined = joined.join(df_c, on='one')
        assert np.isnan(joined['two']['c'])
        assert np.isnan(joined['three']['c'])

        # merge column not present
        with pytest.raises(KeyError, match="^'E'$"):
            target.join(source, on='E')

        # overlap
        source_copy = source.copy()
        source_copy['A'] = 0
        msg = ("You are trying to merge on float64 and object columns. If"
               " you wish to proceed you should use pd.concat")
        with pytest.raises(ValueError, match=msg):
            target.join(source_copy, on='A')

    def test_join_on_fails_with_different_right_index(self):
        """One ``left_on`` key against a 2-level right index must raise."""
        df = DataFrame({
            'a': np.random.choice(['m', 'f'], size=3),
            'b': np.random.randn(3)
        })
        df2 = DataFrame(
            {
                'a': np.random.choice(['m', 'f'], size=10),
                'b': np.random.randn(10)
            },
            index=tm.makeCustomIndex(10, 2))
        msg = (r'len\(left_on\) must equal the number of levels in the index'
               ' of "right"')
        with pytest.raises(ValueError, match=msg):
            merge(df, df2, left_on='a', right_index=True)

    def test_join_on_fails_with_different_left_index(self):
        """One ``right_on`` key against a 2-level left index must raise."""
        df = DataFrame(
            {
                'a': np.random.choice(['m', 'f'], size=3),
                'b': np.random.randn(3)
            },
            index=tm.makeCustomIndex(3, 2))
        df2 = DataFrame({
            'a': np.random.choice(['m', 'f'], size=10),
            'b': np.random.randn(10)
        })
        msg = (r'len\(right_on\) must equal the number of levels in the index'
               ' of "left"')
        with pytest.raises(ValueError, match=msg):
            merge(df, df2, right_on='b', left_index=True)

    def test_join_on_fails_with_different_column_counts(self):
        """``left_on`` and ``right_on`` of different lengths must raise."""
        df = DataFrame({
            'a': np.random.choice(['m', 'f'], size=3),
            'b': np.random.randn(3)
        })
        df2 = DataFrame(
            {
                'a': np.random.choice(['m', 'f'], size=10),
                'b': np.random.randn(10)
            },
            index=tm.makeCustomIndex(10, 2))
        msg = r"len\(right_on\) must equal len\(left_on\)"
        with pytest.raises(ValueError, match=msg):
            merge(df, df2, right_on='a', left_on=['a', 'b'])

    @pytest.mark.parametrize("wrong_type", [2, 'str', None, np.array([0, 1])])
    def test_join_on_fails_with_wrong_object_type(self, wrong_type):
        """``merge`` raises ``TypeError`` for a non-Series/DataFrame operand."""
        # GH12081 - original issue

        # GH21220 - merging of Series and DataFrame is now allowed
        # Edited test to remove the Series object from test parameters

        df = DataFrame({'a': [1, 1]})
        msg = ("Can only merge Series or DataFrame objects, a {} was passed".
               format(str(type(wrong_type))))
        with pytest.raises(TypeError, match=msg):
            merge(wrong_type, df, left_on='a', right_on='a')
        with pytest.raises(TypeError, match=msg):
            merge(df, wrong_type, left_on='a', right_on='a')

    def test_join_on_pass_vector(self):
        """Passing the key values as ``on`` matches joining on the column."""
        expected = self.target.join(self.source, on='C')
        del expected['C']

        # the key column is removed from the frame and passed by value
        join_col = self.target.pop('C')
        result = self.target.join(self.source, on=join_col)
        assert_frame_equal(result, expected)

    def test_join_with_len0(self):
        """Joining against an empty frame keeps its columns."""
        # nothing to merge
        merged = self.target.join(self.source.reindex([]), on='C')
        for col in self.source:
            assert col in merged
            assert merged[col].isna().all()

        merged2 = self.target.join(self.source.reindex([]), on='C',
                                   how='inner')
        tm.assert_index_equal(merged2.columns, merged.columns)
        assert len(merged2) == 0

    def test_join_on_inner(self):
        """Inner join on a column equals the default join minus NaN rows."""
        df = DataFrame({'key': ['a', 'a', 'd', 'b', 'b', 'c']})
        df2 = DataFrame({'value': [0, 1]}, index=['a', 'b'])

        joined = df.join(df2, on='key', how='inner')

        expected = df.join(df2, on='key')
        expected = expected[expected['value'].notna()]
        tm.assert_series_equal(joined['key'], expected['key'],
                               check_dtype=False)
        tm.assert_series_equal(joined['value'], expected['value'],
                               check_dtype=False)
        tm.assert_index_equal(joined.index, expected.index)

    def test_join_on_singlekey_list(self):
        """``on=['key']`` behaves like ``on='key'``."""
        df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']})
        df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c'])

        # corner cases
        joined = df.join(df2, on=['key'])
        expected = df.join(df2, on='key')

        assert_frame_equal(joined, expected)

    def test_join_on_series(self):
        """Joining a Series matches joining the one-column frame."""
        result = self.target.join(self.source['MergedA'], on='C')
        expected = self.target.join(self.source[['MergedA']], on='C')
        assert_frame_equal(result, expected)

    def test_join_on_series_buglet(self):
        """Joining a named Series on a column with repeated keys."""
        # GH #638
        df = DataFrame({'a': [1, 1]})
        ds = Series([2], index=[1], name='b')
        result = df.join(ds, on='a')
        expected = DataFrame({'a': [1, 1],
                              'b': [2, 2]}, index=df.index)
        tm.assert_frame_equal(result, expected)

    def test_join_index_mixed(self, join_type):
        """Index join of mixed-dtype frames, in both operand orders."""
        # no overlapping blocks
        df1 = DataFrame(index=np.arange(10))
        df1['bool'] = True
        df1['string'] = 'foo'

        df2 = DataFrame(index=np.arange(5, 15))
        df2['int'] = 1
        df2['float'] = 1.

        joined = df1.join(df2, how=join_type)
        expected = _join_by_hand(df1, df2, how=join_type)
        assert_frame_equal(joined, expected)

        joined = df2.join(df1, how=join_type)
        expected = _join_by_hand(df2, df1, how=join_type)
        assert_frame_equal(joined, expected)

    def test_join_index_mixed_overlap(self):
        """Index join of frames whose column names fully overlap."""
        df1 = DataFrame({
            'A': 1.,
            'B': 2,
            'C': 'foo',
            'D': True
        }, index=np.arange(10), columns=['A', 'B', 'C', 'D'])
        assert df1['B'].dtype == np.int64
        assert df1['D'].dtype == np.bool_

        df2 = DataFrame({
            'A': 1.,
            'B': 2,
            'C': 'foo',
            'D': True
        }, index=np.arange(0, 10, 2), columns=['A', 'B', 'C', 'D'])

        # overlap
        joined = df1.join(df2, lsuffix='_one', rsuffix='_two')
        expected_columns = [
            'A_one', 'B_one', 'C_one', 'D_one',
            'A_two', 'B_two', 'C_two', 'D_two'
        ]
        df1.columns = expected_columns[:4]
        df2.columns = expected_columns[4:]
        expected = _join_by_hand(df1, df2)
        assert_frame_equal(joined, expected)

    def test_join_empty_bug(self):
        """Smoke test: outer-joining onto an empty frame does not raise."""
        # generated an exception in 0.4.3
        x = DataFrame()
        x.join(DataFrame([3], index=[0], columns=['A']), how='outer')

    def test_join_unconsolidated(self):
        """Smoke test: join after a column was added with ``a['c'] = c``."""
        # GH #331
        a = DataFrame(randn(30, 2), columns=['a', 'b'])
        c = Series(randn(30))
        a['c'] = c
        d = DataFrame(randn(30, 1), columns=['q'])

        # it works!
        a.join(d)
        d.join(a)

    def test_join_multiindex(self):
        """Outer join of two frames with partially overlapping MultiIndexes."""
        index1 = MultiIndex.from_arrays(
            [['a', 'a', 'a', 'b', 'b', 'b'],
             [1, 2, 3, 1, 2, 3]],
            names=['first', 'second'])

        index2 = MultiIndex.from_arrays(
            [['b', 'b', 'b', 'c', 'c', 'c'],
             [1, 2, 3, 1, 2, 3]],
            names=['first', 'second'])

        df1 = DataFrame(data=np.random.randn(6), index=index1,
                        columns=['var X'])
        df2 = DataFrame(data=np.random.randn(6), index=index2,
                        columns=['var Y'])

        df1 = df1.sort_index(level=0)
        df2 = df2.sort_index(level=0)

        joined = df1.join(df2, how='outer')
        ex_index = Index(index1.values).union(Index(index2.values))
        expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
        expected.index.names = index1.names
        assert_frame_equal(joined, expected)
        assert joined.index.names == index1.names

        # same check with the inputs sorted on the second level
        df1 = df1.sort_index(level=1)
        df2 = df2.sort_index(level=1)

        joined = df1.join(df2, how='outer').sort_index(level=0)
        ex_index = Index(index1.values).union(Index(index2.values))
        expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
        expected.index.names = index1.names

        assert_frame_equal(joined, expected)
        assert joined.index.names == index1.names

    def test_join_inner_multiindex(self):
        """Inner join of two key columns against a 2-level MultiIndex."""
        key1 = [
            'bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux', 'qux',
            'snap'
        ]
        key2 = [
            'two', 'one', 'three', 'one', 'two', 'one', 'two', 'two',
            'three', 'one'
        ]

        data = np.random.randn(len(key1))
        data = DataFrame({'key1': key1, 'key2': key2,
                          'data': data})

        index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                                   ['one', 'two', 'three']],
                           codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                                  [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                           names=['first', 'second'])
        to_join = DataFrame(np.random.randn(10, 3), index=index,
                            columns=['j_one', 'j_two', 'j_three'])

        joined = data.join(to_join, on=['key1', 'key2'], how='inner')
        expected = merge(data, to_join.reset_index(),
                         left_on=['key1', 'key2'],
                         right_on=['first', 'second'], how='inner',
                         sort=False)

        # the same join expressed as a merge with the operands swapped
        expected2 = merge(to_join, data,
                          right_on=['key1', 'key2'], left_index=True,
                          how='inner', sort=False)
        assert_frame_equal(joined, expected2.reindex_like(joined))

        expected = expected.drop(['first', 'second'], axis=1)
        expected.index = joined.index

        assert joined.index.is_monotonic_increasing
        assert_frame_equal(joined, expected)

    def test_join_hierarchical_mixed(self):
        """Merging 2-level columns with flat columns warns but succeeds."""
        # GH 2024
        df = DataFrame([(1, 2, 3), (4, 5, 6)], columns=['a', 'b', 'c'])
        new_df = df.groupby(['a']).agg({'b': [np.mean, np.sum]})
        other_df = DataFrame(
            [(1, 2, 3), (7, 10, 6)], columns=['a', 'b', 'd'])
        other_df.set_index('a', inplace=True)
        # GH 9455, 12219
        with tm.assert_produces_warning(UserWarning):
            result = merge(new_df, other_df, left_index=True,
                           right_index=True)
        assert ('b', 'mean') in result
        assert 'b' in result

    def test_join_float64_float32(self):
        """Column dtypes are preserved through ``join`` and ``merge``."""
        a = DataFrame(randn(10, 2), columns=['a', 'b'], dtype=np.float64)
        b = DataFrame(randn(10, 1), columns=['c'], dtype=np.float32)
        joined = a.join(b)
        assert joined.dtypes['a'] == 'float64'
        assert joined.dtypes['b'] == 'float64'
        assert joined.dtypes['c'] == 'float32'

        a = np.random.randint(0, 5, 100).astype('int64')
        b = np.random.random(100).astype('float64')
        c = np.random.random(100).astype('float32')
        df = DataFrame({'a': a, 'b': b, 'c': c})
        xpdf = DataFrame({'a': a, 'b': b, 'c': c})
        s = DataFrame(np.random.random(5).astype('float32'), columns=['md'])
        rs = df.merge(s, left_on='a', right_index=True)
        assert rs.dtypes['a'] == 'int64'
        assert rs.dtypes['b'] == 'float64'
        assert rs.dtypes['c'] == 'float32'
        assert rs.dtypes['md'] == 'float32'

        xp = xpdf.merge(s, left_on='a', right_index=True)
        assert_frame_equal(rs, xp)

    def test_join_many_non_unique_index(self):
        """Joining a list of frames on non-unique indexes matches merges."""
        df1 = DataFrame({"a": [1, 1], "b": [1, 1], "c": [10, 20]})
        df2 = DataFrame({"a": [1, 1], "b": [1, 2], "d": [100, 200]})
        df3 = DataFrame({"a": [1, 1], "b": [1, 2], "e": [1000, 2000]})
        idf1 = df1.set_index(["a", "b"])
        idf2 = df2.set_index(["a", "b"])
        idf3 = df3.set_index(["a", "b"])

        result = idf1.join([idf2, idf3], how='outer')

        df_partially_merged = merge(df1, df2, on=['a', 'b'], how='outer')
        expected = merge(df_partially_merged, df3, on=['a', 'b'], how='outer')

        result = result.reset_index()
        expected = expected[result.columns]
        expected['a'] = expected.a.astype('int64')
        expected['b'] = expected.b.astype('int64')
        assert_frame_equal(result, expected)

        df1 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 1], "c": [10, 20, 30]})
        df2 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2],
                         "d": [100, 200, 300]})
        df3 = DataFrame({
            "a": [1, 1, 1],
            "b": [1, 1, 2],
            "e": [1000, 2000, 3000]
        })
        idf1 = df1.set_index(["a", "b"])
        idf2 = df2.set_index(["a", "b"])
        idf3 = df3.set_index(["a", "b"])
        result = idf1.join([idf2, idf3], how='inner')

        df_partially_merged = merge(df1, df2, on=['a', 'b'], how='inner')
        expected = merge(df_partially_merged, df3, on=['a', 'b'], how='inner')

        result = result.reset_index()

        assert_frame_equal(result, expected.loc[:, result.columns])

        # GH 11519
        df = DataFrame({
            'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
            'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
            'C': np.random.randn(8),
            'D': np.random.randn(8)
        })
        s = Series(np.repeat(np.arange(8), 2),
                   index=np.repeat(np.arange(8), 2), name='TEST')
        inner = df.join(s, how='inner')
        outer = df.join(s, how='outer')
        left = df.join(s, how='left')
        right = df.join(s, how='right')
        assert_frame_equal(inner, outer)
        assert_frame_equal(inner, left)
        assert_frame_equal(inner, right)

    def test_join_sort(self):
        """``sort=True`` orders the result by key; ``sort=False`` does not."""
        left = DataFrame({
            'key': ['foo', 'bar', 'baz', 'foo'],
            'value': [1, 2, 3, 4]
        })
        right = DataFrame({'value2': ['a', 'b', 'c']},
                          index=['bar', 'baz', 'foo'])

        joined = left.join(right, on='key', sort=True)
        expected = DataFrame(
            {
                'key': ['bar', 'baz', 'foo', 'foo'],
                'value': [2, 3, 1, 4],
                'value2': ['a', 'b', 'c', 'c']
            },
            index=[1, 2, 0, 3])
        assert_frame_equal(joined, expected)

        # smoke test
        joined = left.join(right, on='key', sort=False)
        tm.assert_index_equal(joined.index, pd.Index(list(range(4))))

    def test_join_mixed_non_unique_index(self):
        """Join on a mixed int/str index where one side has duplicates."""
        # GH 12814, unorderable types in py3 with a non-unique index
        df1 = DataFrame({'a': [1, 2, 3, 4]}, index=[1, 2, 3, 'a'])
        df2 = DataFrame({'b': [5, 6, 7, 8]}, index=[1, 3, 3, 4])
        result = df1.join(df2)
        expected = DataFrame(
            {
                'a': [1, 2, 3, 3, 4],
                'b': [5, np.nan, 6, 7, np.nan]
            },
            index=[1, 2, 3, 3, 'a'])
        tm.assert_frame_equal(result, expected)

        df3 = DataFrame({'a': [1, 2, 3, 4]}, index=[1, 2, 2, 'a'])
        df4 = DataFrame({'b': [5, 6, 7, 8]}, index=[1, 2, 3, 4])
        result = df3.join(df4)
        expected = DataFrame({
            'a': [1, 2, 3, 4],
            'b': [5, 6, 6, np.nan]
        }, index=[1, 2, 2, 'a'])
        tm.assert_frame_equal(result, expected)

    def test_join_non_unique_period_index(self):
        """Inner join against a frame whose PeriodIndex repeats each entry."""
        # GH #16871
        index = pd.period_range('2016-01-01', periods=16, freq='M')
        df = DataFrame([i for i in range(len(index))],
                       index=index, columns=['pnum'])
        df2 = concat([df, df])
        result = df.join(df2, how='inner', rsuffix='_df2')
        expected = DataFrame(
            np.tile(np.arange(16, dtype=np.int64).repeat(2).reshape(-1, 1),
                    2),
            columns=['pnum', 'pnum_df2'], index=df2.sort_index().index)
        tm.assert_frame_equal(result, expected)

    def test_mixed_type_join_with_suffix(self):
        """Smoke test: join groupby mean and count frames with ``rsuffix``."""
        # GH #916
        df = DataFrame(np.random.randn(20, 6),
                       columns=['a', 'b', 'c', 'd', 'e', 'f'])
        df.insert(0, 'id', 0)
        df.insert(5, 'dt', 'foo')

        grouped = df.groupby('id')
        mn = grouped.mean()
        cn = grouped.count()

        # it works!
        mn.join(cn, rsuffix='_right')

    def test_join_many(self):
        """Join a list of frames with equal and with differing indexes."""
        df = DataFrame(np.random.randn(10, 6), columns=list('abcdef'))
        df_list = [df[['a', 'b']], df[['c', 'd']], df[['e', 'f']]]

        joined = df_list[0].join(df_list[1:])
        tm.assert_frame_equal(joined, df)

        df_list = [
            df[['a', 'b']][:-2],
            df[['c', 'd']][2:],
            df[['e', 'f']][1:9]
        ]

        def _check_diff_index(df_list, result, exp_index):
            # reindex every piece to the expected index, then join them
            reindexed = [x.reindex(exp_index) for x in df_list]
            expected = reindexed[0].join(reindexed[1:])
            tm.assert_frame_equal(result, expected)

        # different join types
        joined = df_list[0].join(df_list[1:], how='outer')
        _check_diff_index(df_list, joined, df.index)

        joined = df_list[0].join(df_list[1:])
        _check_diff_index(df_list, joined, df_list[0].index)

        joined = df_list[0].join(df_list[1:], how='inner')
        _check_diff_index(df_list, joined, df.index[2:8])

        msg = "Joining multiple DataFrames only supported for joining on index"
        with pytest.raises(ValueError, match=msg):
            df_list[0].join(df_list[1:], on='a')

    def test_join_many_mixed(self):
        """Joining column subsets of mixed dtype rebuilds the frame."""
        df = DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D'])
        df['key'] = ['foo', 'bar'] * 4
        df1 = df.loc[:, ['A', 'B']]
        df2 = df.loc[:, ['C', 'D']]
        df3 = df.loc[:, ['key']]

        result = df1.join([df2, df3])
        assert_frame_equal(result, df)

    def test_join_dups(self):
        """Join and merge frames that carry duplicate column names."""
        # joining dups
        df = concat([
            DataFrame(np.random.randn(10, 4),
                      columns=['A', 'A', 'B', 'B']),
            DataFrame(np.random.randint(0, 10, size=20).reshape(10, 2),
                      columns=['A', 'C'])
        ], axis=1)

        expected = concat([df, df], axis=1)
        result = df.join(df, rsuffix='_2')
        result.columns = expected.columns
        assert_frame_equal(result, expected)

        # GH 4975, invalid join on dups
        w = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
        x = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
        y = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
        z = DataFrame(np.random.randn(4, 2), columns=["x", "y"])

        dta = x.merge(y, left_index=True, right_index=True).merge(
            z, left_index=True, right_index=True, how="outer")
        dta = dta.merge(w, left_index=True, right_index=True)
        expected = concat([x, y, z, w], axis=1)
        expected.columns = [
            'x_x', 'y_x', 'x_y', 'y_y', 'x_x', 'y_x', 'x_y', 'y_y'
        ]
        assert_frame_equal(dta, expected)

    def test_join_multi_to_multi(self, join_type):
        """Join a 3-level MultiIndex frame to a 2-level one via ``on``."""
        # GH 20475
        leftindex = MultiIndex.from_product(
            [list('abc'), list('xy'), [1, 2]],
            names=['abc', 'xy', 'num'])
        left = DataFrame({'v1': range(12)}, index=leftindex)

        rightindex = MultiIndex.from_product(
            [list('abc'), list('xy')],
            names=['abc', 'xy'])
        right = DataFrame({'v2': [100 * i for i in range(1, 7)]},
                          index=rightindex)

        result = left.join(right, on=['abc', 'xy'], how=join_type)
        expected = (left.reset_index()
                        .merge(right.reset_index(),
                               on=['abc', 'xy'], how=join_type)
                        .set_index(['abc', 'xy', 'num']))
        assert_frame_equal(expected, result)

        msg = (r'len\(left_on\) must equal the number of levels in the index'
               ' of "right"')
        with pytest.raises(ValueError, match=msg):
            left.join(right, on='xy', how=join_type)

        with pytest.raises(ValueError, match=msg):
            right.join(left, on=['abc', 'xy'], how=join_type)

    def test_join_on_tz_aware_datetimeindex(self):
        """Join a tz-aware datetime column against a tz-aware index."""
        # GH 23931
        df1 = pd.DataFrame({
            'date': pd.date_range(start='2018-01-01', periods=5,
                                  tz='America/Chicago'),
            'vals': list('abcde')
        })

        df2 = pd.DataFrame({
            'date': pd.date_range(start='2018-01-03', periods=5,
                                  tz='America/Chicago'),
            'vals_2': list('tuvwx')
        })
        result = df1.join(df2.set_index('date'), on='date')
        expected = df1.copy()
        # NOTE(review): both ranges contain 2018-01-03..05, yet the expected
        # column is all-NaN. Presumably this pins the library behaviour at
        # the time the test was written - confirm before changing it.
        expected['vals_2'] = pd.Series([np.nan] * len(expected), dtype=object)
        assert_frame_equal(result, expected)
class TestJoin(tm.TestCase):
    """Exercise ``merge``, ``DataFrame.join``, ``Panel.join`` and the
    low-level ``_join`` indexer routines."""

    def setUp(self):
        # Build the fixtures shared by the merge/join tests below.

        # aggregate multiple columns
        self.df = DataFrame({
            'key1': get_test_data(),
            'key2': get_test_data(),
            'data1': np.random.randn(N),
            'data2': np.random.randn(N)
        })

        # exclude a couple keys for fun
        self.df = self.df[self.df['key2'] > 1]

        self.df2 = DataFrame({
            'key1': get_test_data(n=N // 5),
            'key2': get_test_data(ngroups=NGROUPS // 2, n=N // 5),
            'value': np.random.randn(N // 5)
        })

        index, data = tm.getMixedTypeDict()
        self.target = DataFrame(data, index=index)

        # Join on string value
        self.source = DataFrame({
            'MergedA': data['A'],
            'MergedD': data['D']
        }, index=data['C'])

    def test_cython_left_outer_join(self):
        # Expected indexers are written as positions into a stable
        # (mergesort) argsort of each side; -1 marks "no matching row".
        left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
        right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64)
        max_group = 5

        ls, rs = _join.left_outer_join(left, right, max_group)

        exp_ls = left.argsort(kind='mergesort')
        exp_rs = right.argsort(kind='mergesort')

        exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5,
                     6, 6, 7, 7, 8, 8, 9, 10])
        exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3,
                     4, 5, 4, 5, 4, 5, -1, -1])

        exp_ls = exp_ls.take(exp_li)
        exp_ls[exp_li == -1] = -1

        exp_rs = exp_rs.take(exp_ri)
        exp_rs[exp_ri == -1] = -1

        self.assert_numpy_array_equal(ls, exp_ls, check_dtype=False)
        self.assert_numpy_array_equal(rs, exp_rs, check_dtype=False)

    def test_cython_right_outer_join(self):
        # A right join is obtained by calling left_outer_join with the
        # operands (and the returned indexers) swapped.
        left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
        right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64)
        max_group = 5

        rs, ls = _join.left_outer_join(right, left, max_group)

        exp_ls = left.argsort(kind='mergesort')
        exp_rs = right.argsort(kind='mergesort')

        # Right keys in sorted order are 0, 1, 1, 1, 2, 2, 4; each group of
        # left positions below belongs to the key noted at the line's end.
        exp_li = a_([0, 1, 2,    # 0
                     3, 4, 5,    # 1
                     3, 4, 5,    # 1
                     3, 4, 5,    # 1
                     6, 7, 8,    # 2
                     6, 7, 8,    # 2
                     -1])        # 4 (no match on the left)
        exp_ri = a_([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5,
                     6])

        exp_ls = exp_ls.take(exp_li)
        exp_ls[exp_li == -1] = -1

        exp_rs = exp_rs.take(exp_ri)
        exp_rs[exp_ri == -1] = -1

        self.assert_numpy_array_equal(ls, exp_ls, check_dtype=False)
        self.assert_numpy_array_equal(rs, exp_rs, check_dtype=False)

    def test_cython_inner_join(self):
        # Same indexer convention as the outer-join tests above.
        left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
        right = a_([1, 1, 0, 4, 2, 2, 1, 4], dtype=np.int64)
        max_group = 5

        ls, rs = _join.inner_join(left, right, max_group)

        exp_ls = left.argsort(kind='mergesort')
        exp_rs = right.argsort(kind='mergesort')

        exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5,
                     6, 6, 7, 7, 8, 8])
        exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3,
                     4, 5, 4, 5, 4, 5])

        exp_ls = exp_ls.take(exp_li)
        exp_ls[exp_li == -1] = -1

        exp_rs = exp_rs.take(exp_ri)
        exp_rs[exp_ri == -1] = -1

        self.assert_numpy_array_equal(ls, exp_ls, check_dtype=False)
        self.assert_numpy_array_equal(rs, exp_rs, check_dtype=False)

    def test_left_outer_join(self):
        # NOTE(review): merge() is called without ``how=`` here while the
        # result is validated as a 'left' join -- confirm this is intended.
        joined_key2 = merge(self.df, self.df2, on='key2')
        _check_join(self.df, self.df2, joined_key2, ['key2'], how='left')

        joined_both = merge(self.df, self.df2)
        _check_join(self.df, self.df2, joined_both, ['key1', 'key2'],
                    how='left')

    def test_right_outer_join(self):
        # Right join on one key, then on all common columns.
        joined_key2 = merge(self.df, self.df2, on='key2', how='right')
        _check_join(self.df, self.df2, joined_key2, ['key2'], how='right')

        joined_both = merge(self.df, self.df2, how='right')
        _check_join(self.df, self.df2, joined_both, ['key1', 'key2'],
                    how='right')

    def test_full_outer_join(self):
        # Outer join on one key, then on all common columns.
        joined_key2 = merge(self.df, self.df2, on='key2', how='outer')
        _check_join(self.df, self.df2, joined_key2, ['key2'], how='outer')

        joined_both = merge(self.df, self.df2, how='outer')
        _check_join(self.df, self.df2, joined_both, ['key1', 'key2'],
                    how='outer')

    def test_inner_join(self):
        # Inner join on one key, then on all common columns.
        joined_key2 = merge(self.df, self.df2, on='key2', how='inner')
        _check_join(self.df, self.df2, joined_key2, ['key2'], how='inner')

        joined_both = merge(self.df, self.df2, how='inner')
        _check_join(self.df, self.df2, joined_both, ['key1', 'key2'],
                    how='inner')

    def test_handle_overlap(self):
        # Non-key columns present on both sides receive the suffixes.
        joined = merge(self.df, self.df2, on='key2',
                       suffixes=['.foo', '.bar'])

        self.assertIn('key1.foo', joined)
        self.assertIn('key1.bar', joined)

    def test_handle_overlap_arbitrary_key(self):
        # Suffixing when the left and right key columns have different names.
        joined = merge(self.df, self.df2,
                       left_on='key2', right_on='key1',
                       suffixes=['.foo', '.bar'])
        self.assertIn('key1.foo', joined)
        self.assertIn('key2.bar', joined)

    def test_join_on(self):
        target = self.target
        source = self.source

        merged = target.join(source, on='C')
        self.assert_series_equal(merged['MergedA'], target['A'],
                                 check_names=False)
        self.assert_series_equal(merged['MergedD'], target['D'],
                                 check_names=False)

        # join with duplicates (fix regression from DataFrame/Matrix merge)
        df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']})
        df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c'])
        joined = df.join(df2, on='key')
        expected = DataFrame({
            'key': ['a', 'a', 'b', 'b', 'c'],
            'value': [0, 0, 1, 1, 2]
        })
        assert_frame_equal(joined, expected)

        # Test when some are missing
        df_a = DataFrame([[1], [2], [3]], index=['a', 'b', 'c'],
                         columns=['one'])
        df_b = DataFrame([['foo'], ['bar']], index=[1, 2],
                         columns=['two'])
        df_c = DataFrame([[1], [2]], index=[1, 2],
                         columns=['three'])
        joined = df_a.join(df_b, on='one')
        joined = joined.join(df_c, on='one')
        self.assertTrue(np.isnan(joined['two']['c']))
        self.assertTrue(np.isnan(joined['three']['c']))

        # merge column not present
        self.assertRaises(KeyError, target.join, source, on='E')

        # overlap
        source_copy = source.copy()
        source_copy['A'] = 0
        self.assertRaises(ValueError, target.join, source_copy, on='A')

    def test_join_on_fails_with_different_right_index(self):
        # Fixtures are built outside the assertRaises block so that only
        # merge() itself can satisfy the expectation.
        df = DataFrame({
            'a': np.random.choice(['m', 'f'], size=3),
            'b': np.random.randn(3)
        })
        df2 = DataFrame({
            'a': np.random.choice(['m', 'f'], size=10),
            'b': np.random.randn(10)
        }, index=tm.makeCustomIndex(10, 2))
        with tm.assertRaises(ValueError):
            merge(df, df2, left_on='a', right_index=True)

    def test_join_on_fails_with_different_left_index(self):
        # The custom index must have as many entries as the frame has rows
        # (3); a longer index would make the DataFrame constructor raise the
        # ValueError and merge() would never be reached.
        df = DataFrame({
            'a': np.random.choice(['m', 'f'], size=3),
            'b': np.random.randn(3)
        }, index=tm.makeCustomIndex(3, 2))
        df2 = DataFrame({
            'a': np.random.choice(['m', 'f'], size=10),
            'b': np.random.randn(10)
        })
        with tm.assertRaises(ValueError):
            merge(df, df2, right_on='b', left_index=True)

    def test_join_on_fails_with_different_column_counts(self):
        # left_on and right_on name a different number of keys.
        df = DataFrame({
            'a': np.random.choice(['m', 'f'], size=3),
            'b': np.random.randn(3)
        })
        df2 = DataFrame({
            'a': np.random.choice(['m', 'f'], size=10),
            'b': np.random.randn(10)
        }, index=tm.makeCustomIndex(10, 2))
        with tm.assertRaises(ValueError):
            merge(df, df2, right_on='a', left_on=['a', 'b'])

    def test_join_on_fails_with_wrong_object_type(self):
        # GH12081
        # The error message is expected to mention the offending type,
        # whichever side the bad object is passed on.
        wrongly_typed = [Series([0, 1]), 2, 'str', None, np.array([0, 1])]
        df = DataFrame({'a': [1, 1]})

        for obj in wrongly_typed:
            with tm.assertRaisesRegexp(ValueError, str(type(obj))):
                merge(obj, df, left_on='a', right_on='a')
            with tm.assertRaisesRegexp(ValueError, str(type(obj))):
                merge(df, obj, left_on='a', right_on='a')

    def test_join_on_pass_vector(self):
        # ``on`` may be the key values themselves rather than a column name.
        expected = self.target.join(self.source, on='C')
        del expected['C']

        join_col = self.target.pop('C')
        result = self.target.join(self.source, on=join_col)
        assert_frame_equal(result, expected)

    def test_join_with_len0(self):
        # nothing to merge
        merged = self.target.join(self.source.reindex([]), on='C')
        for col in self.source:
            self.assertIn(col, merged)
            self.assertTrue(merged[col].isnull().all())

        merged2 = self.target.join(self.source.reindex([]), on='C',
                                   how='inner')
        self.assert_index_equal(merged2.columns, merged.columns)
        self.assertEqual(len(merged2), 0)

    def test_join_on_inner(self):
        # An inner join must equal the default join with unmatched rows
        # removed.
        df = DataFrame({'key': ['a', 'a', 'd', 'b', 'b', 'c']})
        df2 = DataFrame({'value': [0, 1]}, index=['a', 'b'])

        joined = df.join(df2, on='key', how='inner')

        expected = df.join(df2, on='key')
        expected = expected[expected['value'].notnull()]
        self.assert_series_equal(joined['key'], expected['key'],
                                 check_dtype=False)
        self.assert_series_equal(joined['value'], expected['value'],
                                 check_dtype=False)
        self.assert_index_equal(joined.index, expected.index)

    def test_join_on_singlekey_list(self):
        df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']})
        df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c'])

        # corner cases
        joined = df.join(df2, on=['key'])
        expected = df.join(df2, on='key')

        assert_frame_equal(joined, expected)

    def test_join_on_series(self):
        # Joining a Series must match joining the one-column frame.
        result = self.target.join(self.source['MergedA'], on='C')
        expected = self.target.join(self.source[['MergedA']], on='C')
        assert_frame_equal(result, expected)

    def test_join_on_series_buglet(self):
        # GH #638
        df = DataFrame({'a': [1, 1]})
        ds = Series([2], index=[1], name='b')
        result = df.join(ds, on='a')
        expected = DataFrame({'a': [1, 1],
                              'b': [2, 2]}, index=df.index)
        tm.assert_frame_equal(result, expected)

    def test_join_index_mixed(self):
        # Index-on-index joins of mixed-dtype frames, checked against
        # _join_by_hand.
        df1 = DataFrame({'A': 1., 'B': 2, 'C': 'foo', 'D': True},
                        index=np.arange(10),
                        columns=['A', 'B', 'C', 'D'])
        self.assertEqual(df1['B'].dtype, np.int64)
        self.assertEqual(df1['D'].dtype, np.bool_)

        df2 = DataFrame({'A': 1., 'B': 2, 'C': 'foo', 'D': True},
                        index=np.arange(0, 10, 2),
                        columns=['A', 'B', 'C', 'D'])

        # overlap
        joined = df1.join(df2, lsuffix='_one', rsuffix='_two')
        expected_columns = ['A_one', 'B_one', 'C_one', 'D_one',
                            'A_two', 'B_two', 'C_two', 'D_two']
        df1.columns = expected_columns[:4]
        df2.columns = expected_columns[4:]
        expected = _join_by_hand(df1, df2)
        assert_frame_equal(joined, expected)

        # no overlapping blocks
        df1 = DataFrame(index=np.arange(10))
        df1['bool'] = True
        df1['string'] = 'foo'

        df2 = DataFrame(index=np.arange(5, 15))
        df2['int'] = 1
        df2['float'] = 1.

        for kind in ['inner', 'outer', 'left', 'right']:

            joined = df1.join(df2, how=kind)
            expected = _join_by_hand(df1, df2, how=kind)
            assert_frame_equal(joined, expected)

            joined = df2.join(df1, how=kind)
            expected = _join_by_hand(df2, df1, how=kind)
            assert_frame_equal(joined, expected)

    def test_join_empty_bug(self):
        # generated an exception in 0.4.3
        x = DataFrame()
        x.join(DataFrame([3], index=[0], columns=['A']), how='outer')

    def test_join_unconsolidated(self):
        # GH #331
        a = DataFrame(randn(30, 2), columns=['a', 'b'])
        c = Series(randn(30))
        a['c'] = c
        d = DataFrame(randn(30, 1), columns=['q'])

        # it works!
        a.join(d)
        d.join(a)

    def test_join_multiindex(self):
        # Outer join of two MultiIndex-ed frames, for both sort orders.
        index1 = MultiIndex.from_arrays([['a', 'a', 'a', 'b', 'b', 'b'],
                                         [1, 2, 3, 1, 2, 3]],
                                        names=['first', 'second'])

        index2 = MultiIndex.from_arrays([['b', 'b', 'b', 'c', 'c', 'c'],
                                         [1, 2, 3, 1, 2, 3]],
                                        names=['first', 'second'])

        df1 = DataFrame(data=np.random.randn(6), index=index1,
                        columns=['var X'])
        df2 = DataFrame(data=np.random.randn(6), index=index2,
                        columns=['var Y'])

        df1 = df1.sort_index(level=0)
        df2 = df2.sort_index(level=0)

        joined = df1.join(df2, how='outer')
        ex_index = index1._tuple_index.union(index2._tuple_index)
        expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
        expected.index.names = index1.names
        assert_frame_equal(joined, expected)
        self.assertEqual(joined.index.names, index1.names)

        df1 = df1.sort_index(level=1)
        df2 = df2.sort_index(level=1)

        joined = df1.join(df2, how='outer').sort_index(level=0)
        ex_index = index1._tuple_index.union(index2._tuple_index)
        expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
        expected.index.names = index1.names

        assert_frame_equal(joined, expected)
        self.assertEqual(joined.index.names, index1.names)

    def test_join_inner_multiindex(self):
        # Inner join of two key columns against a two-level MultiIndex,
        # cross-checked against the equivalent merge() calls.
        key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux',
                'qux', 'snap']
        key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two',
                'three', 'one']

        data = np.random.randn(len(key1))
        data = DataFrame({'key1': key1, 'key2': key2,
                          'data': data})

        index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                                   ['one', 'two', 'three']],
                           labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                                   [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                           names=['first', 'second'])
        to_join = DataFrame(np.random.randn(10, 3), index=index,
                            columns=['j_one', 'j_two', 'j_three'])

        joined = data.join(to_join, on=['key1', 'key2'], how='inner')
        expected = merge(data, to_join.reset_index(),
                         left_on=['key1', 'key2'],
                         right_on=['first', 'second'], how='inner',
                         sort=False)

        expected2 = merge(to_join, data,
                          right_on=['key1', 'key2'], left_index=True,
                          how='inner', sort=False)
        assert_frame_equal(joined, expected2.reindex_like(joined))

        expected = expected.drop(['first', 'second'], axis=1)
        expected.index = joined.index

        self.assertTrue(joined.index.is_monotonic)
        assert_frame_equal(joined, expected)

        # _assert_same_contents(expected, expected2.loc[:, expected.columns])

    def test_join_hierarchical_mixed(self):
        # GH 2024
        df = DataFrame([(1, 2, 3), (4, 5, 6)], columns=['a', 'b', 'c'])
        new_df = df.groupby(['a']).agg({'b': [np.mean, np.sum]})
        other_df = DataFrame(
            [(1, 2, 3), (7, 10, 6)], columns=['a', 'b', 'd'])
        other_df.set_index('a', inplace=True)
        # GH 9455, 12219
        with tm.assert_produces_warning(UserWarning):
            result = merge(new_df, other_df, left_index=True,
                           right_index=True)
        self.assertIn(('b', 'mean'), result)
        self.assertIn('b', result)

    def test_join_float64_float32(self):
        # Column dtypes must survive join/merge without upcasting.
        a = DataFrame(randn(10, 2), columns=['a', 'b'], dtype=np.float64)
        b = DataFrame(randn(10, 1), columns=['c'], dtype=np.float32)
        joined = a.join(b)
        self.assertEqual(joined.dtypes['a'], 'float64')
        self.assertEqual(joined.dtypes['b'], 'float64')
        self.assertEqual(joined.dtypes['c'], 'float32')

        a = np.random.randint(0, 5, 100).astype('int64')
        b = np.random.random(100).astype('float64')
        c = np.random.random(100).astype('float32')
        df = DataFrame({'a': a, 'b': b, 'c': c})
        xpdf = DataFrame({'a': a, 'b': b, 'c': c})
        s = DataFrame(np.random.random(5).astype('float32'),
                      columns=['md'])
        rs = df.merge(s, left_on='a', right_index=True)
        self.assertEqual(rs.dtypes['a'], 'int64')
        self.assertEqual(rs.dtypes['b'], 'float64')
        self.assertEqual(rs.dtypes['c'], 'float32')
        self.assertEqual(rs.dtypes['md'], 'float32')

        xp = xpdf.merge(s, left_on='a', right_index=True)
        assert_frame_equal(rs, xp)

    def test_join_many_non_unique_index(self):
        # Joining a list of frames on a non-unique index must agree with
        # the equivalent chain of merge() calls.
        df1 = DataFrame({"a": [1, 1], "b": [1, 1], "c": [10, 20]})
        df2 = DataFrame({"a": [1, 1], "b": [1, 2], "d": [100, 200]})
        df3 = DataFrame({"a": [1, 1], "b": [1, 2], "e": [1000, 2000]})
        idf1 = df1.set_index(["a", "b"])
        idf2 = df2.set_index(["a", "b"])
        idf3 = df3.set_index(["a", "b"])

        result = idf1.join([idf2, idf3], how='outer')

        df_partially_merged = merge(df1, df2, on=['a', 'b'], how='outer')
        expected = merge(df_partially_merged, df3, on=['a', 'b'],
                         how='outer')

        result = result.reset_index()
        expected = expected[result.columns]
        expected['a'] = expected.a.astype('int64')
        expected['b'] = expected.b.astype('int64')
        assert_frame_equal(result, expected)

        df1 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 1],
                         "c": [10, 20, 30]})
        df2 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2],
                         "d": [100, 200, 300]})
        df3 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2],
                         "e": [1000, 2000, 3000]})
        idf1 = df1.set_index(["a", "b"])
        idf2 = df2.set_index(["a", "b"])
        idf3 = df3.set_index(["a", "b"])
        result = idf1.join([idf2, idf3], how='inner')

        df_partially_merged = merge(df1, df2, on=['a', 'b'], how='inner')
        expected = merge(df_partially_merged, df3, on=['a', 'b'],
                         how='inner')

        result = result.reset_index()

        assert_frame_equal(result, expected.loc[:, result.columns])

        # GH 11519
        df = DataFrame({
            'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
            'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one',
                  'three'],
            'C': np.random.randn(8),
            'D': np.random.randn(8)
        })
        s = Series(np.repeat(np.arange(8), 2),
                   index=np.repeat(np.arange(8), 2), name='TEST')
        inner = df.join(s, how='inner')
        outer = df.join(s, how='outer')
        left = df.join(s, how='left')
        right = df.join(s, how='right')
        assert_frame_equal(inner, outer)
        assert_frame_equal(inner, left)
        assert_frame_equal(inner, right)

    def test_join_sort(self):
        # sort=True orders the result by the join key.
        left = DataFrame({
            'key': ['foo', 'bar', 'baz', 'foo'],
            'value': [1, 2, 3, 4]
        })
        right = DataFrame({'value2': ['a', 'b', 'c']},
                          index=['bar', 'baz', 'foo'])

        joined = left.join(right, on='key', sort=True)
        expected = DataFrame({
            'key': ['bar', 'baz', 'foo', 'foo'],
            'value': [2, 3, 1, 4],
            'value2': ['a', 'b', 'c', 'c']
        }, index=[1, 2, 0, 3])
        assert_frame_equal(joined, expected)

        # smoke test
        joined = left.join(right, on='key', sort=False)
        self.assert_index_equal(joined.index, pd.Index(lrange(4)))

    def test_join_mixed_non_unique_index(self):
        # GH 12814, unorderable types in py3 with a non-unique index
        df1 = DataFrame({'a': [1, 2, 3, 4]}, index=[1, 2, 3, 'a'])
        df2 = DataFrame({'b': [5, 6, 7, 8]}, index=[1, 3, 3, 4])
        result = df1.join(df2)
        expected = DataFrame({
            'a': [1, 2, 3, 3, 4],
            'b': [5, np.nan, 6, 7, np.nan]
        }, index=[1, 2, 3, 3, 'a'])
        tm.assert_frame_equal(result, expected)

        df3 = DataFrame({'a': [1, 2, 3, 4]}, index=[1, 2, 2, 'a'])
        df4 = DataFrame({'b': [5, 6, 7, 8]}, index=[1, 2, 3, 4])
        result = df3.join(df4)
        expected = DataFrame({
            'a': [1, 2, 3, 4],
            'b': [5, 6, 6, np.nan]
        }, index=[1, 2, 2, 'a'])
        tm.assert_frame_equal(result, expected)

    def test_mixed_type_join_with_suffix(self):
        # GH #916
        df = DataFrame(np.random.randn(20, 6),
                       columns=['a', 'b', 'c', 'd', 'e', 'f'])
        df.insert(0, 'id', 0)
        df.insert(5, 'dt', 'foo')

        grouped = df.groupby('id')
        mn = grouped.mean()
        cn = grouped.count()

        # it works!
        mn.join(cn, rsuffix='_right')

    def test_join_many(self):
        # Joining a list of frames, for each supported ``how``.
        df = DataFrame(np.random.randn(10, 6), columns=list('abcdef'))
        df_list = [df[['a', 'b']], df[['c', 'd']], df[['e', 'f']]]

        joined = df_list[0].join(df_list[1:])
        tm.assert_frame_equal(joined, df)

        df_list = [df[['a', 'b']][:-2],
                   df[['c', 'd']][2:],
                   df[['e', 'f']][1:9]]

        def _check_diff_index(df_list, result, exp_index):
            # Compare against reindexing every piece to exp_index first.
            reindexed = [x.reindex(exp_index) for x in df_list]
            expected = reindexed[0].join(reindexed[1:])
            tm.assert_frame_equal(result, expected)

        # different join types
        joined = df_list[0].join(df_list[1:], how='outer')
        _check_diff_index(df_list, joined, df.index)

        joined = df_list[0].join(df_list[1:])
        _check_diff_index(df_list, joined, df_list[0].index)

        joined = df_list[0].join(df_list[1:], how='inner')
        _check_diff_index(df_list, joined, df.index[2:8])

        self.assertRaises(ValueError, df_list[0].join, df_list[1:],
                          on='a')

    def test_join_many_mixed(self):
        # Rejoining column subsets (including an object column) must
        # reproduce the original frame.
        df = DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D'])
        df['key'] = ['foo', 'bar'] * 4
        df1 = df.loc[:, ['A', 'B']]
        df2 = df.loc[:, ['C', 'D']]
        df3 = df.loc[:, ['key']]

        result = df1.join([df2, df3])
        assert_frame_equal(result, df)

    def test_join_dups(self):

        # joining dups
        df = concat([
            DataFrame(np.random.randn(10, 4),
                      columns=['A', 'A', 'B', 'B']),
            DataFrame(np.random.randint(0, 10, size=20).reshape(10, 2),
                      columns=['A', 'C'])
        ], axis=1)

        expected = concat([df, df], axis=1)
        result = df.join(df, rsuffix='_2')
        result.columns = expected.columns
        assert_frame_equal(result, expected)

        # GH 4975, invalid join on dups
        w = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
        x = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
        y = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
        z = DataFrame(np.random.randn(4, 2), columns=["x", "y"])

        dta = x.merge(y, left_index=True, right_index=True).merge(
            z, left_index=True, right_index=True, how="outer")
        dta = dta.merge(w, left_index=True, right_index=True)
        expected = concat([x, y, z, w], axis=1)
        expected.columns = ['x_x', 'y_x', 'x_y',
                            'y_y', 'x_x', 'y_x', 'x_y', 'y_y']
        assert_frame_equal(dta, expected)

    def test_panel_join(self):
        # Panel.join for each join type on partially overlapping slices.
        panel = tm.makePanel()
        tm.add_nans(panel)

        p1 = panel.iloc[:2, :10, :3]
        p2 = panel.iloc[2:, 5:, 2:]

        # left join
        result = p1.join(p2)
        expected = p1.copy()
        expected['ItemC'] = p2['ItemC']
        tm.assert_panel_equal(result, expected)

        # right join
        result = p1.join(p2, how='right')
        expected = p2.copy()
        expected['ItemA'] = p1['ItemA']
        expected['ItemB'] = p1['ItemB']
        expected = expected.reindex(items=['ItemA', 'ItemB', 'ItemC'])
        tm.assert_panel_equal(result, expected)

        # inner join
        result = p1.join(p2, how='inner')
        expected = panel.iloc[:, 5:10, 2:3]
        tm.assert_panel_equal(result, expected)

        # outer join
        result = p1.join(p2, how='outer')
        expected = p1.reindex(major=panel.major_axis,
                              minor=panel.minor_axis)
        expected = expected.join(p2.reindex(major=panel.major_axis,
                                            minor=panel.minor_axis))
        tm.assert_panel_equal(result, expected)

    def test_panel_join_overlap(self):
        panel = tm.makePanel()
        tm.add_nans(panel)

        p1 = panel.loc[['ItemA', 'ItemB', 'ItemC']]
        p2 = panel.loc[['ItemB', 'ItemC']]

        # Expected index is
        #
        # ItemA, ItemB_p1, ItemC_p1, ItemB_p2, ItemC_p2
        joined = p1.join(p2, lsuffix='_p1', rsuffix='_p2')
        p1_suf = p1.loc[['ItemB', 'ItemC']].add_suffix('_p1')
        p2_suf = p2.loc[['ItemB', 'ItemC']].add_suffix('_p2')
        no_overlap = panel.loc[['ItemA']]
        expected = no_overlap.join(p1_suf.join(p2_suf))
        tm.assert_panel_equal(joined, expected)

    def test_panel_join_many(self):
        # tm.K is module-level state read by tm.makePanel(); reset it even
        # if makePanel() raises so later tests are not affected.
        tm.K = 10
        try:
            panel = tm.makePanel()
        finally:
            tm.K = 4

        panels = [panel.iloc[:2], panel.iloc[2:6], panel.iloc[6:]]

        joined = panels[0].join(panels[1:])
        tm.assert_panel_equal(joined, panel)

        panels = [panel.iloc[:2, :-5],
                  panel.iloc[2:6, 2:],
                  panel.iloc[6:, 5:-7]]

        data_dict = {}
        for p in panels:
            data_dict.update(p.iteritems())

        joined = panels[0].join(panels[1:], how='inner')
        expected = pd.Panel.from_dict(data_dict, intersect=True)
        tm.assert_panel_equal(joined, expected)

        joined = panels[0].join(panels[1:], how='outer')
        expected = pd.Panel.from_dict(data_dict, intersect=False)
        tm.assert_panel_equal(joined, expected)

        # edge cases
        self.assertRaises(ValueError, panels[0].join, panels[1:],
                          how='outer', lsuffix='foo', rsuffix='bar')
        self.assertRaises(ValueError, panels[0].join, panels[1:],
                          how='right')
# Turn the positional index of the yfinance timings into a 1-based
# "number of stocks" column.
runtime_yf.reset_index(inplace=True)
runtime_yf = runtime_yf.rename(columns={'index': 'number of stocks'})
runtime_yf['number of stocks'] += 1

# Runtimes using csv files.
# Each file is read from the relative path "data/<name>.csv", so the
# "data" directory must be reachable from the current working directory.
results = []
for j, i in enumerate(djia):
    startTime = perf_counter()
    filename = "data/" + i + ".csv"
    df = pd.read_csv(filename, encoding='utf-8')
    endTime = perf_counter()
    csv = (endTime - startTime)
    # Accumulate so that results[j] is the total time needed to load the
    # first j + 1 files.  The guard has to be ``j > 0``: with ``j > 1`` the
    # first file's time was never carried into the running total.
    if j > 0:
        csv = csv + results[j - 1]
    results.append(csv)

runtime_csv = DataFrame(results, columns=['runtime'])
runtime_csv.reset_index(inplace=True)
runtime_csv = runtime_csv.rename(columns={'index': 'number of stocks'})
runtime_csv['number of stocks'] += 1

# Combine both timing tables side by side; overlapping column names get
# the '_yf' / '_csv' suffixes.
runtimes = runtime_yf.join(runtime_csv, lsuffix='_yf', rsuffix='_csv')
runtimes = runtimes.rename(
    columns={'number of stocks_yf': 'number of stocks'})
# drop() returns a new frame, so the result must be assigned back;
# otherwise the duplicate 'number of stocks_csv' column is kept.
runtimes = runtimes.drop(columns=['number of stocks_csv'])
invalid_times = ['09:31:00', '09:32:00', '09:33:00', '09:34:00'] for i in range(len(gdata)): if str(gdata.index[i])[-8:] in invalid_times: print "Dropping row at index " + str( gdata.index[i]) + ' at ' + time.ctime() gdata.drop(gdata.index[i], inplace=True) ''' Index and join generated image data to clean financial data =========================================================== After getting correct DTI in place, inner join the two DFs on the index ''' test = clean.join(gdata, how='inner') ''' Generate target data for model training ======================================= NB: targets being generated from forward data means we will lose a few train / test examples on the near-term end of the time series ''' # stupidly simple binary loop; flexible to whatever is specified in mins_ahead: ahead = [] for i in range(len(clean) - max(mins_ahead)): current_row = [ 1 if clean.iloc[i + mins_ahead[j], 0] > clean.iloc[i, 0] else 0 for j in range(len(mins_ahead))
class TestJoin: def setup_method(self, method): # aggregate multiple columns self.df = DataFrame({ "key1": get_test_data(), "key2": get_test_data(), "data1": np.random.randn(N), "data2": np.random.randn(N), }) # exclude a couple keys for fun self.df = self.df[self.df["key2"] > 1] self.df2 = DataFrame({ "key1": get_test_data(n=N // 5), "key2": get_test_data(ngroups=NGROUPS // 2, n=N // 5), "value": np.random.randn(N // 5), }) index, data = tm.getMixedTypeDict() self.target = DataFrame(data, index=index) # Join on string value self.source = DataFrame({ "MergedA": data["A"], "MergedD": data["D"] }, index=data["C"]) def test_left_outer_join(self): joined_key2 = merge(self.df, self.df2, on="key2") _check_join(self.df, self.df2, joined_key2, ["key2"], how="left") joined_both = merge(self.df, self.df2) _check_join(self.df, self.df2, joined_both, ["key1", "key2"], how="left") def test_right_outer_join(self): joined_key2 = merge(self.df, self.df2, on="key2", how="right") _check_join(self.df, self.df2, joined_key2, ["key2"], how="right") joined_both = merge(self.df, self.df2, how="right") _check_join(self.df, self.df2, joined_both, ["key1", "key2"], how="right") def test_full_outer_join(self): joined_key2 = merge(self.df, self.df2, on="key2", how="outer") _check_join(self.df, self.df2, joined_key2, ["key2"], how="outer") joined_both = merge(self.df, self.df2, how="outer") _check_join(self.df, self.df2, joined_both, ["key1", "key2"], how="outer") def test_inner_join(self): joined_key2 = merge(self.df, self.df2, on="key2", how="inner") _check_join(self.df, self.df2, joined_key2, ["key2"], how="inner") joined_both = merge(self.df, self.df2, how="inner") _check_join(self.df, self.df2, joined_both, ["key1", "key2"], how="inner") def test_handle_overlap(self): joined = merge(self.df, self.df2, on="key2", suffixes=(".foo", ".bar")) assert "key1.foo" in joined assert "key1.bar" in joined def test_handle_overlap_arbitrary_key(self): joined = merge( self.df, self.df2, left_on="key2", 
right_on="key1", suffixes=(".foo", ".bar"), ) assert "key1.foo" in joined assert "key2.bar" in joined def test_join_on(self): target = self.target source = self.source merged = target.join(source, on="C") tm.assert_series_equal(merged["MergedA"], target["A"], check_names=False) tm.assert_series_equal(merged["MergedD"], target["D"], check_names=False) # join with duplicates (fix regression from DataFrame/Matrix merge) df = DataFrame({"key": ["a", "a", "b", "b", "c"]}) df2 = DataFrame({"value": [0, 1, 2]}, index=["a", "b", "c"]) joined = df.join(df2, on="key") expected = DataFrame({ "key": ["a", "a", "b", "b", "c"], "value": [0, 0, 1, 1, 2] }) tm.assert_frame_equal(joined, expected) # Test when some are missing df_a = DataFrame([[1], [2], [3]], index=["a", "b", "c"], columns=["one"]) df_b = DataFrame([["foo"], ["bar"]], index=[1, 2], columns=["two"]) df_c = DataFrame([[1], [2]], index=[1, 2], columns=["three"]) joined = df_a.join(df_b, on="one") joined = joined.join(df_c, on="one") assert np.isnan(joined["two"]["c"]) assert np.isnan(joined["three"]["c"]) # merge column not p resent with pytest.raises(KeyError, match="^'E'$"): target.join(source, on="E") # overlap source_copy = source.copy() source_copy["A"] = 0 msg = ("You are trying to merge on float64 and object columns. 
If " "you wish to proceed you should use pd.concat") with pytest.raises(ValueError, match=msg): target.join(source_copy, on="A") def test_join_on_fails_with_different_right_index(self): df = DataFrame({ "a": np.random.choice(["m", "f"], size=3), "b": np.random.randn(3) }) df2 = DataFrame( { "a": np.random.choice(["m", "f"], size=10), "b": np.random.randn(10) }, index=tm.makeCustomIndex(10, 2), ) msg = r'len\(left_on\) must equal the number of levels in the index of "right"' with pytest.raises(ValueError, match=msg): merge(df, df2, left_on="a", right_index=True) def test_join_on_fails_with_different_left_index(self): df = DataFrame( { "a": np.random.choice(["m", "f"], size=3), "b": np.random.randn(3) }, index=tm.makeCustomIndex(3, 2), ) df2 = DataFrame({ "a": np.random.choice(["m", "f"], size=10), "b": np.random.randn(10) }) msg = r'len\(right_on\) must equal the number of levels in the index of "left"' with pytest.raises(ValueError, match=msg): merge(df, df2, right_on="b", left_index=True) def test_join_on_fails_with_different_column_counts(self): df = DataFrame({ "a": np.random.choice(["m", "f"], size=3), "b": np.random.randn(3) }) df2 = DataFrame( { "a": np.random.choice(["m", "f"], size=10), "b": np.random.randn(10) }, index=tm.makeCustomIndex(10, 2), ) msg = r"len\(right_on\) must equal len\(left_on\)" with pytest.raises(ValueError, match=msg): merge(df, df2, right_on="a", left_on=["a", "b"]) @pytest.mark.parametrize("wrong_type", [2, "str", None, np.array([0, 1])]) def test_join_on_fails_with_wrong_object_type(self, wrong_type): # GH12081 - original issue # GH21220 - merging of Series and DataFrame is now allowed # Edited test to remove the Series object from test parameters df = DataFrame({"a": [1, 1]}) msg = ("Can only merge Series or DataFrame objects, " f"a {type(wrong_type)} was passed") with pytest.raises(TypeError, match=msg): merge(wrong_type, df, left_on="a", right_on="a") with pytest.raises(TypeError, match=msg): merge(df, wrong_type, left_on="a", 
right_on="a") def test_join_on_pass_vector(self): expected = self.target.join(self.source, on="C") del expected["C"] join_col = self.target.pop("C") result = self.target.join(self.source, on=join_col) tm.assert_frame_equal(result, expected) def test_join_with_len0(self): # nothing to merge merged = self.target.join(self.source.reindex([]), on="C") for col in self.source: assert col in merged assert merged[col].isna().all() merged2 = self.target.join(self.source.reindex([]), on="C", how="inner") tm.assert_index_equal(merged2.columns, merged.columns) assert len(merged2) == 0 def test_join_on_inner(self): df = DataFrame({"key": ["a", "a", "d", "b", "b", "c"]}) df2 = DataFrame({"value": [0, 1]}, index=["a", "b"]) joined = df.join(df2, on="key", how="inner") expected = df.join(df2, on="key") expected = expected[expected["value"].notna()] tm.assert_series_equal(joined["key"], expected["key"]) tm.assert_series_equal(joined["value"], expected["value"], check_dtype=False) tm.assert_index_equal(joined.index, expected.index) def test_join_on_singlekey_list(self): df = DataFrame({"key": ["a", "a", "b", "b", "c"]}) df2 = DataFrame({"value": [0, 1, 2]}, index=["a", "b", "c"]) # corner cases joined = df.join(df2, on=["key"]) expected = df.join(df2, on="key") tm.assert_frame_equal(joined, expected) def test_join_on_series(self): result = self.target.join(self.source["MergedA"], on="C") expected = self.target.join(self.source[["MergedA"]], on="C") tm.assert_frame_equal(result, expected) def test_join_on_series_buglet(self): # GH #638 df = DataFrame({"a": [1, 1]}) ds = Series([2], index=[1], name="b") result = df.join(ds, on="a") expected = DataFrame({"a": [1, 1], "b": [2, 2]}, index=df.index) tm.assert_frame_equal(result, expected) def test_join_index_mixed(self, join_type): # no overlapping blocks df1 = DataFrame(index=np.arange(10)) df1["bool"] = True df1["string"] = "foo" df2 = DataFrame(index=np.arange(5, 15)) df2["int"] = 1 df2["float"] = 1.0 joined = df1.join(df2, 
how=join_type) expected = _join_by_hand(df1, df2, how=join_type) tm.assert_frame_equal(joined, expected) joined = df2.join(df1, how=join_type) expected = _join_by_hand(df2, df1, how=join_type) tm.assert_frame_equal(joined, expected) def test_join_index_mixed_overlap(self): df1 = DataFrame( { "A": 1.0, "B": 2, "C": "foo", "D": True }, index=np.arange(10), columns=["A", "B", "C", "D"], ) assert df1["B"].dtype == np.int64 assert df1["D"].dtype == np.bool_ df2 = DataFrame( { "A": 1.0, "B": 2, "C": "foo", "D": True }, index=np.arange(0, 10, 2), columns=["A", "B", "C", "D"], ) # overlap joined = df1.join(df2, lsuffix="_one", rsuffix="_two") expected_columns = [ "A_one", "B_one", "C_one", "D_one", "A_two", "B_two", "C_two", "D_two", ] df1.columns = expected_columns[:4] df2.columns = expected_columns[4:] expected = _join_by_hand(df1, df2) tm.assert_frame_equal(joined, expected) def test_join_empty_bug(self): # generated an exception in 0.4.3 x = DataFrame() x.join(DataFrame([3], index=[0], columns=["A"]), how="outer") def test_join_unconsolidated(self): # GH #331 a = DataFrame(np.random.randn(30, 2), columns=["a", "b"]) c = Series(np.random.randn(30)) a["c"] = c d = DataFrame(np.random.randn(30, 1), columns=["q"]) # it works! 
a.join(d) d.join(a) def test_join_multiindex(self): index1 = MultiIndex.from_arrays( [["a", "a", "a", "b", "b", "b"], [1, 2, 3, 1, 2, 3]], names=["first", "second"], ) index2 = MultiIndex.from_arrays( [["b", "b", "b", "c", "c", "c"], [1, 2, 3, 1, 2, 3]], names=["first", "second"], ) df1 = DataFrame(data=np.random.randn(6), index=index1, columns=["var X"]) df2 = DataFrame(data=np.random.randn(6), index=index2, columns=["var Y"]) df1 = df1.sort_index(level=0) df2 = df2.sort_index(level=0) joined = df1.join(df2, how="outer") ex_index = Index(index1.values).union(Index(index2.values)) expected = df1.reindex(ex_index).join(df2.reindex(ex_index)) expected.index.names = index1.names tm.assert_frame_equal(joined, expected) assert joined.index.names == index1.names df1 = df1.sort_index(level=1) df2 = df2.sort_index(level=1) joined = df1.join(df2, how="outer").sort_index(level=0) ex_index = Index(index1.values).union(Index(index2.values)) expected = df1.reindex(ex_index).join(df2.reindex(ex_index)) expected.index.names = index1.names tm.assert_frame_equal(joined, expected) assert joined.index.names == index1.names def test_join_inner_multiindex(self): key1 = [ "bar", "bar", "bar", "foo", "foo", "baz", "baz", "qux", "qux", "snap" ] key2 = [ "two", "one", "three", "one", "two", "one", "two", "two", "three", "one", ] data = np.random.randn(len(key1)) data = DataFrame({"key1": key1, "key2": key2, "data": data}) index = MultiIndex( levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], names=["first", "second"], ) to_join = DataFrame(np.random.randn(10, 3), index=index, columns=["j_one", "j_two", "j_three"]) joined = data.join(to_join, on=["key1", "key2"], how="inner") expected = merge( data, to_join.reset_index(), left_on=["key1", "key2"], right_on=["first", "second"], how="inner", sort=False, ) expected2 = merge( to_join, data, right_on=["key1", "key2"], left_index=True, how="inner", sort=False, ) 
tm.assert_frame_equal(joined, expected2.reindex_like(joined)) expected2 = merge( to_join, data, right_on=["key1", "key2"], left_index=True, how="inner", sort=False, ) expected = expected.drop(["first", "second"], axis=1) expected.index = joined.index assert joined.index.is_monotonic tm.assert_frame_equal(joined, expected) # _assert_same_contents(expected, expected2.loc[:, expected.columns]) def test_join_hierarchical_mixed(self): # GH 2024 df = DataFrame([(1, 2, 3), (4, 5, 6)], columns=["a", "b", "c"]) new_df = df.groupby(["a"]).agg({"b": [np.mean, np.sum]}) other_df = DataFrame([(1, 2, 3), (7, 10, 6)], columns=["a", "b", "d"]) other_df.set_index("a", inplace=True) # GH 9455, 12219 with tm.assert_produces_warning(FutureWarning): result = merge(new_df, other_df, left_index=True, right_index=True) assert ("b", "mean") in result assert "b" in result def test_join_float64_float32(self): a = DataFrame(np.random.randn(10, 2), columns=["a", "b"], dtype=np.float64) b = DataFrame(np.random.randn(10, 1), columns=["c"], dtype=np.float32) joined = a.join(b) assert joined.dtypes["a"] == "float64" assert joined.dtypes["b"] == "float64" assert joined.dtypes["c"] == "float32" a = np.random.randint(0, 5, 100).astype("int64") b = np.random.random(100).astype("float64") c = np.random.random(100).astype("float32") df = DataFrame({"a": a, "b": b, "c": c}) xpdf = DataFrame({"a": a, "b": b, "c": c}) s = DataFrame(np.random.random(5).astype("float32"), columns=["md"]) rs = df.merge(s, left_on="a", right_index=True) assert rs.dtypes["a"] == "int64" assert rs.dtypes["b"] == "float64" assert rs.dtypes["c"] == "float32" assert rs.dtypes["md"] == "float32" xp = xpdf.merge(s, left_on="a", right_index=True) tm.assert_frame_equal(rs, xp) def test_join_many_non_unique_index(self): df1 = DataFrame({"a": [1, 1], "b": [1, 1], "c": [10, 20]}) df2 = DataFrame({"a": [1, 1], "b": [1, 2], "d": [100, 200]}) df3 = DataFrame({"a": [1, 1], "b": [1, 2], "e": [1000, 2000]}) idf1 = df1.set_index(["a", "b"]) idf2 
= df2.set_index(["a", "b"]) idf3 = df3.set_index(["a", "b"]) result = idf1.join([idf2, idf3], how="outer") df_partially_merged = merge(df1, df2, on=["a", "b"], how="outer") expected = merge(df_partially_merged, df3, on=["a", "b"], how="outer") result = result.reset_index() expected = expected[result.columns] expected["a"] = expected.a.astype("int64") expected["b"] = expected.b.astype("int64") tm.assert_frame_equal(result, expected) df1 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 1], "c": [10, 20, 30]}) df2 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "d": [100, 200, 300]}) df3 = DataFrame({ "a": [1, 1, 1], "b": [1, 1, 2], "e": [1000, 2000, 3000] }) idf1 = df1.set_index(["a", "b"]) idf2 = df2.set_index(["a", "b"]) idf3 = df3.set_index(["a", "b"]) result = idf1.join([idf2, idf3], how="inner") df_partially_merged = merge(df1, df2, on=["a", "b"], how="inner") expected = merge(df_partially_merged, df3, on=["a", "b"], how="inner") result = result.reset_index() tm.assert_frame_equal(result, expected.loc[:, result.columns]) # GH 11519 df = DataFrame({ "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], "B": ["one", "one", "two", "three", "two", "two", "one", "three"], "C": np.random.randn(8), "D": np.random.randn(8), }) s = Series(np.repeat(np.arange(8), 2), index=np.repeat(np.arange(8), 2), name="TEST") inner = df.join(s, how="inner") outer = df.join(s, how="outer") left = df.join(s, how="left") right = df.join(s, how="right") tm.assert_frame_equal(inner, outer) tm.assert_frame_equal(inner, left) tm.assert_frame_equal(inner, right) def test_join_sort(self): left = DataFrame({ "key": ["foo", "bar", "baz", "foo"], "value": [1, 2, 3, 4] }) right = DataFrame({"value2": ["a", "b", "c"]}, index=["bar", "baz", "foo"]) joined = left.join(right, on="key", sort=True) expected = DataFrame( { "key": ["bar", "baz", "foo", "foo"], "value": [2, 3, 1, 4], "value2": ["a", "b", "c", "c"], }, index=[1, 2, 0, 3], ) tm.assert_frame_equal(joined, expected) # smoke test joined = 
left.join(right, on="key", sort=False) tm.assert_index_equal(joined.index, Index(range(4)), exact=True) def test_join_mixed_non_unique_index(self): # GH 12814, unorderable types in py3 with a non-unique index df1 = DataFrame({"a": [1, 2, 3, 4]}, index=[1, 2, 3, "a"]) df2 = DataFrame({"b": [5, 6, 7, 8]}, index=[1, 3, 3, 4]) result = df1.join(df2) expected = DataFrame( { "a": [1, 2, 3, 3, 4], "b": [5, np.nan, 6, 7, np.nan] }, index=[1, 2, 3, 3, "a"], ) tm.assert_frame_equal(result, expected) df3 = DataFrame({"a": [1, 2, 3, 4]}, index=[1, 2, 2, "a"]) df4 = DataFrame({"b": [5, 6, 7, 8]}, index=[1, 2, 3, 4]) result = df3.join(df4) expected = DataFrame({ "a": [1, 2, 3, 4], "b": [5, 6, 6, np.nan] }, index=[1, 2, 2, "a"]) tm.assert_frame_equal(result, expected) def test_join_non_unique_period_index(self): # GH #16871 index = pd.period_range("2016-01-01", periods=16, freq="M") df = DataFrame(list(range(len(index))), index=index, columns=["pnum"]) df2 = concat([df, df]) result = df.join(df2, how="inner", rsuffix="_df2") expected = DataFrame( np.tile(np.arange(16, dtype=np.int64).repeat(2).reshape(-1, 1), 2), columns=["pnum", "pnum_df2"], index=df2.sort_index().index, ) tm.assert_frame_equal(result, expected) def test_mixed_type_join_with_suffix(self): # GH #916 df = DataFrame(np.random.randn(20, 6), columns=["a", "b", "c", "d", "e", "f"]) df.insert(0, "id", 0) df.insert(5, "dt", "foo") grouped = df.groupby("id") mn = grouped.mean() cn = grouped.count() # it works! 
mn.join(cn, rsuffix="_right") def test_join_many(self): df = DataFrame(np.random.randn(10, 6), columns=list("abcdef")) df_list = [df[["a", "b"]], df[["c", "d"]], df[["e", "f"]]] joined = df_list[0].join(df_list[1:]) tm.assert_frame_equal(joined, df) df_list = [ df[["a", "b"]][:-2], df[["c", "d"]][2:], df[["e", "f"]][1:9] ] def _check_diff_index(df_list, result, exp_index): reindexed = [x.reindex(exp_index) for x in df_list] expected = reindexed[0].join(reindexed[1:]) tm.assert_frame_equal(result, expected) # different join types joined = df_list[0].join(df_list[1:], how="outer") _check_diff_index(df_list, joined, df.index) joined = df_list[0].join(df_list[1:]) _check_diff_index(df_list, joined, df_list[0].index) joined = df_list[0].join(df_list[1:], how="inner") _check_diff_index(df_list, joined, df.index[2:8]) msg = "Joining multiple DataFrames only supported for joining on index" with pytest.raises(ValueError, match=msg): df_list[0].join(df_list[1:], on="a") def test_join_many_mixed(self): df = DataFrame(np.random.randn(8, 4), columns=["A", "B", "C", "D"]) df["key"] = ["foo", "bar"] * 4 df1 = df.loc[:, ["A", "B"]] df2 = df.loc[:, ["C", "D"]] df3 = df.loc[:, ["key"]] result = df1.join([df2, df3]) tm.assert_frame_equal(result, df) def test_join_dups(self): # joining dups df = concat( [ DataFrame(np.random.randn(10, 4), columns=["A", "A", "B", "B" ]), DataFrame(np.random.randint(0, 10, size=20).reshape(10, 2), columns=["A", "C"]), ], axis=1, ) expected = concat([df, df], axis=1) result = df.join(df, rsuffix="_2") result.columns = expected.columns tm.assert_frame_equal(result, expected) # GH 4975, invalid join on dups w = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) x = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) y = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) z = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) dta = x.merge(y, left_index=True, right_index=True).merge(z, left_index=True, right_index=True, how="outer") with 
tm.assert_produces_warning(FutureWarning, check_stacklevel=False): dta = dta.merge(w, left_index=True, right_index=True) expected = concat([x, y, z, w], axis=1) expected.columns = [ "x_x", "y_x", "x_y", "y_y", "x_x", "y_x", "x_y", "y_y" ] tm.assert_frame_equal(dta, expected) def test_join_multi_to_multi(self, join_type): # GH 20475 leftindex = MultiIndex.from_product( [list("abc"), list("xy"), [1, 2]], names=["abc", "xy", "num"]) left = DataFrame({"v1": range(12)}, index=leftindex) rightindex = MultiIndex.from_product( [list("abc"), list("xy")], names=["abc", "xy"]) right = DataFrame({"v2": [100 * i for i in range(1, 7)]}, index=rightindex) result = left.join(right, on=["abc", "xy"], how=join_type) expected = (left.reset_index().merge(right.reset_index(), on=["abc", "xy"], how=join_type).set_index( ["abc", "xy", "num"])) tm.assert_frame_equal(expected, result) msg = r'len\(left_on\) must equal the number of levels in the index of "right"' with pytest.raises(ValueError, match=msg): left.join(right, on="xy", how=join_type) with pytest.raises(ValueError, match=msg): right.join(left, on=["abc", "xy"], how=join_type) def test_join_on_tz_aware_datetimeindex(self): # GH 23931, 26335 df1 = DataFrame({ "date": pd.date_range(start="2018-01-01", periods=5, tz="America/Chicago"), "vals": list("abcde"), }) df2 = DataFrame({ "date": pd.date_range(start="2018-01-03", periods=5, tz="America/Chicago"), "vals_2": list("tuvwx"), }) result = df1.join(df2.set_index("date"), on="date") expected = df1.copy() expected["vals_2"] = Series([np.nan] * 2 + list("tuv"), dtype=object) tm.assert_frame_equal(result, expected) def test_join_datetime_string(self): # GH 5647 dfa = DataFrame( [ ["2012-08-02", "L", 10], ["2012-08-02", "J", 15], ["2013-04-06", "L", 20], ["2013-04-06", "J", 25], ], columns=["x", "y", "a"], ) dfa["x"] = pd.to_datetime(dfa["x"]) dfb = DataFrame( [["2012-08-02", "J", 1], ["2013-04-06", "L", 2]], columns=["x", "y", "z"], index=[2, 4], ) dfb["x"] = pd.to_datetime(dfb["x"]) 
result = dfb.join(dfa.set_index(["x", "y"]), on=["x", "y"]) expected = DataFrame( [ [Timestamp("2012-08-02 00:00:00"), "J", 1, 15], [Timestamp("2013-04-06 00:00:00"), "L", 2, 20], ], index=[2, 4], columns=["x", "y", "z", "a"], ) tm.assert_frame_equal(result, expected)
df = read_csv("Phoneix_Finalclean.csv") aa=df.Conditions_Name.value_counts() ax=aa.plot(x='Conditions_Name', y='Amount',kind='bar',color="blue", figsize=(15,8),fontsize=16) #plt.title("Twelve years' weather condition summary",size=30) #ax.set_title("2004-2016 Phoenix weather condition summary",size=30) ax.set_xlabel('Weather Condition',size=20) ax.set_ylabel('Total Amount/hour',size=20) plt.show() scaler = MinMaxScaler(feature_range=(0, 1)) scaled_d =df.loc[:,["Sea_Level_PressureIn_N","Humidity_N","Dew_PointF_N","Wind_Speed_mps","Temperature_C_N"]] scaled = scaler.fit_transform(scaled_d) scaled = DataFrame(scaled) scaled.columns = ["Sea_Level_PressureIn_N","Humidity_N","Dew_PointF_N","Wind_Speed_mps","Temperature_C_N"] x = scaled.join(df.loc[:,["Hour","Conditions_Name"]]) x = x.loc[x['Conditions_Name'].isin(["Clear","Mostly Cloudy","Partly Cloudy","Scattered Clouds",'Overcast'])] x = x.dropna() x.isnull().sum() count = x.Conditions_Name.value_counts() print(count) def encode(data, col, max_val): data[col + '_sin'] = np.sin(2 * np.pi * data[col]/max_val) data[col + '_cos'] = np.cos(2 * np.pi * data[col]/max_val) return data x = encode(x, 'Hour', 23) x=x.drop(["Hour"],axis=1) ax = x.plot.scatter('Hour_sin', 'Hour_cos').set_aspect('equal') """ def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
def train_and_eval_classifier(dataframe: pd.DataFrame, label_name: str,
                              train_fraction: float, model_name: str,
                              seed: int, verbose: int = 1,
                              n_jobs: int = 4) -> Tuple[pd.DataFrame, Dict]:
    """
    Train and evaluate the classifier given dataset as a dataframe.

    The dataset is a design matrix: each row is one observation and the
    columns are the explanatory variables plus the label column. The label
    is one-hot encoded, a stratified train split is sampled per class, and
    the model hyper-parameters are selected by a leave-one-out grid search
    scored with accuracy.

    :param dataframe: Data collected for the classification problem.
    :param label_name: Name of the label i.e., the dependent variable.
    :param train_fraction: Fraction of samples taken from each class for the
        stratified training split (rounded up per class).
    :param model_name: Name of the utilized model (key of ``ML_MODELS`` and
        ``ML_MODELS_GRID``).
    :param seed: Seed used for reproduction of the experiment results.
    :param verbose: Verbosity mode.
    :param n_jobs: Number of jobs utilized for the parallel computing.
    :return: Tuple of the report over the test set as a dataframe (confusion
        matrix plus overall and balanced accuracy in its first row) and the
        best parameters found as a dictionary.
    """
    # Take the class columns from the encoder output itself. Matching on a
    # 'class' name prefix would also pick up unrelated feature columns, or
    # the label column when its own name starts with 'class'.
    one_hot = pd.get_dummies(dataframe[label_name], prefix='class')
    class_columns = list(one_hot.columns)
    dataframe = dataframe.join(one_hot)

    # Stratify on the caller-supplied label column, not a hard-coded 'label'.
    train = dataframe.groupby(label_name, group_keys=False).apply(
        lambda class_group: class_group.sample(
            n=ceil(train_fraction * len(class_group)),
            random_state=seed)).drop(columns=label_name)
    test = dataframe.drop(train.index).drop(columns=label_name)

    X_train, y_train = train.drop(columns=class_columns), train[class_columns]
    X_test, y_test = test.drop(columns=class_columns), test[class_columns]

    model = GridSearchCV(
        estimator=ML_MODELS[model_name](random_state=seed),
        param_grid=ML_MODELS_GRID[model_name],
        cv=LeaveOneOut().split(X_train, y_train),
        scoring=make_scorer(accuracy_score),
        verbose=verbose,
        n_jobs=n_jobs,
        refit=True).fit(X_train, y_train)

    # Collapse the one-hot encodings back to integer class ids.
    y_test_pred = model.predict(X_test)
    y_test_true_argmax = y_test.values.argmax(axis=1)
    y_test_pred_argmax = y_test_pred.argmax(axis=1)

    class_index = {class_name: i
                   for i, class_name in enumerate(list(y_test))}
    test_report = pd.DataFrame(
        confusion_matrix(y_true=y_test_true_argmax,
                         y_pred=y_test_pred_argmax,
                         labels=list(class_index.values())),
        index=['true_' + class_name for class_name in class_index],
        columns=['pred_' + class_name for class_name in class_index])

    # The scalar metrics live in the first row only; pad the rest with None.
    placeholder = [None for _ in range(len(class_index) - 1)]
    test_report['test_oa_acc'] = [accuracy_score(
        y_true=y_test_true_argmax,
        y_pred=y_test_pred_argmax)] + placeholder
    test_report['test_avg_acc'] = [balanced_accuracy_score(
        y_true=y_test_true_argmax,
        y_pred=y_test_pred_argmax)] + placeholder
    return test_report, model.best_params_
def filtered_summaries(
    self,
    start_time,
    end_time,
    interval,
    filter_expression,
    summary_types,
    calculation_basis=None,
    filter_evaluation=None,
    filter_interval=None,
    time_type=None,
):
    """filtered_summaries

    Return one or more summary values for each interval within a time range

    Args:
        start_time (str): String containing the date, and possibly time,
            from which to retrieve the values. This is parsed, together
            with `end_time`, using
            :afsdk:`AF.Time.AFTimeRange <M_OSIsoft_AF_Time_AFTimeRange__ctor_1.htm>`.
        end_time (str): String containing the date, and possibly time,
            until which to retrieve values. This is parsed, together
            with `start_time`, using
            :afsdk:`AF.Time.AFTimeRange <M_OSIsoft_AF_Time_AFTimeRange__ctor_1.htm>`.
        interval (str): String containing the interval at which to extract
            data. This is parsed using
            :afsdk:`AF.Time.AFTimeSpan.Parse <M_OSIsoft_AF_Time_AFTimeSpan_Parse_1.htm>`.
        filter_expression (str): Query on which data to include in the
            results. See :ref:`filtering_values` for more information on
            filter queries.
        summary_types (int or PIConsts.SummaryType): Type(s) of summaries
            of the data within the requested time range.
        calculation_basis (int or PIConsts.CalculationBasis, optional):
            Event weighting within an interval. See :ref:`event_weighting`
            and :any:`CalculationBasis` for more information. Defaults to
            CalculationBasis.TIME_WEIGHTED.
        filter_evaluation (int or PIConsts.ExpressionSampleType, optional):
            Determines whether the filter is applied to the raw events in
            the database, or if it is applied to an interpolated series
            with a regular interval. Defaults to
            ExpressionSampleType.EXPRESSION_RECORDED_VALUES.
        filter_interval (str, optional): String containing the interval at
            which to apply the filter. This is parsed using
            :afsdk:`AF.Time.AFTimeSpan.Parse <M_OSIsoft_AF_Time_AFTimeSpan_Parse_1.htm>`.
        time_type (int or PIConsts.TimestampCalculation, optional):
            Timestamp to return for each of the requested summaries. See
            :ref:`summary_timestamps` and :any:`TimestampCalculation` for
            more information. Defaults to TimestampCalculation.AUTO.

    Returns:
        pandas.DataFrame: Dataframe with the unique timestamps as row index
            and the summary name as column name.
    """
    # Convert the user-facing arguments to their AF SDK counterparts.
    time_range = AF.Time.AFTimeRange(start_time, end_time)
    interval = AF.Time.AFTimeSpan.Parse(interval)
    filter_expression = self._normalize_filter_expression(filter_expression)
    calculation_basis = get_enumerated_value(
        enumeration=CalculationBasis,
        value=calculation_basis,
        default=CalculationBasis.TIME_WEIGHTED,
    )
    filter_evaluation = get_enumerated_value(
        enumeration=ExpressionSampleType,
        value=filter_evaluation,
        default=ExpressionSampleType.EXPRESSION_RECORDED_VALUES,
    )
    time_type = get_enumerated_value(
        enumeration=TimestampCalculation,
        value=time_type,
        default=TimestampCalculation.AUTO,
    )
    filter_interval = AF.Time.AFTimeSpan.Parse(filter_interval)

    pivalues = self._filtered_summaries(
        time_range,
        interval,
        filter_expression,
        summary_types,
        calculation_basis,
        filter_evaluation,
        filter_interval,
        time_type,
    )

    # One column per summary type, outer-joined on the timestamps.
    result = DataFrame()
    for summary in pivalues:
        column = SummaryType(summary.Key).name
        pairs = [
            (PISeries.timestamp_to_index(event.Timestamp.UtcTime), event.Value)
            for event in summary.Value
        ]
        index, data = zip(*pairs)
        result = result.join(
            DataFrame(data={column: data}, index=index), how="outer"
        )
    return result
# Script fragment: restrict the frames in `dfs1` to timestamps shared by
# dfs1[0] and dfs1[2], then assemble per-column tables across all frames and
# plot a correlation matrix. The column/attribute "时间" is Chinese for "time".

# Mask over the rows of dfs1[0]: True where the timestamp also occurs in dfs1[2].
i = [t in dfs1[2].时间.values for t in dfs1[0].时间.values]
dfs1[0] = dfs1[0][i]
# The same mask is reused for dfs1[1] -- assumes dfs1[1] has the same rows
# as dfs1[0]; TODO confirm.
dfs1[1] = dfs1[1][i]
# Reverse mask over the rows of dfs1[2], applied to frames 2..14.
# NOTE(review): the upper bound 15 is hard-coded rather than len(dfs1) -- confirm.
i = [t in dfs1[0].时间.values for t in dfs1[2].时间.values]
for j in range(2, 15):
    dfs1[j] = dfs1[j][i]
# Index every frame by its timestamp so the joins below align on time.
for j in range(len(dfs1)):
    dfs1[j] = dfs1[j].set_index("时间")
#dfs1[0] = dfs1[0][t in dfs1[2].时间.values for t in dfs1[0].时间.values]
# tr_data: first column (position 0) of every frame, cast to float, joined on the index.
tr_data = dfs1[0].iloc[:, 0].apply(float)
tr_data = DataFrame(tr_data)
for i in range(1, len(dfs1)):
    tr_data = tr_data.join(dfs1[i].iloc[:, 0].apply(float))
# tr_data1: same construction for the second column (position 1).
tr_data1 = dfs1[0].iloc[:, 1].apply(float)
tr_data1 = DataFrame(tr_data1)
for i in range(1, len(dfs1)):
    tr_data1 = tr_data1.join(dfs1[i].iloc[:, 1].apply(float))
# tr_data2: same construction for the third column (position 2).
tr_data2 = dfs1[0].iloc[:, 2].apply(float)
tr_data2 = DataFrame(tr_data2)
for i in range(1, len(dfs1)):
    tr_data2 = tr_data2.join(dfs1[i].iloc[:, 2].apply(float))
# Only tr_data2 feeds the correlation plot in this fragment.
corMat = DataFrame(tr_data2.corr())
plot.pcolor(corMat)
plot.show()
def get_forward_data(self, months, call=True, put=False):
    """
    Gets either call, put, or both data for months starting in the current
    month and going out in the future a specified amount of time.

    Parameters
    ----------
    months: number, int
        How many consecutive months to collect, starting with the current
        month (taken from the module-level ``cur_month`` / ``cur_year``).

    call: bool, optional (default=True)
        Whether or not to collect data for call options

    put: bool, optional (default=False)
        Whether or not to collect data for put options.

    Returns
    -------
    all_calls: DataFrame
        If asked for, a DataFrame containing call data for the requested
        months, with an added 'Expiry' column.

    all_puts: DataFrame
        If asked for, a DataFrame containing put data for the requested
        months, with an added 'Expiry' column.

    When both are requested the result is the list ``[all_calls, all_puts]``.

    Raises
    ------
    ValueError
        If both `call` and `put` are False (there is nothing to return).
    """
    if not (call or put):
        raise ValueError("at least one of `call` or `put` must be True")

    # (month, year) for every requested month. divmod rolls over any number
    # of year boundaries and avoids mutating a range object, which is not
    # possible on Python 3.
    periods = []
    for offset in range(months):
        years_ahead, month_index = divmod(cur_month - 1 + offset, 12)
        periods.append((month_index + 1, cur_year + years_ahead))

    def _collect(fetch):
        """Fetch one frame per period, tag it with its expiry and stack them."""
        frames = []
        for month_num, year_num in periods:
            try:
                # This catches cases when there isn't data for a month
                frame = fetch(month_num, year_num)
                # The contract ticker sits in the second column of the first
                # row; after the underlying symbol it carries YYMMDD.
                tick = str(frame.iloc[0, 1])
                start = len(self.symbol)
                year = tick[start:start + 2]
                month = tick[start + 2:start + 4]
                day = tick[start + 4:start + 6]
                frame['Expiry'] = '-'.join((month, day, year))
            except Exception:
                # Best effort: skip months that cannot be fetched or parsed,
                # without swallowing KeyboardInterrupt/SystemExit.
                continue
            frames.append(frame)
        return concat(frames) if frames else DataFrame()

    all_calls = _collect(self.get_call_data) if call else None
    all_puts = _collect(self.get_put_data) if put else None

    if call and put:
        return [all_calls, all_puts]
    return all_calls if call else all_puts
def _read_one_data(self, url, params): """ read one data from specified symbol """ symbol = params["symbol"] del params["symbol"] url = url.format(symbol) resp = self._get_response(url, params=params) ptrn = r"root\.App\.main = (.*?);\n}\(this\)\);" try: j = json.loads(re.search(ptrn, resp.text, re.DOTALL).group(1)) data = j["context"]["dispatcher"]["stores"]["HistoricalPriceStore"] except KeyError: msg = "No data fetched for symbol {} using {}" raise RemoteDataError(msg.format(symbol, self.__class__.__name__)) # price data prices = DataFrame(data["prices"]) prices.columns = [col.capitalize() for col in prices.columns] prices["Date"] = to_datetime( to_datetime(prices["Date"], unit="s").dt.date) if "Data" in prices.columns: prices = prices[prices["Data"].isnull()] prices = prices[[ "Date", "High", "Low", "Open", "Close", "Volume", "Adjclose" ]] prices = prices.rename(columns={"Adjclose": "Adj Close"}) prices = prices.set_index("Date") prices = prices.sort_index().dropna(how="all") if self.ret_index: prices["Ret_Index"] = _calc_return_index(prices["Adj Close"]) if self.adjust_price: prices = _adjust_prices(prices) # dividends & splits data if self.get_actions and data["eventsData"]: actions = DataFrame(data["eventsData"]) actions.columns = [col.capitalize() for col in actions.columns] actions["Date"] = to_datetime( to_datetime(actions["Date"], unit="s").dt.date) types = actions["Type"].unique() if "DIVIDEND" in types: divs = actions[actions.Type == "DIVIDEND"].copy() divs = divs[["Date", "Amount"]].reset_index(drop=True) divs = divs.set_index("Date") divs = divs.rename(columns={"Amount": "Dividends"}) prices = prices.join(divs, how="outer") if "SPLIT" in types: def split_ratio(row): if float(row["Numerator"]) > 0: if ":" in row["Splitratio"]: n, m = row["Splitratio"].split(':') return float(m) / float(n) else: return eval(row["Splitratio"]) else: return 1 splits = actions[actions.Type == "SPLIT"].copy() splits["SplitRatio"] = splits.apply(split_ratio, axis=1) splits 
= splits.reset_index(drop=True) splits = splits.set_index("Date") splits["Splits"] = splits["SplitRatio"] prices = prices.join(splits["Splits"], how="outer") if "DIVIDEND" in types and not self.adjust_dividends: # dividends are adjusted automatically by Yahoo adj = (prices["Splits"].sort_index( ascending=False).fillna(1).cumprod()) prices["Dividends"] = prices["Dividends"] / adj return prices
def data_encoding(self, raw_data: pd.DataFrame, building_num: int, gates_code_table: dict) -> tuple:
    """
    Encode raw record data from database

    :param gates_code_table: gate code table mapping a
        '<building>-<floor>-<IO>' string to an integer gate code; strings
        missing from the table are encoded as 0
    :param building_num: total buildings
    :param raw_data: raw data from DataTable.get_raw_record_data()
    :return: tuple ``(data_list, target_list)``
        - data_list : feature array of the encoded data
        - target_list : gate code of the following record, one entry per
          row of data_list
    """
    week_data = raw_data['datetime'].dt.weekday.rename('week')
    raw_data = raw_data.join(week_data)
    raw_data = raw_data.reset_index().drop(columns=['index'])
    data_list = pd.DataFrame()

    #############################
    #     Feature Encoding      #
    #############################
    # Fit each one-hot encoder on the full range of codes it has to support.
    gate_encoder = OneHotEncoder()
    gate_encoder.fit(np.arange(len(gates_code_table)).reshape(-1, 1))

    week_encoder = OneHotEncoder()
    week_encoder.fit(np.arange(7).reshape(-1, 1))

    building_encoder = OneHotEncoder()
    building_encoder.fit(np.arange(1, building_num + 1).reshape(-1, 1))

    # gate one hot encoding
    gatecode = raw_data['building'].str.cat(
        [raw_data['floor'], raw_data['IO']], sep='-').apply(
        lambda key: gates_code_table.get(key, 0)).rename('gate').astype(int)
    raw_data['gate'] = gatecode
    # The prediction target is the gate of the next record (NaN for the last row).
    raw_data['next_gate'] = gatecode.shift(-1)
    gatecode_onehotcode = gate_encoder.transform(gatecode.values.reshape(-1, 1)).toarray()
    gatecode_onehotcode = pd.DataFrame(gatecode_onehotcode, dtype='int').add_prefix('gate_')

    # weekday one hot encoding
    weekdaycode = week_encoder.transform(raw_data['week'].values.reshape(-1, 1)).toarray()
    weekdaycode = pd.DataFrame(weekdaycode, dtype='int').add_prefix('weekday_')

    # building one hot encoding
    buildingcode = raw_data['building'].astype(int)
    buildingcode_onehotcode = building_encoder.transform(buildingcode.values.reshape(-1, 1)).toarray()
    buildingcode_onehotcode = pd.DataFrame(buildingcode_onehotcode, dtype='int').add_prefix('building_')

    # Time feature, each scaled into [0, 1)
    data_list['hour'] = raw_data['datetime'].dt.hour / 24
    data_list['minute'] = raw_data['datetime'].dt.minute / 60
    data_list['second'] = raw_data['datetime'].dt.second / 60

    # IO code
    IOcode = raw_data['IO'].apply(convert_IOcode)

    # join feature
    data_list = data_list.join(other=[IOcode, weekdaycode, buildingcode_onehotcode, gatecode_onehotcode])

    # match order: take the targets of exactly the rows that survive dropna,
    # so features and targets always have the same length and alignment
    data_list = data_list.dropna(how='any')
    target_list = raw_data.loc[data_list.index, 'next_gate']

    return data_list.values, target_list.values.flatten()
def test_join_empty_bug(self): # generated an exception in 0.4.3 x = DataFrame() x.join(DataFrame([3], index=[0], columns=['A']), how='outer')
def fit_transform(self, df: pd.DataFrame) -> pd.DataFrame:
    """Return ``df`` joined (on the index) with the output of ``self.transform(df)``."""
    # NOTE(review): the `transform` visible elsewhere in this file already
    # returns `df.join(...)`; if `self.transform` is that method, this second
    # join would see overlapping column names -- confirm.
    return df.join(self.transform(df))
def align_srt_tshark_stats(stats: pd.DataFrame, rcv_tshark_csv: str):
    """
    Align SRT statistics and tshark data.

    The UMSG_ACK packets found in the receiver-side tshark dump are joined
    onto the timepoints of `stats`, interpolated between those timepoints,
    and only the rows at the original `stats` timepoints are returned. The
    passed-in `stats` frame is not modified.

    Attributes:
        stats:
            Aligned SRT statistics collected both at the receiver and
            sender sides, the output from align_srt_stats function.
        rcv_tshark_csv:
            Filepath to .csv tshark data collected at the receiver side.

    Returns:
        DataFrame indexed like `stats`, holding the selected SRT statistics
        columns followed by the interpolated `*_tshark` columns.
    """
    print('\nMerging tshark data with SRT statistics')

    # Extract SRT packets from .csv tshark dump file collected at the receiver side
    srt_packets = extract_srt_packets(rcv_tshark_csv)
    print('\nSRT packets extracted from receiver tshark dump')
    print(srt_packets.head(10))

    # Extract UMSG_ACK packets from SRT packets srt_packets that
    # contain receiving speed and bandwidth estimations reported by
    # receiver each 10 ms
    umsg_ack_packets = extract_umsg_ack_packets(srt_packets)
    print('\nUMSG_ACK packets extracted from SRT packets')
    print(umsg_ack_packets.head(10))

    # From umsg_ack_packets dataframe, extract features valuable
    # for further analysis, do some data cleaning and timezone correction
    TSHARK_FEATURES = [
        'ws.no',
        'frame.time',
        'srt.rtt',
        'srt.rttvar',
        'srt.rate',
        'srt.bw',
        'srt.rcvrate'
    ]
    umsg_ack_packets = umsg_ack_packets[TSHARK_FEATURES]
    umsg_ack_packets = umsg_ack_packets.set_index('frame.time')
    umsg_ack_packets.index = umsg_ack_packets.index.tz_convert(None)
    umsg_ack_packets['srt.rtt'] = umsg_ack_packets['srt.rtt'] / 1000
    umsg_ack_packets['srt.rttvar'] = umsg_ack_packets['srt.rttvar'] / 1000
    umsg_ack_packets = umsg_ack_packets.rename(
        columns={
            'srt.rtt': 'srt.rtt.ms',
            'srt.rttvar': 'srt.rttvar.ms',
            'srt.rate': 'srt.rate.pkts',
            'srt.bw': 'srt.bw.pkts',
            'srt.rcvrate': 'srt.rate.Bps'
        }
    )
    umsg_ack_packets['srt.rate.Mbps'] = convert_bytesps_in_mbps(
        umsg_ack_packets['srt.rate.Bps']
    )
    umsg_ack_packets['srt.bw.Mbps'] = convert_bytesps_in_mbps(
        convert_pktsps_in_bytesps(umsg_ack_packets['srt.bw.pkts'])
    )
    umsg_ack_packets = umsg_ack_packets[
        [
            'ws.no',
            'srt.rtt.ms',
            'srt.rttvar.ms',
            'srt.rate.pkts',
            'srt.rate.Mbps',
            'srt.bw.pkts',
            'srt.bw.Mbps'
        ]
    ]
    print('\nAdjusted UMSG_ACK packets')
    print(umsg_ack_packets.head(10))
    print(umsg_ack_packets.tail(10))

    # Combine stats dataframe (with SRT statistics) and adjusted
    # umsg_ack_packets dataframe. stats dataframe timepoints will be
    # further used as the timepoints for result dataframe
    start_timestamp = stats.index[0]
    end_timestamp = stats.index[-1]

    # assign() returns a new frame, so the marker column is not added to the
    # caller's `stats`.
    stats = stats.assign(isStats=True)
    cols = ['srt.rtt.ms', 'srt.rttvar.ms', 'srt.rate.Mbps', 'srt.bw.Mbps']
    df = stats.join(umsg_ack_packets[cols].add_suffix('_tshark'), how='outer')
    # Rows that came only from tshark have no marker: make it a real boolean.
    df['isStats'] = df['isStats'].notna()
    df = df[(df.index >= start_timestamp) & (df.index <= end_timestamp)].copy()
    # Positional access: the index holds timestamps, not integer labels.
    assert df['isStats'].iloc[0]
    assert df['isStats'].iloc[-1]

    print('\nJoined SRT stats and tshark statistics')
    print(df.head(10))
    print(df.tail(10))

    # Do interpolation, restricted to the tshark columns so that unrelated
    # (possibly non-numeric) columns are never touched
    cols_to_interpolate = [f'{col}_tshark' for col in cols]
    df[cols_to_interpolate] = (
        df[cols_to_interpolate].interpolate().bfill().round(2)
    )

    print('\nInterpolated tshark statistics')
    print(df.head(10))
    print(df.tail(10))

    # Extract only stats dataframe timepoints (aligned SRT stats timepoints)
    df = df.loc[df['isStats'], df.columns != 'isStats'].copy()
    cols_to_int = [
        'pktSent_snd',
        'pktSndLoss_snd',
        'pktRecv_rcv',
        'pktRcvLoss_rcv',
    ]
    df = df.astype({col: 'int32' for col in cols_to_int})

    print('\nOnly SRT stats timepoints')
    print(df.head(10))
    print(df.tail(10))

    # Rearrange the columns
    cols_rearranged = [
        'pktSent_snd',
        'pktRecv_rcv',
        'pktSndLoss_snd',
        'pktRcvLoss_rcv',
        'msRTT_snd',
        'msRTT_rcv',
        'srt.rtt.ms_tshark',
        'srt.rttvar.ms_tshark',
        'mbpsBandwidth_snd',
        'mbpsBandwidth_rcv',
        'srt.bw.Mbps_tshark',
        # 'srt.rate.Mbps_tshark'
    ]
    df = df[cols_rearranged]

    return df
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
    """Encode column ``self.name`` of ``df`` and return ``df`` joined with the result.

    The internal encoder's output has any ``'intercept'`` column removed and
    is passed through ``self.update_column_names`` before the join.
    """
    encoded = self._internal_encoder.transform(df[self.name])
    # errors='ignore': the encoder output may or may not carry an intercept.
    without_intercept = encoded.drop(columns=['intercept'], errors='ignore')
    renamed = self.update_column_names(without_intercept)
    return df.join(renamed)
def test_left_join_index_multi_match_multiindex(self): left = DataFrame( [ ["X", "Y", "C", "a"], ["W", "Y", "C", "e"], ["V", "Q", "A", "h"], ["V", "R", "D", "i"], ["X", "Y", "D", "b"], ["X", "Y", "A", "c"], ["W", "Q", "B", "f"], ["W", "R", "C", "g"], ["V", "Y", "C", "j"], ["X", "Y", "B", "d"], ], columns=["cola", "colb", "colc", "tag"], index=[3, 2, 0, 1, 7, 6, 4, 5, 9, 8], ) right = DataFrame( [ ["W", "R", "C", 0], ["W", "Q", "B", 3], ["W", "Q", "B", 8], ["X", "Y", "A", 1], ["X", "Y", "A", 4], ["X", "Y", "B", 5], ["X", "Y", "C", 6], ["X", "Y", "C", 9], ["X", "Q", "C", -6], ["X", "R", "C", -9], ["V", "Y", "C", 7], ["V", "R", "D", 2], ["V", "R", "D", -1], ["V", "Q", "A", -3], ], columns=["col1", "col2", "col3", "val"], ).set_index(["col1", "col2", "col3"]) result = left.join(right, on=["cola", "colb", "colc"], how="left") expected = DataFrame( [ ["X", "Y", "C", "a", 6], ["X", "Y", "C", "a", 9], ["W", "Y", "C", "e", np.nan], ["V", "Q", "A", "h", -3], ["V", "R", "D", "i", 2], ["V", "R", "D", "i", -1], ["X", "Y", "D", "b", np.nan], ["X", "Y", "A", "c", 1], ["X", "Y", "A", "c", 4], ["W", "Q", "B", "f", 3], ["W", "Q", "B", "f", 8], ["W", "R", "C", "g", 0], ["V", "Y", "C", "j", 7], ["X", "Y", "B", "d", 5], ], columns=["cola", "colb", "colc", "tag", "val"], index=[3, 3, 2, 0, 1, 1, 7, 6, 6, 4, 4, 5, 9, 8], ) tm.assert_frame_equal(result, expected) result = left.join(right, on=["cola", "colb", "colc"], how="left", sort=True) expected = expected.sort_values(["cola", "colb", "colc"], kind="mergesort") tm.assert_frame_equal(result, expected)
def test_join_multi_levels(self):
    # GH 3662
    # Join a single-index frame against a two-level MultiIndex frame that
    # shares the "household_id" level.
    household = DataFrame(
        {
            "household_id": [1, 2, 3],
            "male": [0, 1, 0],
            "wealth": [196087.3, 316478.7, 294750],
        },
        columns=["household_id", "male", "wealth"],
    )
    household = household.set_index("household_id")

    portfolio = DataFrame(
        {
            "household_id": [1, 2, 2, 3, 3, 3, 4],
            "asset_id": [
                "nl0000301109",
                "nl0000289783",
                "gb00b03mlx29",
                "gb00b03mlx29",
                "lu0197800237",
                "nl0000289965",
                np.nan,
            ],
            "name": [
                "ABN Amro",
                "Robeco",
                "Royal Dutch Shell",
                "Royal Dutch Shell",
                "AAB Eastern Europe Equity Fund",
                "Postbank BioTech Fonds",
                np.nan,
            ],
            "share": [1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0],
        },
        columns=["household_id", "asset_id", "name", "share"],
    )
    portfolio = portfolio.set_index(["household_id", "asset_id"])

    # inner join on the shared level
    expected = DataFrame(
        {
            "male": [0, 1, 1, 0, 0, 0],
            "wealth": [
                196087.3, 316478.7, 316478.7, 294750.0, 294750.0, 294750.0
            ],
            "name": [
                "ABN Amro",
                "Robeco",
                "Royal Dutch Shell",
                "Royal Dutch Shell",
                "AAB Eastern Europe Equity Fund",
                "Postbank BioTech Fonds",
            ],
            "share": [1.00, 0.40, 0.60, 0.15, 0.60, 0.25],
            "household_id": [1, 2, 2, 3, 3, 3],
            "asset_id": [
                "nl0000301109",
                "nl0000289783",
                "gb00b03mlx29",
                "gb00b03mlx29",
                "lu0197800237",
                "nl0000289965",
            ],
        }
    )
    expected = expected.set_index(["household_id", "asset_id"])
    expected = expected.reindex(columns=["male", "wealth", "name", "share"])

    result = household.join(portfolio, how="inner")
    tm.assert_frame_equal(result, expected)

    # equivalency: the same result via an explicit merge on the column
    merged = merge(
        household.reset_index(),
        portfolio.reset_index(),
        on=["household_id"],
        how="inner",
    )
    result = merged.set_index(["household_id", "asset_id"])
    tm.assert_frame_equal(result, expected)

    # outer join additionally keeps the portfolio row with no household
    unmatched = DataFrame(
        {"share": [1.00]},
        index=MultiIndex.from_tuples(
            [(4, np.nan)], names=["household_id", "asset_id"]),
    )
    expected = concat([expected, unmatched], axis=0, sort=True).reindex(
        columns=expected.columns)
    result = household.join(portfolio, how="outer")
    tm.assert_frame_equal(result, expected)

    # invalid cases
    household.index.name = "foo"

    no_overlap_msg = "cannot join with no overlapping index names"
    with pytest.raises(ValueError, match=no_overlap_msg):
        household.join(portfolio, how="inner")

    portfolio2 = portfolio.copy()
    # NOTE(review): the index returned by set_names is discarded here, so
    # portfolio2 keeps the original level names -- confirm this is intended.
    portfolio2.index.set_names(["household_id", "foo"])

    overlap_msg = "columns overlap but no suffix specified"
    with pytest.raises(ValueError, match=overlap_msg):
        portfolio2.join(portfolio, how="inner")
def add_dummies(data: pd.DataFrame, column: str):
    """Replace ``column`` of ``data`` with one-hot columns named ``<column>_<value>``.

    Returns a new frame; the frame passed in is left unchanged.
    """
    indicator_cols = pd.get_dummies(data[column]).add_prefix(f'{column}_')
    remaining = data.drop(column, axis=1)
    return remaining.join(indicator_cols)
def insert_timeseries(self, df: pd.DataFrame, columns: list, timeseries: dict, interpolate=None, plot=False,
                      title=None, columns_i: list = None, minType=None):
    """Build gap-filled time-series columns and outer-join them onto ``df``.

    The period from ``self.START_DATE`` to ``self.END_DATE`` (inclusive) is
    walked in steps of one hour when ``self.TIMESTEP == "hourly"`` and one day
    otherwise. Each step is looked up in ``timeseries`` under the key
    ``strftime("%Y-%m-%d %H")``; steps without an entry become all-NaN rows.
    Values whose integer part is -9998 or -9999, and NaN values, are treated
    as missing.

    Args:
        df: Frame the new columns are joined onto (index-based outer join).
        columns: Names of the new columns. Not modified.
        timeseries: Mapping of datestamp string to a sequence of raw values.
        interpolate: Gap-filling strategy; ``self.INTERPOLATE`` is used when
            falsy. Accepted: "linear", "slinear", "quadratic", "cubic",
            "values", "polynomial", "spline" (order 4 for the last two),
            "gaussian" (delegated to ``self.random_gaussian``), and the fill
            directions "ffill"/"pad" and "bfill"/"backfill".
        plot: When truthy, plot raw against filled series with matplotlib.
            This reads the "year", "month", "day" and "hour" columns of the
            joined frame.
        title: Text used in the plot title.
        columns_i: Optional positions to pick from every timeseries row; when
            falsy the whole row is used.
        minType: Optional ``strptime`` format applied to the values picked
            through ``columns_i``.

    Returns:
        The joined frame.

    Raises:
        ValueError: If ``interpolate`` is not one of the accepted strategies.
    """
    if not interpolate:
        interpolate = self.INTERPOLATE

    def clean(value):
        # -9998 / -9999 are the missing-data markers; NaN is checked first
        # because int(nan) raises.
        if np.isnan(value) or int(value) in (-9998, -9999):
            return np.nan
        return value

    step = timedelta(hours=1) if self.TIMESTEP == "hourly" else timedelta(days=1)  # hourly or daily
    width = len(columns_i) if columns_i else len(columns)

    rows = []
    current = copy.copy(self.START_DATE)
    while current <= self.END_DATE:
        datestamp = current.strftime("%Y-%m-%d %H")
        if datestamp in timeseries:
            record = timeseries[datestamp]
            if columns_i and minType:
                # NOTE(review): the parsed datetime goes through the same
                # numeric checks as float values, which do not accept a
                # datetime, so this path looks like it always raises
                # TypeError -- confirm the intended conversion.
                picked = [datetime.strptime(record[i], minType) for i in columns_i]
            elif columns_i:
                picked = [float(record[i]) for i in columns_i]
            else:
                picked = [float(raw) for raw in record]
            values = [clean(value) for value in picked]
        else:
            values = [np.nan] * width
        rows.append(np.asarray(values, dtype=np.float64))
        current = current + step

    data_df = pd.DataFrame(rows, columns=columns, dtype=np.float64)
    # 2-D snapshot of the values before gap filling, used for plotting.
    raw_values = data_df.to_numpy(copy=True)

    if interpolate == "gaussian":
        # Join exactly once: joining inside a per-column loop collides on the
        # columns added by the previous iteration.
        df = df.join(data_df, how='outer')
        df = self.random_gaussian(df, columns)
    else:
        for name in columns:
            series = data_df[name]
            if interpolate in ("linear", "slinear", "quadratic", "cubic", "values"):
                filled = series.interpolate(method=interpolate)
            elif interpolate in ("polynomial", "spline"):
                filled = series.interpolate(method=interpolate, order=4)
            elif interpolate in ("ffill", "pad"):
                filled = series.ffill()
            elif interpolate in ("bfill", "backfill"):
                filled = series.bfill()
            else:
                raise ValueError("Invalid fill method: {!r}".format(interpolate))
            # Leading/trailing gaps cannot be interpolated; pad them from the
            # nearest valid value.
            data_df[name] = filled.ffill().bfill()
        df = df.join(data_df, how='outer')

    if plot:
        plot_data = pd.DataFrame()
        # Copy so the caller's list is not extended with the "_0" names.
        plot_columns = list(columns)
        for position, name in enumerate(columns):
            filled_name = name + "_0"
            plot_data[name] = raw_values[:, position]
            plot_data[filled_name] = df[name]
            plot_columns.append(filled_name)
        plot_data["datetime"] = pd.to_datetime(df[["year", "month", "day", "hour"]])

        colors = ['b', 'm', 'g', 'c', 'y', 'k']
        first = plot_columns[0]
        # Each series is drawn twice: once on the shared axes and once on a
        # figure of its own.
        # NOTE(review): confirm the per-series figures are wanted.
        ax = plot_data.plot(x='datetime', y=first, linewidth=1.0, label=first, color=colors[0], figsize=(16, 8))
        plot_data.plot(x='datetime', y=first, linewidth=1.0, label=first, color=colors[0], figsize=(16, 8))
        for k in range(1, len(plot_columns)):
            series_name = plot_columns[k]
            plot_data.plot(x='datetime', y=series_name, linewidth=1.0, label=series_name, color=colors[0],
                           figsize=(16, 8))
            # Cycle through the palette so more than six series do not overrun it.
            plot_data.plot(x='datetime', y=series_name, linewidth=0.5, label=series_name,
                           color=colors[k % len(colors)], ax=ax)
        ax.set_title("{} - {} interpolation".format(title, interpolate))
        plt.show()

    return df