Python DataFrame.join Examples, pandas.DataFrame.join Python Examples

Example #1

0

Show file

File: test_join.py Project: johnnychiuchiu/pandas

    def test_join_multiindex(self):
        index1 = MultiIndex.from_arrays([['a', 'a', 'a', 'b', 'b', 'b'],
                                         [1, 2, 3, 1, 2, 3]],
                                        names=['first', 'second'])

        index2 = MultiIndex.from_arrays([['b', 'b', 'b', 'c', 'c', 'c'],
                                         [1, 2, 3, 1, 2, 3]],
                                        names=['first', 'second'])

        df1 = DataFrame(data=np.random.randn(6), index=index1,
                        columns=['var X'])
        df2 = DataFrame(data=np.random.randn(6), index=index2,
                        columns=['var Y'])

        df1 = df1.sort_index(level=0)
        df2 = df2.sort_index(level=0)

        joined = df1.join(df2, how='outer')
        ex_index = Index(index1.values).union(Index(index2.values))
        expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
        expected.index.names = index1.names
        assert_frame_equal(joined, expected)
        assert joined.index.names == index1.names

        df1 = df1.sort_index(level=1)
        df2 = df2.sort_index(level=1)

        joined = df1.join(df2, how='outer').sort_index(level=0)
        ex_index = Index(index1.values).union(Index(index2.values))
        expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
        expected.index.names = index1.names

        assert_frame_equal(joined, expected)
        assert joined.index.names == index1.names

Example #2

0

Show file

File: test_join.py Project: johnnychiuchiu/pandas

    def test_join_multi_to_multi(self, join_type):
        """Join a three-level frame to a two-level frame through ``on``.

        Also checks that a key list whose length differs from the number of
        levels in the other frame's index raises ``ValueError``.
        """
        # GH 20475
        left_levels = [list('abc'), list('xy'), [1, 2]]
        left = DataFrame(
            {'v1': range(12)},
            index=MultiIndex.from_product(left_levels,
                                          names=['abc', 'xy', 'num']))

        right_levels = [list('abc'), list('xy')]
        right = DataFrame(
            {'v2': list(range(100, 700, 100))},
            index=MultiIndex.from_product(right_levels,
                                          names=['abc', 'xy']))

        result = left.join(right, on=['abc', 'xy'], how=join_type)

        # Reference: the same operation spelled as a column merge.
        merged = left.reset_index().merge(right.reset_index(),
                                          on=['abc', 'xy'], how=join_type)
        expected = merged.set_index(['abc', 'xy', 'num'])
        assert_frame_equal(expected, result)

        msg = (r'len\(left_on\) must equal the number of levels in the index'
               ' of "right"')
        bad_calls = (
            (left, right, 'xy'),
            (right, left, ['abc', 'xy']),
        )
        for caller, other, keys in bad_calls:
            with pytest.raises(ValueError, match=msg):
                caller.join(other, on=keys, how=join_type)

Example #3

0

Show file

File: bike.py Project: kowalczewski/Bike

def plots_workingTrends():
    """Plot average hourly rental counts for working and non-working days.

    Uses the module-level ``bike_data`` frame (its ``workingday``, ``time``
    and ``count`` columns) and shows the figure with matplotlib.
    """
    # holiday = 0 and workday = 0 => weekend
    # let's see if holidays and weekends give the same trends

    # Day trends -- working vs. non-working day
    hours = np.linspace(0, 23, 24)
    days_average = DataFrame({'Hour': hours})

    for label, flag in (('Working day', 1), ('Non-working day', 0)):
        day_mask = bike_data["workingday"] == flag
        # Select 'count' before averaging: averaging the whole frame first
        # does needless work and fails if any other column is non-numeric.
        mean_vec = [
            bike_data.loc[day_mask & (bike_data["time"] == hour), 'count'].mean()
            for hour in hours
        ]
        days_average = days_average.join(DataFrame({label: mean_vec}))

    days_average.drop('Hour', axis=1).plot(figsize=(12, 6), linewidth=3, fontsize=16)
    plt.xlabel('Hour', fontsize=16)
    plt.ylabel('Average counts', fontsize=16)
    plt.legend(loc='best', fontsize=16)
    plt.show()

Example #4

0

Show file

File: test_join.py Project: johnnychiuchiu/pandas

    def test_join_on_singlekey_list(self):
        df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']})
        df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c'])

        # corner cases
        joined = df.join(df2, on=['key'])
        expected = df.join(df2, on='key')

        assert_frame_equal(joined, expected)

Example #5

0

Show file

File: test_multilevel.py Project: eisenkdr/pandas

 def test_join_segfault(self):
     # 1532
     df1 = DataFrame({'a': [1, 1], 'b': [1, 2], 'x': [1, 2]})
     df2 = DataFrame({'a': [2, 2], 'b': [1, 2], 'y': [1, 2]})
     df1 = df1.set_index(['a', 'b'])
     df2 = df2.set_index(['a', 'b'])
     # it works!
     for how in ['left', 'right', 'outer']:
         df1.join(df2, how=how)

Example #6

0

Show file

File: test_join.py Project: johnnychiuchiu/pandas

    def test_join_unconsolidated(self):
        # GH #331
        a = DataFrame(randn(30, 2), columns=['a', 'b'])
        c = Series(randn(30))
        a['c'] = c
        d = DataFrame(randn(30, 1), columns=['q'])

        # it works!
        a.join(d)
        d.join(a)

Example #7

0

Show file

File: join_merge.py Project: TomAugspurger/pandas

class JoinIndex(object):
    """Benchmark a left join of a frame's column onto another frame's index."""

    def setup(self):
        """Create the two random integer frames used by the timing method."""
        N = 50000
        # Floor division keeps the exclusive upper bound an int on Python 3,
        # where ``N / 500`` would hand ``randint`` a float.
        upper = N // 500
        self.left = DataFrame(np.random.randint(1, upper, (N, 2)),
                              columns=['jim', 'joe'])
        self.right = DataFrame(np.random.randint(1, upper, (N, 2)),
                               columns=['jolie', 'jolia']).set_index('jolie')

    def time_left_outer_join_index(self):
        """Join ``right`` (indexed by 'jolie') to ``left`` on column 'jim'."""
        self.left.join(self.right, on='jim')

Example #8

0

Show file

File: test_join.py Project: johnnychiuchiu/pandas

    def test_join_on_inner(self):
        df = DataFrame({'key': ['a', 'a', 'd', 'b', 'b', 'c']})
        df2 = DataFrame({'value': [0, 1]}, index=['a', 'b'])

        joined = df.join(df2, on='key', how='inner')

        expected = df.join(df2, on='key')
        expected = expected[expected['value'].notna()]
        tm.assert_series_equal(joined['key'], expected['key'],
                               check_dtype=False)
        tm.assert_series_equal(joined['value'], expected['value'],
                               check_dtype=False)
        tm.assert_index_equal(joined.index, expected.index)

Example #9

0

Show file

File: Pull_Data.py Project: cajohnst/Optimized_FX_Portfolio

def merge_with_technicals(currency_list, returns_table, fundamentals_table, RSI, MACD, Stochastics, beg_date, stoch_date):
    """Build one joined frame per currency and return them as a list.

    For each currency the frame starts from that currency's column of
    ``returns_table`` and is left-joined with ``fundamentals_table``, then
    with the currency's column of ``RSI`` and ``MACD`` (suffixes ``_RSI`` and
    ``_MACD``). The ``Stochastics`` column (suffix ``_Stoch``) is joined only
    when ``beg_date > stoch_date``.
    """
    frames = []
    for currency in currency_list:
        frame = DataFrame(returns_table[currency])
        frame = frame.join(fundamentals_table, how='left', rsuffix='')

        indicator_sources = [(RSI, '_RSI'), (MACD, '_MACD')]
        if beg_date > stoch_date:
            indicator_sources.append((Stochastics, '_Stoch'))

        for source, suffix in indicator_sources:
            frame = frame.join(source[currency], how='left', rsuffix=suffix)

        frames.append(frame)

    return frames

Example #10

0

Show file

File: bike.py Project: kowalczewski/Bike

def read_data(test = False):
    """Load the CSV and split its ``datetime`` column into separate columns.

    Reads ``test.csv`` when ``test`` is truthy, otherwise ``train.csv``. The
    ``datetime`` column is replaced by ``year``, ``month``, ``dayMonth``,
    ``day`` (weekday name) and ``time`` (hour as an int); these are placed
    before the remaining CSV columns.
    """
    filename = 'test.csv' if test else 'train.csv'

    # read data; output: dataframe
    data = pd.read_csv(filename)

    # split datetime into date and time (only the hour is kept)
    stamp_parts = [stamp.split() for stamp in data['datetime']]
    date_and_time = DataFrame({
        'date': [parts[0] for parts in stamp_parts],
        'time': [int(parts[1].split(':')[0]) for parts in stamp_parts],
    })

    del data['datetime']
    data = date_and_time.join(data)

    # add day of the week
    # https://docs.python.org/2/library/datetime.html
    # .strftime('%A') -- sets proper format
    weekday_names = [
        datetime.datetime.strptime(text, '%Y-%m-%d').strftime('%A')
        for text in data['date']
    ]
    data = DataFrame({'day': weekday_names}).join(data)

    # split date into year | month | dayMonth
    date_pieces = [text.split('-') for text in data['date']]
    year_month_day = DataFrame({
        'year': [int(pieces[0]) for pieces in date_pieces],
        'month': [int(pieces[1]) for pieces in date_pieces],
        'dayMonth': [int(pieces[2]) for pieces in date_pieces],
    })

    del data['date']
    return year_month_day.join(data)

Example #11

0

Show file

File: test_join.py Project: johnnychiuchiu/pandas

    def test_join_many_non_unique_index(self):
        """Join a list of frames on a non-unique two-level index.

        Covers three scenarios: an outer join and an inner join of three
        frames (each checked against chained ``merge`` calls), and a Series
        with a duplicated index joined to a frame (GH 11519).
        """
        # Scenario 1: outer join; key (1, 1) is duplicated in df1.
        df1 = DataFrame({"a": [1, 1], "b": [1, 1], "c": [10, 20]})
        df2 = DataFrame({"a": [1, 1], "b": [1, 2], "d": [100, 200]})
        df3 = DataFrame({"a": [1, 1], "b": [1, 2], "e": [1000, 2000]})
        idf1 = df1.set_index(["a", "b"])
        idf2 = df2.set_index(["a", "b"])
        idf3 = df3.set_index(["a", "b"])

        result = idf1.join([idf2, idf3], how='outer')

        # Reference: merge the un-indexed frames pairwise on the key columns.
        df_partially_merged = merge(df1, df2, on=['a', 'b'], how='outer')
        expected = merge(df_partially_merged, df3, on=['a', 'b'], how='outer')

        # Bring the keys back as columns and align column order and key
        # dtypes before comparing.
        result = result.reset_index()
        expected = expected[result.columns]
        expected['a'] = expected.a.astype('int64')
        expected['b'] = expected.b.astype('int64')
        assert_frame_equal(result, expected)

        # Scenario 2: inner join with three rows per frame.
        df1 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 1], "c": [10, 20, 30]})
        df2 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "d": [100, 200, 300]})
        df3 = DataFrame(
            {"a": [1, 1, 1], "b": [1, 1, 2], "e": [1000, 2000, 3000]})
        idf1 = df1.set_index(["a", "b"])
        idf2 = df2.set_index(["a", "b"])
        idf3 = df3.set_index(["a", "b"])
        result = idf1.join([idf2, idf3], how='inner')

        df_partially_merged = merge(df1, df2, on=['a', 'b'], how='inner')
        expected = merge(df_partially_merged, df3, on=['a', 'b'], how='inner')

        result = result.reset_index()

        assert_frame_equal(result, expected.loc[:, result.columns])

        # GH 11519
        # Scenario 3: every label 0..7 appears twice in the Series index, so
        # all four join types are expected to give the same frame.
        df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                              'foo', 'bar', 'foo', 'foo'],
                        'B': ['one', 'one', 'two', 'three',
                              'two', 'two', 'one', 'three'],
                        'C': np.random.randn(8),
                        'D': np.random.randn(8)})
        s = Series(np.repeat(np.arange(8), 2),
                   index=np.repeat(np.arange(8), 2), name='TEST')
        inner = df.join(s, how='inner')
        outer = df.join(s, how='outer')
        left = df.join(s, how='left')
        right = df.join(s, how='right')
        assert_frame_equal(inner, outer)
        assert_frame_equal(inner, left)
        assert_frame_equal(inner, right)

Example #12

0

Show file

File: GEOparse.py Project: mfiers/GEOparse

def parse_GDS_columns(lines, subsets):
    """Parse list of line with columns description from SOFT file
    of GDS (GEO Dataset)

    Lines starting with ``#`` are parsed into a ``description`` column. Each
    subset of type "disease state" or "individual" adds its description to
    the ``disease_state`` / ``individual`` column for every sample id it
    lists; other subset types are reported through ``stderr`` and skipped.

    :param lines: iterable -- iterator over lines
    :param subsets: dict -- subset name -> subset object; each subset is read
        through its ``metadata`` mapping and ``get_type()`` method
    :returns: pandas.DataFrame -- columns description

    """
    data = []
    index = []
    for line in lines:
        line = line.rstrip()
        if line.startswith("#"):
            tmp = __parse_entry(line)
            data.append(tmp[1])
            index.append(tmp[0])

    df = DataFrame(data, index=index, columns=['description'])
    subset_ids = {"disease_state": {}, "individual": {}}
    # ``items()`` works on Python 2 and 3; ``iteritems()`` exists only on 2.
    for subsetname, subset in subsets.items():
        subset_type = subset.get_type()
        for expid in subset.metadata["sample_id"][0].split(","):
            if subset_type == "disease state":
                subset_ids["disease_state"][expid] = subset.metadata["description"][0]
            elif subset_type == "individual":
                subset_ids["individual"][expid] = subset.metadata["description"][0]
            else:
                stderr("Unknown subset type: %s for subset %s\n" % (subset_type, subsetname))

    return df.join(DataFrame(subset_ids))

Example #13

0

Show file

File: test_join.py Project: johnnychiuchiu/pandas

    def test_join_sort(self):
        left = DataFrame({'key': ['foo', 'bar', 'baz', 'foo'],
                          'value': [1, 2, 3, 4]})
        right = DataFrame({'value2': ['a', 'b', 'c']},
                          index=['bar', 'baz', 'foo'])

        joined = left.join(right, on='key', sort=True)
        expected = DataFrame({'key': ['bar', 'baz', 'foo', 'foo'],
                              'value': [2, 3, 1, 4],
                              'value2': ['a', 'b', 'c', 'c']},
                             index=[1, 2, 0, 3])
        assert_frame_equal(joined, expected)

        # smoke test
        joined = left.join(right, on='key', sort=False)
        tm.assert_index_equal(joined.index, pd.Index(lrange(4)))

Example #14

0

Show file

File: test_join.py Project: johnnychiuchiu/pandas

    def test_join_mixed_non_unique_index(self):
        # GH 12814, unorderable types in py3 with a non-unique index
        df1 = DataFrame({'a': [1, 2, 3, 4]}, index=[1, 2, 3, 'a'])
        df2 = DataFrame({'b': [5, 6, 7, 8]}, index=[1, 3, 3, 4])
        result = df1.join(df2)
        expected = DataFrame({'a': [1, 2, 3, 3, 4],
                              'b': [5, np.nan, 6, 7, np.nan]},
                             index=[1, 2, 3, 3, 'a'])
        tm.assert_frame_equal(result, expected)

        df3 = DataFrame({'a': [1, 2, 3, 4]}, index=[1, 2, 2, 'a'])
        df4 = DataFrame({'b': [5, 6, 7, 8]}, index=[1, 2, 3, 4])
        result = df3.join(df4)
        expected = DataFrame({'a': [1, 2, 3, 4], 'b': [5, 6, 6, np.nan]},
                             index=[1, 2, 2, 'a'])
        tm.assert_frame_equal(result, expected)

Example #15

0

Show file

File: models.py Project: nditech/elections

    def to_dataframe(self, selected_fields=None, excluded_fields=None):
        """Convert this queryset into a pandas DataFrame.

        :param selected_fields: optional field names passed to ``only()``
        :param excluded_fields: optional field names passed to ``exclude()``;
            ``DEFAULT_EXCLUDED_FIELDS`` is used when this is empty
        :returns: a DataFrame with one column per form field, the
            sub-document fields expanded into columns, and a
            ``registered_voters`` column; an empty frame is returned as is
        """
        from ..services import locations

        if excluded_fields:
            qs = self.exclude(*excluded_fields)
        else:
            qs = self.exclude(*self.DEFAULT_EXCLUDED_FIELDS)
        # NOTE(review): this rebuilds ``qs`` from ``self``, so the exclusion
        # applied just above is not carried over -- confirm this is intended.
        if selected_fields:
            qs = self.only(*selected_fields)

        df = DataFrame(list(qs.as_pymongo())).convert_objects(convert_numeric=True)
        if df.empty:
            return df

        # add fields with no values
        # ``fields`` holds the names of all fields in the first record's form
        # groups that are not yet columns of ``df``.
        fields = filter(
            lambda f: f not in df.columns,
            map(lambda field: field.name, [field for group in self.first().form.groups for field in group.fields]),
        )

        for field in fields:
            df[field] = Series(np.nan, index=df.index)

        # do cleanup of subdocument fields
        # Each sub-document column is removed, nulls are replaced by empty
        # dicts, and the resulting list of dicts is joined back as columns.
        for field in self.SUBDOCUMENT_FIELDS:
            temp = df.pop(field).tolist()
            temp2 = [i if not isnull(i) else {} for i in temp]
            df = df.join(DataFrame(temp2))

        rv_map = locations.registered_voters_map()

        # Locations missing from the map get 0.
        df["registered_voters"] = df.location.apply(lambda i: rv_map.get(i, 0))

        return df

Example #16

0

Show file

File: wrangling.py Project: dalejung/trtools

def foreach_dataframe(self, func, force_dict=False, *args, **kwargs):
    """
        Apply ``func`` to every frame in the container and combine the results.

        ``self`` is iterated with ``items()``; ``func(df, *args, **kwargs)``
        is called once per entry. With ``force_dict`` the raw results are
        returned wrapped in a ``PanelDict``. Otherwise each result is joined,
        under its key, onto a frame indexed by the union of all result
        indexes, and that frame is returned sorted by index.
    """
    d = {}
    for key, df in self.items():
        d[key] = func(df, *args, **kwargs)

    # Union of the index labels of every pandas result. Each such result is
    # also named after its key so the join below labels it correctly.
    labels = set()
    for key, result in d.items():
        if not isinstance(result, (DataFrame, Series)):
            continue
        result.name = key
        labels.update(result.index)

    if force_dict:
        return PanelDict(d)

    # A list is passed because a set is not a valid index argument; label
    # order does not matter here since the result is sorted below.
    res = DataFrame(None, index=list(labels))
    for key, result in d.items():
        res = res.join(result)

    # ``DataFrame.sort()`` is gone from pandas; ``sort_index()`` is the
    # equivalent of its default (sort by index) behaviour.
    return res.sort_index()

Example #17

0

Show file

File: test_timezones.py Project: techtommey/pandas

    def test_join_aware(self):
        """Joins and unions of time-zone-aware date indexes.

        Checks that adding a naive and a UTC-localized series raises, that an
        outer join of two frames in the same zone keeps that zone, and that
        the union of ranges in two different zones is reported as UTC.
        """
        rng = date_range('1/1/2011', periods=10, freq='H')
        ts = Series(np.random.randn(len(rng)), index=rng)

        ts_utc = ts.tz_localize('utc')

        # Mixing the naive and the localized series must fail either way round.
        self.assertRaises(Exception, ts.__add__, ts_utc)
        self.assertRaises(Exception, ts_utc.__add__, ts)

        # Two frames in the same zone but with different frequencies.
        test1 = DataFrame(np.zeros((6,3)),
                          index=date_range("2012-11-15 00:00:00", periods=6,
                                           freq="100L", tz="US/Central"))
        test2 = DataFrame(np.zeros((3,3)),
                          index=date_range("2012-11-15 00:00:00", periods=3,
                                           freq="250L", tz="US/Central"),
                          columns=range(3,6))

        result = test1.join(test2, how='outer')
        ex_index = test1.index.union(test2.index)

        # The joined index is the union of both indexes, still in US/Central.
        self.assertTrue(result.index.equals(ex_index))
        self.assertTrue(result.index.tz.zone == 'US/Central')

        # non-overlapping
        rng = date_range("2012-11-15 00:00:00", periods=6,
                         freq="H", tz="US/Central")

        rng2 = date_range("2012-11-15 12:00:00", periods=6,
                         freq="H", tz="US/Eastern")

        # Ranges in different zones: the union is expected to be in UTC.
        result = rng.union(rng2)
        self.assertTrue(result.tz.zone == 'UTC')

Example #18

0

Show file

File: grid_generate.py Project: rougeth/LQTAgridPy

 def saveGrid(self,output):
     """Write the grid to ``<output>.txt`` and its matrices to ``<output>.csv``.

     The text file receives ``self.output``. The CSV (separator ``;``) holds
     the Coulomb and LJ matrices side by side, one row per molecule.
     """
     # The context manager closes the file even if the write raises.
     with open(output+'.txt', "w") as arq:
         arq.write(self.output)
     dfCoulomb = DataFrame(self.coulombMatrix, columns = self.cCoulomb, index = self.molecules)
     dfLj = DataFrame(self.ljMatrix, columns = self.cLJ, index = self.molecules)
     df = dfCoulomb.join(dfLj)
     df.to_csv(output+'.csv', sep =';')

Example #19

0

Show file

File: test_join.py Project: johnnychiuchiu/pandas

    def test_join_index_mixed(self, join_type):
        """Index join of frames with different dtypes, in both directions,
        checked against ``_join_by_hand``."""
        # no overlapping blocks
        df1 = DataFrame(index=np.arange(10))
        df1['bool'] = True
        df1['string'] = 'foo'

        df2 = DataFrame(index=np.arange(5, 15))
        df2['int'] = 1
        df2['float'] = 1.

        # Same check with each frame taking a turn as the caller.
        for caller, other in ((df1, df2), (df2, df1)):
            joined = caller.join(other, how=join_type)
            expected = _join_by_hand(caller, other, how=join_type)
            assert_frame_equal(joined, expected)

Example #20

0

Show file

File: test_multi.py Project: DusanMilunovic/pandas

    def test_left_join_index_preserve_order(self):

        on_cols = ['k1', 'k2']
        left = DataFrame({'k1': [0, 1, 2] * 8,
                          'k2': ['foo', 'bar'] * 12,
                          'v': np.array(np.arange(24), dtype=np.int64)})

        index = MultiIndex.from_tuples([(2, 'bar'), (1, 'foo')])
        right = DataFrame({'v2': [5, 7]}, index=index)

        result = left.join(right, on=on_cols)

        expected = left.copy()
        expected['v2'] = np.nan
        expected.loc[(expected.k1 == 2) & (expected.k2 == 'bar'), 'v2'] = 5
        expected.loc[(expected.k1 == 1) & (expected.k2 == 'foo'), 'v2'] = 7

        tm.assert_frame_equal(result, expected)

        result.sort_values(on_cols, kind='mergesort', inplace=True)
        expected = left.join(right, on=on_cols, sort=True)

        tm.assert_frame_equal(result, expected)

        # test join with multi dtypes blocks
        left = DataFrame({'k1': [0, 1, 2] * 8,
                          'k2': ['foo', 'bar'] * 12,
                          'k3': np.array([0, 1, 2] * 8, dtype=np.float32),
                          'v': np.array(np.arange(24), dtype=np.int32)})

        index = MultiIndex.from_tuples([(2, 'bar'), (1, 'foo')])
        right = DataFrame({'v2': [5, 7]}, index=index)

        result = left.join(right, on=on_cols)

        expected = left.copy()
        expected['v2'] = np.nan
        expected.loc[(expected.k1 == 2) & (expected.k2 == 'bar'), 'v2'] = 5
        expected.loc[(expected.k1 == 1) & (expected.k2 == 'foo'), 'v2'] = 7

        tm.assert_frame_equal(result, expected)

        result = result.sort_values(on_cols, kind='mergesort')
        expected = left.join(right, on=on_cols, sort=True)

        tm.assert_frame_equal(result, expected)

Example #21

0

Show file

File: tsplotter.py Project: waldenven/tsdata

    def dataframe(self):
        """Outer-join every visible, non-string result of ``self.eval()``.

        Results are paired with the flags in ``self.hidden``; entries whose
        flag is truthy, and entries that are strings, are skipped.
        """
        tss = self.eval()
        df = DataFrame()
        # FIXME: should do something about potential for dupe names
        for ts, h in zip(tss, self.hidden):
            if h or isinstance(ts, str):
                continue
            df = df.join(ts, how='outer')
        return df

Example #22

0

Show file

File: test_multi.py Project: DusanMilunovic/pandas

    def test_left_join_index_multi_match(self):
        """Left join where one left key matches several rows on the right.

        Checks the unsorted and the ``sort=True`` result of ``join``, and
        that ``merge`` on the same data keeps the left frame's order (GH7331).
        """
        left = DataFrame([
            ['c', 0],
            ['b', 1],
            ['a', 2],
            ['b', 3]],
            columns=['tag', 'val'],
            index=[2, 0, 1, 3])

        # Right frame indexed by 'tag'; 'c' occurs four times, 'a' twice, and
        # 'b' never.
        right = (DataFrame([
            ['a', 'v'],
            ['c', 'w'],
            ['c', 'x'],
            ['d', 'y'],
            ['a', 'z'],
            ['c', 'r'],
            ['e', 'q'],
            ['c', 's']],
            columns=['tag', 'char'])
            .set_index('tag'))

        result = left.join(right, on='tag', how='left')

        # Each left row is repeated once per match and keeps its index label;
        # rows without a match get NaN.
        expected = DataFrame([
            ['c', 0, 'w'],
            ['c', 0, 'x'],
            ['c', 0, 'r'],
            ['c', 0, 's'],
            ['b', 1, nan],
            ['a', 2, 'v'],
            ['a', 2, 'z'],
            ['b', 3, nan]],
            columns=['tag', 'val', 'char'],
            index=[2, 2, 2, 2, 0, 1, 1, 3])

        tm.assert_frame_equal(result, expected)

        # With sort=True the result equals a stable sort of the above by key.
        result = left.join(right, on='tag', how='left', sort=True)
        expected2 = expected.sort_values('tag', kind='mergesort')

        tm.assert_frame_equal(result, expected2)

        # GH7331 - maintain left frame order in left merge
        result = merge(left, right.reset_index(), how='left', on='tag')
        # The merge result is compared against consecutive integer labels.
        expected.index = np.arange(len(expected))
        tm.assert_frame_equal(result, expected)

Example #23

0

Show file

File: test_join.py Project: johnnychiuchiu/pandas

 def test_join_on_series_buglet(self):
     # GH #638
     df = DataFrame({'a': [1, 1]})
     ds = Series([2], index=[1], name='b')
     result = df.join(ds, on='a')
     expected = DataFrame({'a': [1, 1],
                           'b': [2, 2]}, index=df.index)
     tm.assert_frame_equal(result, expected)

Example #24

0

Show file

File: transformation.py Project: WGierke/erosion_data_analytics

def encode_onehot(df: pd.DataFrame, cols):
    """Replace ``cols`` of ``df`` with ``DictVectorizer``-encoded columns.

    :param df: input frame; it is not modified
    :param cols: column label(s) to encode
    :returns: a new frame without ``cols`` and with one added column per
        feature produced by the vectorizer, aligned on ``df``'s index
    """
    vec = DictVectorizer()
    # ``orient`` is the supported keyword; the old ``outtype`` spelling is
    # rejected by current pandas.
    records = df[cols].to_dict(orient='records')
    vec_data = pd.DataFrame(vec.fit_transform(records).toarray())
    # Newer scikit-learn provides ``get_feature_names_out``; fall back to the
    # older ``get_feature_names`` where that is all that exists.
    feature_names = getattr(vec, 'get_feature_names_out', None) or vec.get_feature_names
    vec_data.columns = feature_names()
    vec_data.index = df.index

    df = df.drop(cols, axis=1)
    df = df.join(vec_data)
    return df

Example #25

0

Show file

File: test_combine_concat.py Project: brianholland/pandas

    def test_join_str_datetime(self):
        str_dates = ['20120209', '20120222']
        dt_dates = [datetime(2012, 2, 9), datetime(2012, 2, 22)]

        A = DataFrame(str_dates, index=lrange(2), columns=['aa'])
        C = DataFrame([[1, 2], [3, 4]], index=str_dates, columns=dt_dates)

        tst = A.join(C, on='aa')

        assert len(tst.columns) == 3

Example #26

0

Show file

File: test_perf.py Project: AbhijitBadve/pandas

def get_results_df(db, rev):
    """Takes a git commit hash and returns a Dataframe of benchmark results

    The result rows for ``rev`` are labelled with the column names of
    ``db._results``, given the benchmark ``name`` by joining on ``checksum``,
    and finally indexed by ``checksum``.
    """
    bench = DataFrame(db.get_benchmarks())
    # Build a real list of rows: on Python 3 ``map`` returns a lazy iterator,
    # which not every pandas version accepts as frame data.
    results = DataFrame([list(row) for row in db.get_rev_results(rev).values()])

    # Since vbench.db._reg_rev_results returns an unlabeled dict,
    # we have to break encapsulation a bit.
    results.columns = db._results.c.keys()
    results = results.join(bench['name'], on='checksum').set_index("checksum")
    return results

Example #27

0

Show file

File: modified0905_get_KETI_Motes.py Project: trivi9ri/2016KETI

def runnig_check():
    """Collect three frames from ``make_keti_data_to_df`` and join them.

    The frames for i = 0, 1, 2 are fetched in order with a two-second pause
    after each; the first is the base and the later ones are joined onto it.
    """
    result = DataFrame()
    for i in range(0, 3):
        fetched = make_keti_data_to_df(i)
        result = fetched if i == 0 else result.join(fetched)
        time.sleep(2)
    return result

Example #28

0

Show file

File: tank.py Project: cpcloud/span

    def _read_tsq(self, event_name):
        """Read the metadata (TSQ) file of a TDT Tank.

        Parameters
        ----------
        event_name : passed to ``name2num`` to obtain the numeric event name
            that selects the rows to keep

        Returns
        -------
        tsq : pandas.DataFrame
            Recording metadata (with an added ``shank`` column) for the rows
            whose ``name`` equals the numeric event name
        row : boolean mask over the full metadata marking those rows

        Raises
        ------
        AssertionError
            If no row matches ``event_name``.
        """
        # create the path name
        tsq_name = self.path + os.extsep + self.header_ext

        # read in the raw data as a numpy rec array and convert to DataFrame
        b = DataFrame(np.fromfile(tsq_name, dtype=self.tsq_dtype))

        # zero based indexing
        b.channel -= 1
        b.channel = b.channel.astype(f8)

        # -1s are invalid
        b.channel[b.channel == -1] = np.nan

        # map the stored type / format codes through the lookup tables
        b.type = EventTypes[b.type].reset_index(drop=True)
        b.format = DataTypes[b.format].reset_index(drop=True)

        # falsy (zero) timestamps and sampling rates are replaced by NaN
        b.timestamp[np.logical_not(b.timestamp)] = np.nan
        b.fs[np.logical_not(b.fs)] = np.nan

        # fragile subtraction (i.e., what if TDT changes this value?)
        b.size -= 10

        # create some new indices based on the electrode array
        srt = Indexer.sort('channel').reset_index(drop=True)
        shank = srt.shank[b.channel].reset_index(drop=True)

        tsq = b.join(shank)

        # convert the event_name to a number
        name = name2num(event_name)

        # get the row of the metadata where its value equals the name-number
        row = tsq.name == name

        # make sure there's at least one event
        assert row.any(), 'no event named %s in tank: %s' % (event_name,
                                                             self.path)

        # get all the metadata for those events
        tsq = tsq[row]

        # convert to integer where possible
        tsq.channel = tsq.channel.astype(int)
        tsq.shank = tsq.shank.astype(int)

        return tsq, row

Example #29

0

Show file

File: bike.py Project: kowalczewski/Bike

def plots_casRegTrends():
    """Plot average hourly counts of casual and registered users.

    Uses the module-level ``bike_data`` frame (its ``time``, ``casual`` and
    ``registered`` columns) and shows the figure with matplotlib.
    """
    hours = np.linspace(0, 23, 24)
    days_average = DataFrame({'Hour': hours})

    for label, column in (('Casual', 'casual'), ('Registered', 'registered')):
        # Select the one column before averaging: averaging the whole frame
        # first does needless work and fails if any other column is
        # non-numeric.
        mean_vec = [
            bike_data.loc[bike_data["time"] == hour, column].mean()
            for hour in hours
        ]
        days_average = days_average.join(DataFrame({label: mean_vec}))

    days_average.drop('Hour', axis=1).plot(figsize=(12, 6), linewidth=3, fontsize=16)
    plt.xlabel('Hour', fontsize=16)
    plt.ylabel('Average counts', fontsize=16)
    plt.legend(loc='best', fontsize=16)
    plt.show()

Example #30

0

Show file

File: test_join.py Project: johnnychiuchiu/pandas

 def test_join_non_unique_period_index(self):
     # GH #16871
     index = pd.period_range('2016-01-01', periods=16, freq='M')
     df = DataFrame([i for i in range(len(index))],
                    index=index, columns=['pnum'])
     df2 = concat([df, df])
     result = df.join(df2, how='inner', rsuffix='_df2')
     expected = DataFrame(
         np.tile(np.arange(16, dtype=np.int64).repeat(2).reshape(-1, 1), 2),
         columns=['pnum', 'pnum_df2'], index=df2.sort_index().index)
     tm.assert_frame_equal(result, expected)

Example #31

0

Show file

import copy

#scaler = MinMaxScaler(feature_range=(0, 1))
#scaler2 = MinMaxScaler(feature_range=(0, 1))
scale_X =df.loc[:,["Daily_data","Hourly_data","Monthly_data","Pre_year_data"]]
scale_Y =df.loc[:,["Label_year_data"]]
# The original fitted the single `scaler` object twice. Presumably `scaler`
# is a scikit-learn scaler whose `fit` returns the object itself (confirm);
# in that case both names pointed at one scaler fitted on Y only. The
# features now get their own fitted copy, while `scalery` is still `scaler`
# fitted on Y, exactly as before.
scalerX = copy.deepcopy(scaler).fit(scale_X)
scalery = scaler.fit(scale_Y)

scaled_X = scalerX.transform(scale_X)
scaled_X = DataFrame(scaled_X)
scaled_X.columns=["Daily_data","Hourly_data","Monthly_data","Pre_year_data"]
scaled_Y = scalery.transform(scale_Y)
scaled_Y = DataFrame(scaled_Y)
scaled_Y.columns=["Label_year_data"]

### adding sine and cosine encodings of the time columns
x = scaled_X.join(df.loc[:,["Hour","Month"]])
def encode(data, col, max_val):
    """Add ``<col>_sin`` and ``<col>_cos`` columns to ``data`` and return it.

    The value of ``col`` is mapped onto a circle with period ``max_val``.
    ``data`` is modified in place.
    """
    data[col + '_sin'] = np.sin(2 * np.pi * data[col]/max_val)
    data[col + '_cos'] = np.cos(2 * np.pi * data[col]/max_val)
    return data
#x = encode(x, 'Hour', 23)
x=encode(x,"Month",12)
x=x.drop(["Hour"],axis=1)
x=x.drop(["Month"],axis=1)
#train_x, test_x ,X_val= x[:(len(scaled_X)-n_val-n_test)], x[-n_test:], x[n_val:(n_test+n_val)]

# NOTE(review): the split below uses `scaled_X`, not the encoded frame `x`
# built above -- confirm which one is intended.
train_x, test_x ,X_val= scaled_X[:(len(scaled_X)-n_val-n_test)], scaled_X[-n_test:], scaled_X[n_val:(n_test+n_val)]
train_y, test_y,y_val = scaled_Y[:(len(scaled_X)-n_val-n_test)], scaled_Y[-n_test:],scaled_Y[n_val:(n_test+n_val)]


print(train_x.shape)

Example #32

0

Show file

def _extracting_coordinates(dataframe: pd.DataFrame) -> pd.DataFrame:
    expanded_cols = pd.DataFrame(dataframe['coordenadas'].values.tolist(),
                                 columns=['latitude', 'longitude'])

    return dataframe.join(expanded_cols).drop('coordenadas', axis=1)

Example #33

0

Show file

File: predict.py Project: Shumpei-Kikuta/TitanicApi

def generate_onehot_encoding(data: pd.DataFrame, column_name: str, drop=True):
    """Append one-hot (dummy) columns for ``column_name``.

    :param data: input frame; it is not modified
    :param column_name: column to encode with ``pd.get_dummies``
    :param drop: when true (the default) the encoded source column is
        removed from the result; when false it is kept
    :returns: a new frame with the dummy columns joined on
    """
    onehot_repr = pd.get_dummies(data[column_name])
    data = data.join(onehot_repr)
    # ``drop`` was previously accepted but ignored; the column was always
    # removed.
    if drop:
        data = data.drop(column_name, axis=1)
    return data

Example #34

0

Show file

File: data_merge.py Project: epicarts/python3_practice

                   columns=['event1', 'event2'])

lefth
righth
pd.merge(lefth, righth, left_on=['key1', 'key2'], right_index=True)

# Two frames with partly overlapping row labels and no shared column names.
left2 = DataFrame([[1, 2], [3, 4], [5, 6]],
                  index=['a', 'c', 'e'],
                  columns=['Ohio', 'Nevada'])
right2 = DataFrame([[7, 8], [9, 10], [11, 12], [13, 14]],
                   index=['b', 'c', 'd', 'e'],
                   columns=['Missouri', 'Alabma'])
right2
left2
# Outer merge on both indexes; the join() call on the next line is the
# shorter spelling of the same operation.
pd.merge(left2, right2, how='outer', left_index=True, right_index=True)
left2.join(right2, how='outer')  # join() combines objects whose columns do not overlap and whose indexes are identical or similar
# Join right1's index against left1's 'key' column.
left1.join(right1, on='key')
another = DataFrame([[7, 8], [9, 10], [11, 12], [16, 17]],
                    index=['a', 'c', 'e', 'f'],
                    columns=['New York', 'Oregon'])
another
# join() also accepts a list of frames, joined on their indexes.
left2.join([right2, another])
right2
left2
left2.join([right2, another], how='outer')
'''
합치기전에 고려해야 할 사항
1. 만약 연결하려는 두객체의 색인이 서로 다르다면, 교집합? 합집합 ?
2. 합쳐진 결과에서 합쳐지기전 객체의 데이터를 고려할 수 있음 ?
3. 어떤 축으로 연결할거임?
'''

Example #35

0

Show file

    def test_join_inner_multiindex(self):
        key1 = [
            "bar", "bar", "bar", "foo", "foo", "baz", "baz", "qux", "qux",
            "snap"
        ]
        key2 = [
            "two",
            "one",
            "three",
            "one",
            "two",
            "one",
            "two",
            "two",
            "three",
            "one",
        ]

        data = np.random.randn(len(key1))
        data = DataFrame({"key1": key1, "key2": key2, "data": data})

        index = MultiIndex(
            levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]],
            codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                   [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
            names=["first", "second"],
        )
        to_join = DataFrame(np.random.randn(10, 3),
                            index=index,
                            columns=["j_one", "j_two", "j_three"])

        joined = data.join(to_join, on=["key1", "key2"], how="inner")
        expected = merge(
            data,
            to_join.reset_index(),
            left_on=["key1", "key2"],
            right_on=["first", "second"],
            how="inner",
            sort=False,
        )

        expected2 = merge(
            to_join,
            data,
            right_on=["key1", "key2"],
            left_index=True,
            how="inner",
            sort=False,
        )
        tm.assert_frame_equal(joined, expected2.reindex_like(joined))

        expected2 = merge(
            to_join,
            data,
            right_on=["key1", "key2"],
            left_index=True,
            how="inner",
            sort=False,
        )

        expected = expected.drop(["first", "second"], axis=1)
        expected.index = joined.index

        assert joined.index.is_monotonic
        tm.assert_frame_equal(joined, expected)

Example #36

0

Show file

File: Sqft_Dataset_NY.py Project: aishvarya87/Python_project

# Materialise every row from the sqft cursor before closing it.
sqft_get_data = [xx for xx in sqft_cursor]
sqft_cursor.close()

# Getting city data in dataframe
city_data_for_join_df = DataFrame (city_get_data)
city_data_for_join_df.columns = city_field_names

# Second frame built from the same rows and column names as the one above.
city_df=DataFrame(city_get_data)
city_df.columns = city_field_names

# Getting city sqft data in dataframe
sqft_df = DataFrame(sqft_get_data)
sqft_df.columns = sqft_field_names

# joining city and city sqft data frame
# (sqft rows are looked up by CityCode for each city row)
joined_city_sqft=city_data_for_join_df.join(sqft_df.set_index('CityCode'), on='CityCode')

# Unpivoting the data: the id columns are kept and every other column
# becomes a (variable, value) row
master_melted_dataset_df=pandas.melt(joined_city_sqft, id_vars=["CityCode","CityName","Metro","County","State","PopulationRank"])

#Question3
# Mean, maximum and minimum of the melted "value" column.
print("Question 3")
full_average=master_melted_dataset_df["value"].mean()
print("Average of Price Sqft Dataset")
print(full_average, "\n")
print("Maximum of Price Sqft Dataset")
print(master_melted_dataset_df["value"].max(), "\n")
print("Minimum of Price Sqft Dataset")
print(master_melted_dataset_df["value"].min(), "\n")

#Question4

Example #37

0

Show file

    def _read_one_data(self, url, params):
        """Fetch and parse the historical price data for one symbol.

        Parameters
        ----------
        url : str
            URL template; ``url.format(symbol)`` gives the request URL.
        params : dict
            Request parameters.  Must contain a ``'symbol'`` entry, which is
            removed from the dict in place before the remaining entries are
            passed to ``self._get_response``.

        Returns
        -------
        DataFrame
            Price rows indexed by ``Date`` and sorted by that index, with
            the columns High, Low, Open, Close, Volume and Adj Close.
            Depending on ``self.ret_index``, ``self.adjust_price`` and
            ``self.get_actions`` the frame is further extended with
            ``Ret_Index``, passed through ``_adjust_prices``, and
            outer-joined with ``Dividends`` / ``Splits`` columns.

        Raises
        ------
        RemoteDataError
            If the embedded JSON payload cannot be located in the response
            text, or a key on the path to ``HistoricalPriceStore`` is
            missing from it.
        """
        symbol = params.pop('symbol')
        url = url.format(symbol)

        resp = self._get_response(url, params=params)
        ptrn = r'root\.App\.main = (.*?);\n}\(this\)\);'
        msg = 'No data fetched for symbol {} using {}'

        # re.search returns None when the page does not embed the payload;
        # report that as a data error rather than an AttributeError on None.
        match = re.search(ptrn, resp.text, re.DOTALL)
        if match is None:
            raise RemoteDataError(msg.format(symbol, self.__class__.__name__))
        try:
            j = json.loads(match.group(1))
            data = j['context']['dispatcher']['stores']['HistoricalPriceStore']
        except KeyError:
            raise RemoteDataError(msg.format(symbol, self.__class__.__name__))

        # price data
        prices = DataFrame(data['prices'])
        prices.columns = [col.capitalize() for col in prices.columns]
        # epoch seconds -> timestamp, with the time-of-day part dropped
        prices['Date'] = to_datetime(
            to_datetime(prices['Date'], unit='s').dt.date)

        # keep only the rows whose 'Data' entry is null
        # NOTE(review): presumably rows with a 'Data' payload are events
        # rather than quotes -- confirm against the remote format.
        if 'Data' in prices.columns:
            prices = prices[prices['Data'].isnull()]
        prices = prices[[
            'Date', 'High', 'Low', 'Open', 'Close', 'Volume', 'Adjclose'
        ]]
        prices = prices.rename(columns={'Adjclose': 'Adj Close'})

        prices = prices.set_index('Date')
        prices = prices.sort_index().dropna(how='all')

        if self.ret_index:
            prices['Ret_Index'] = \
                _calc_return_index(prices['Adj Close'])
        if self.adjust_price:
            prices = _adjust_prices(prices)

        # dividends & splits data
        if self.get_actions and data['eventsData']:

            actions = DataFrame(data['eventsData'])
            actions.columns = [col.capitalize() for col in actions.columns]
            actions['Date'] = to_datetime(
                to_datetime(actions['Date'], unit='s').dt.date)

            types = actions['Type'].unique()
            if 'DIVIDEND' in types:
                divs = actions[actions.Type == 'DIVIDEND'].copy()
                divs = divs[['Date', 'Amount']].reset_index(drop=True)
                divs = divs.set_index('Date')
                divs = divs.rename(columns={'Amount': 'Dividends'})
                prices = prices.join(divs, how='outer')

            if 'SPLIT' in types:
                splits = actions[actions.Type == 'SPLIT'].copy()
                # NOTE(security): eval() is applied to text taken from the
                # remote response.  Kept as-is to preserve behaviour; it
                # should be replaced by an explicit ratio parser once the
                # set of formats the remote side emits is confirmed.
                splits['SplitRatio'] = splits['Splitratio'].apply(
                    lambda x: eval(x))
                splits = splits.reset_index(drop=True)
                splits = splits.set_index('Date')
                splits['Splits'] = 1.0 / splits['SplitRatio']
                prices = prices.join(splits['Splits'], how='outer')

                if 'DIVIDEND' in types and not self.adjust_dividends:
                    # Adjust dividends to deal with splits: cumulative
                    # product of the split factors taken from the most
                    # recent date backwards (missing entries count as 1).
                    adj = prices['Splits'].sort_index(
                        ascending=False).fillna(1).cumprod()
                    adj = 1.0 / adj
                    prices['Dividends'] = prices['Dividends'] * adj

        return prices

Example #38

0

Show file

File: options.py Project: pmnyc/Source_Codes_Collected

def options_to_rates(options, t_min=1. / 12., n_min=6):
    """
    Extract implied risk-free rates and dividend yields from a
    standard European option quote file.

    Quotes are grouped by expiry date.  For each group a straight line is
    fitted by least squares to the call-minus-put mid premium as a function
    of strike; the interest rate is derived from the slope and the dividend
    yield from the intercept and the spot price.

    Groups are ignored when:
    - the time to maturity is < t_min (in fraction of years)
    - there are fewer than n_min call quotes or fewer than n_min put quotes

    Parameters
    ----------

    options: DataFrame
        Option quotes.  The columns named by ``nm.EXPIRY_DATE``,
        ``nm.TRADE_DATE``, ``nm.SPOT``, ``nm.OPTION_TYPE``, ``nm.STRIKE``,
        ``nm.PRICE_BID`` and ``nm.PRICE_ASK`` are read.
    t_min: float (default: 1 month)
        Minimum time to maturity in fraction of years
    n_min: int (default: 6)
        minimum number of quotes per maturity date

    Returns
    -------

    DataFrame
        ``ds.riskfree_dividend_template()`` reindexed on the retained expiry
        dates, with the ``nm.INTEREST_RATE`` and ``nm.DIVIDEND_YIELD``
        columns set to the implied values.
    """

    grouped = options.groupby(nm.EXPIRY_DATE)

    expiry_dates = []
    implied_interest_rates = []
    implied_dividend_yields = []

    for _, group in grouped:
        # trade date, expiry date and spot are read from the group's first row
        index = group.index

        trade_date = group[nm.TRADE_DATE][index[0]]
        expiry_date = group[nm.EXPIRY_DATE][index[0]]
        spot = group[nm.SPOT][index[0]]
        days_to_expiry = (expiry_date - trade_date).days
        time_to_maturity = days_to_expiry / 365.0

        # exclude groups with too short time to maturity
        if time_to_maturity < t_min:
            continue

        # extract the put and call quotes
        calls = group[group[nm.OPTION_TYPE] == nm.CALL_OPTION]
        puts = group[group[nm.OPTION_TYPE] == nm.PUT_OPTION]

        # exclude groups with too few data points
        if len(calls) < n_min or len(puts) < n_min:
            continue

        # mid premiums, indexed by strike
        call_premium = DataFrame(
            (calls[nm.PRICE_BID] + calls[nm.PRICE_ASK]) / 2.,
            columns=[CALL_PREMIUM])
        call_premium.index = np.array(calls[nm.STRIKE])

        put_premium = DataFrame((puts[nm.PRICE_BID] + puts[nm.PRICE_ASK]) / 2.,
                                columns=[PUT_PREMIUM])
        put_premium.index = np.array(puts[nm.STRIKE])

        # use 'inner' join because some strikes are not quoted for C and P
        all_quotes = call_premium.join(put_premium, how='inner')
        all_quotes[nm.STRIKE] = all_quotes.index
        all_quotes['C-P'] = all_quotes[CALL_PREMIUM] - all_quotes[PUT_PREMIUM]

        # least-squares fit of C-P = a_1 * strike + a_0; the column of ones
        # comes last in A, so the intercept is the last coefficient
        y = np.array(all_quotes['C-P'])
        x = np.array(all_quotes[nm.STRIKE])
        A = np.vstack([x, np.ones(len(x))]).T
        a_1, a_0 = np.linalg.lstsq(A, y)[0]

        # calculate implied interest rate and implied div. yield
        interest_rate = -np.log(-a_1) / time_to_maturity
        dividend_yield = np.log(spot / a_0) / time_to_maturity

        implied_interest_rates.append(interest_rate)
        implied_dividend_yields.append(dividend_yield)
        expiry_dates.append(expiry_date)

    rates = ds.riskfree_dividend_template().reindex(index=expiry_dates)
    rates[nm.INTEREST_RATE] = implied_interest_rates
    rates[nm.DIVIDEND_YIELD] = implied_dividend_yields

    return rates

Example #39

0

Show file

def convert_amenities(df: pd.DataFrame) -> pd.DataFrame:
    """Return ``df`` with the encoded amenity columns in place of ``amenities``.

    The frame returned by ``one_hot_encode_amenities(df)`` is joined onto
    ``df`` by index, after which the ``amenities`` column is dropped.
    """
    encoded = one_hot_encode_amenities(df)
    with_encoded = df.join(encoded)
    return with_encoded.drop(columns="amenities")

Example #40

0

Show file

File: test_simplified.py Project: miguelsimon/petutils

    XT,
    BarycenterPredictor,
    EMDLoss,
    RndMarginalPredictor,
    Simulator,
    X,
    Y,
)

# Single-row fixture frames shared by the tests in this module.

# one sensor with its x/y/z coordinates
positions = DataFrame({"sensor_id": [0], "x": [10.0], "y": [10.0], "z": [10.0]})

# one hit: event id, x/y/z coordinates and energy
hits = DataFrame({"event_id": [0], "x": [1.0], "y": [1.0], "z": [1.0], "energy": [1.0]})

# one charge reading linking a sensor to an event
waveforms = DataFrame({"sensor_id": [0], "event_id": [0], "charge": [20.0]})

# waveforms extended with the coordinates of the sensor that produced them:
# the 'sensor_id' column is matched against the positions frame's index
ext_waveforms = waveforms.join(positions.set_index("sensor_id"), on="sensor_id")


class Test(unittest.TestCase):
    """Smoke tests: the visible methods construct objects and print them."""

    def test_constructors(self):
        """Construct XT, Y and RndMarginalPredictor from the fixture frames."""
        print(XT(hits))
        print(Y(ext_waveforms))
        print(RndMarginalPredictor(hits))

    def test_simulator(self):
        """Draw one sample from a Simulator built on the fixture frames."""
        sim = Simulator(positions, hits, waveforms)
        xt, y = sim.sample()
        print(xt, y)

    def test_emd_loss(self):
        """Instantiate EMDLoss."""
        # NOTE(review): only the construction is visible here; the method
        # may have been cut short by the excerpt.
        loss = EMDLoss()

Example #41

0

Show file

File: test_join.py Project: PaulGureghian1/Pandas

class TestJoin(object):
    def setup_method(self, method):
        """Build the frames shared by the tests: df, df2, target, source."""
        # aggregate multiple columns
        frame = DataFrame({
            'key1': get_test_data(),
            'key2': get_test_data(),
            'data1': np.random.randn(N),
            'data2': np.random.randn(N)
        })

        # exclude a couple keys for fun
        self.df = frame[frame['key2'] > 1]

        # second frame is a fifth of the size of the first
        small = N // 5
        self.df2 = DataFrame({
            'key1': get_test_data(n=small),
            'key2': get_test_data(ngroups=NGROUPS // 2, n=small),
            'value': np.random.randn(small)
        })

        index, data = tm.getMixedTypeDict()
        self.target = DataFrame(data, index=index)

        # Join on string value: copies of columns A and D, indexed by C
        merged_columns = {'MergedA': data['A'], 'MergedD': data['D']}
        self.source = DataFrame(merged_columns, index=data['C'])

    def test_cython_left_outer_join(self):
        """Compare ``libjoin.left_outer_join`` indexers with hand-built ones."""
        left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
        right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64)
        max_group = 5

        ls, rs = libjoin.left_outer_join(left, right, max_group)

        # expected positions within the stably sorted inputs; -1 = no match
        exp_li = a_(
            [0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 8, 8, 9, 10])
        exp_ri = a_(
            [0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 4, 5, 4, 5, -1, -1])

        # translate sorted positions back to original row numbers
        exp_ls = left.argsort(kind='mergesort').take(exp_li)
        exp_ls[exp_li == -1] = -1

        exp_rs = right.argsort(kind='mergesort').take(exp_ri)
        exp_rs[exp_ri == -1] = -1

        for result, expected in ((ls, exp_ls), (rs, exp_rs)):
            tm.assert_numpy_array_equal(result, expected, check_dtype=False)

    def test_cython_right_outer_join(self):
        """Right outer join obtained by swapping the arguments of the left one."""
        left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
        right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64)
        max_group = 5

        # arguments and results are swapped relative to the left-join test
        rs, ls = libjoin.left_outer_join(right, left, max_group)

        # expected positions within the stably sorted inputs; -1 = no match
        exp_li = a_([
            0, 1, 2,                    # right key 0
            3, 4, 5, 3, 4, 5, 3, 4, 5,  # right key 1 (three rows)
            6, 7, 8, 6, 7, 8,           # right key 2 (two rows)
            -1                          # right key 4 is absent from left
        ])
        exp_ri = a_([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6])

        # translate sorted positions back to original row numbers
        exp_ls = left.argsort(kind='mergesort').take(exp_li)
        exp_ls[exp_li == -1] = -1

        exp_rs = right.argsort(kind='mergesort').take(exp_ri)
        exp_rs[exp_ri == -1] = -1

        for result, expected in ((ls, exp_ls), (rs, exp_rs)):
            tm.assert_numpy_array_equal(result, expected, check_dtype=False)

    def test_cython_inner_join(self):
        """Compare ``libjoin.inner_join`` indexers with hand-built ones."""
        left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
        right = a_([1, 1, 0, 4, 2, 2, 1, 4], dtype=np.int64)
        max_group = 5

        ls, rs = libjoin.inner_join(left, right, max_group)

        # expected positions within the stably sorted inputs
        exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 8, 8])
        exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 4, 5, 4, 5])

        # translate sorted positions back to original row numbers
        exp_ls = left.argsort(kind='mergesort').take(exp_li)
        exp_ls[exp_li == -1] = -1

        exp_rs = right.argsort(kind='mergesort').take(exp_ri)
        exp_rs[exp_ri == -1] = -1

        for result, expected in ((ls, exp_ls), (rs, exp_rs)):
            tm.assert_numpy_array_equal(result, expected, check_dtype=False)

    def test_left_outer_join(self):
        """Run ``_check_join(how='left')`` on merges of df and df2."""
        left, right = self.df, self.df2

        # NOTE(review): both merges run with merge's default ``how`` while
        # the check is told how='left' -- confirm this is intended.
        by_key2 = merge(left, right, on='key2')
        _check_join(left, right, by_key2, ['key2'], how='left')

        by_common = merge(left, right)
        _check_join(left, right, by_common, ['key1', 'key2'], how='left')

    def test_right_outer_join(self):
        """Run ``_check_join`` on right merges of df and df2."""
        left, right = self.df, self.df2

        by_key2 = merge(left, right, on='key2', how='right')
        _check_join(left, right, by_key2, ['key2'], how='right')

        by_common = merge(left, right, how='right')
        _check_join(left, right, by_common, ['key1', 'key2'], how='right')

    def test_full_outer_join(self):
        """Run ``_check_join`` on outer merges of df and df2."""
        left, right = self.df, self.df2

        by_key2 = merge(left, right, on='key2', how='outer')
        _check_join(left, right, by_key2, ['key2'], how='outer')

        by_common = merge(left, right, how='outer')
        _check_join(left, right, by_common, ['key1', 'key2'], how='outer')

    def test_inner_join(self):
        """Run ``_check_join`` on inner merges of df and df2."""
        left, right = self.df, self.df2

        by_key2 = merge(left, right, on='key2', how='inner')
        _check_join(left, right, by_key2, ['key2'], how='inner')

        by_common = merge(left, right, how='inner')
        _check_join(left, right, by_common, ['key1', 'key2'], how='inner')

    def test_handle_overlap(self):
        joined = merge(self.df, self.df2, on='key2', suffixes=['.foo', '.bar'])

        assert 'key1.foo' in joined
        assert 'key1.bar' in joined

    def test_handle_overlap_arbitrary_key(self):
        joined = merge(self.df,
                       self.df2,
                       left_on='key2',
                       right_on='key1',
                       suffixes=['.foo', '.bar'])
        assert 'key1.foo' in joined
        assert 'key2.bar' in joined

    def test_join_on(self):
        """Exercise ``DataFrame.join(other, on=<column>)`` in several setups."""
        target, source = self.target, self.source

        # source holds copies of target's A and D columns, indexed by C
        merged = target.join(source, on='C')
        for merged_name, target_name in (('MergedA', 'A'), ('MergedD', 'D')):
            tm.assert_series_equal(merged[merged_name],
                                   target[target_name],
                                   check_names=False)

        # join with duplicates (fix regression from DataFrame/Matrix merge)
        keys = ['a', 'a', 'b', 'b', 'c']
        df = DataFrame({'key': keys})
        df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c'])
        joined = df.join(df2, on='key')
        expected = DataFrame({'key': keys, 'value': [0, 0, 1, 1, 2]})
        assert_frame_equal(joined, expected)

        # Test when some are missing
        df_a = DataFrame([[1], [2], [3]],
                         index=['a', 'b', 'c'],
                         columns=['one'])
        df_b = DataFrame([['foo'], ['bar']], index=[1, 2], columns=['two'])
        df_c = DataFrame([[1], [2]], index=[1, 2], columns=['three'])
        joined = df_a.join(df_b, on='one').join(df_c, on='one')
        for column in ('two', 'three'):
            assert np.isnan(joined[column]['c'])

        # merge column not present
        with pytest.raises(KeyError, match="^'E'$"):
            target.join(source, on='E')

        # overlap
        source_copy = source.copy()
        source_copy['A'] = 0
        msg = ("You are trying to merge on float64 and object columns. If"
               " you wish to proceed you should use pd.concat")
        with pytest.raises(ValueError, match=msg):
            target.join(source_copy, on='A')

    def test_join_on_fails_with_different_right_index(self):
        """One ``left_on`` key against a two-level right index must raise."""
        left_data = {
            'a': np.random.choice(['m', 'f'], size=3),
            'b': np.random.randn(3)
        }
        df = DataFrame(left_data)

        right_data = {
            'a': np.random.choice(['m', 'f'], size=10),
            'b': np.random.randn(10)
        }
        df2 = DataFrame(right_data, index=tm.makeCustomIndex(10, 2))

        msg = (r'len\(left_on\) must equal the number of levels in the index'
               ' of "right"')
        with pytest.raises(ValueError, match=msg):
            merge(df, df2, left_on='a', right_index=True)

    def test_join_on_fails_with_different_left_index(self):
        """One ``right_on`` key against a two-level left index must raise."""
        left_data = {
            'a': np.random.choice(['m', 'f'], size=3),
            'b': np.random.randn(3)
        }
        df = DataFrame(left_data, index=tm.makeCustomIndex(3, 2))

        right_data = {
            'a': np.random.choice(['m', 'f'], size=10),
            'b': np.random.randn(10)
        }
        df2 = DataFrame(right_data)

        msg = (r'len\(right_on\) must equal the number of levels in the index'
               ' of "left"')
        with pytest.raises(ValueError, match=msg):
            merge(df, df2, right_on='b', left_index=True)

    def test_join_on_fails_with_different_column_counts(self):
        """``left_on`` and ``right_on`` of different lengths must raise."""
        left_data = {
            'a': np.random.choice(['m', 'f'], size=3),
            'b': np.random.randn(3)
        }
        df = DataFrame(left_data)

        right_data = {
            'a': np.random.choice(['m', 'f'], size=10),
            'b': np.random.randn(10)
        }
        df2 = DataFrame(right_data, index=tm.makeCustomIndex(10, 2))

        msg = r"len\(right_on\) must equal len\(left_on\)"
        with pytest.raises(ValueError, match=msg):
            merge(df, df2, right_on='a', left_on=['a', 'b'])

    @pytest.mark.parametrize("wrong_type", [2, 'str', None, np.array([0, 1])])
    def test_join_on_fails_with_wrong_object_type(self, wrong_type):
        """``merge`` raises TypeError when either operand is not a frame/series."""
        # GH12081 - original issue

        # GH21220 - merging of Series and DataFrame is now allowed
        # Edited test to remove the Series object from test parameters

        df = DataFrame({'a': [1, 1]})
        template = "Can only merge Series or DataFrame objects, a {} was passed"
        msg = template.format(str(type(wrong_type)))

        # the offending object is tried on the left, then on the right
        for left, right in ((wrong_type, df), (df, wrong_type)):
            with pytest.raises(TypeError, match=msg):
                merge(left, right, left_on='a', right_on='a')

    def test_join_on_pass_vector(self):
        """Joining on a popped column equals joining on its name, minus it."""
        target, source = self.target, self.source

        expected = target.join(source, on='C')
        del expected['C']

        # pop removes 'C' from the shared target frame and returns it
        key_values = target.pop('C')
        result = target.join(source, on=key_values)
        assert_frame_equal(result, expected)

    def test_join_with_len0(self):
        """Join against a zero-row version of ``source``."""
        # nothing to merge
        empty_source = self.source.reindex([])

        merged = self.target.join(empty_source, on='C')
        for col in self.source:
            assert col in merged
            assert merged[col].isna().all()

        # an inner join keeps the same columns but has no rows
        merged2 = self.target.join(empty_source, on='C', how='inner')
        tm.assert_index_equal(merged2.columns, merged.columns)
        assert len(merged2) == 0

    def test_join_on_inner(self):
        """Inner join on a column equals the default join minus unmatched rows."""
        df = DataFrame({'key': ['a', 'a', 'd', 'b', 'b', 'c']})
        df2 = DataFrame({'value': [0, 1]}, index=['a', 'b'])

        joined = df.join(df2, on='key', how='inner')

        # default join, then keep only the rows that found a match
        default_joined = df.join(df2, on='key')
        expected = default_joined[default_joined['value'].notna()]

        for column in ('key', 'value'):
            tm.assert_series_equal(joined[column],
                                   expected[column],
                                   check_dtype=False)
        tm.assert_index_equal(joined.index, expected.index)

    def test_join_on_singlekey_list(self):
        """``on=['key']`` gives the same result as ``on='key'``."""
        left = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']})
        right = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c'])

        # corner cases
        from_list = left.join(right, on=['key'])
        from_name = left.join(right, on='key')

        assert_frame_equal(from_list, from_name)

    def test_join_on_series(self):
        """Joining a Series equals joining the matching one-column frame."""
        target, source = self.target, self.source

        from_series = target.join(source['MergedA'], on='C')
        from_frame = target.join(source[['MergedA']], on='C')
        assert_frame_equal(from_series, from_frame)

    def test_join_on_series_buglet(self):
        """A one-element named Series is broadcast to every matching row."""
        # GH #638
        frame = DataFrame({'a': [1, 1]})
        series = Series([2], index=[1], name='b')

        result = frame.join(series, on='a')

        expected = DataFrame({'a': [1, 1], 'b': [2, 2]}, index=frame.index)
        tm.assert_frame_equal(result, expected)

    def test_join_index_mixed(self, join_type):
        """Index join of frames with disjoint dtypes, in both directions."""
        # no overlapping blocks
        df1 = DataFrame(index=np.arange(10))
        df1['bool'] = True
        df1['string'] = 'foo'

        df2 = DataFrame(index=np.arange(5, 15))
        df2['int'] = 1
        df2['float'] = 1.

        # df1 joined with df2 first, then the other way round
        for left, right in ((df1, df2), (df2, df1)):
            joined = left.join(right, how=join_type)
            expected = _join_by_hand(left, right, how=join_type)
            assert_frame_equal(joined, expected)

    def test_join_index_mixed_overlap(self):
        """Index join of mixed-dtype frames whose column names all collide."""
        columns = ['A', 'B', 'C', 'D']

        df1 = DataFrame({'A': 1., 'B': 2, 'C': 'foo', 'D': True},
                        index=np.arange(10),
                        columns=columns)
        assert df1['B'].dtype == np.int64
        assert df1['D'].dtype == np.bool_

        # same columns and values, every second index label only
        df2 = DataFrame({'A': 1., 'B': 2, 'C': 'foo', 'D': True},
                        index=np.arange(0, 10, 2),
                        columns=columns)

        # overlap
        joined = df1.join(df2, lsuffix='_one', rsuffix='_two')

        # rename the inputs to the suffixed names, then join by hand
        left_names = ['A_one', 'B_one', 'C_one', 'D_one']
        right_names = ['A_two', 'B_two', 'C_two', 'D_two']
        df1.columns = left_names
        df2.columns = right_names
        expected = _join_by_hand(df1, df2)
        assert_frame_equal(joined, expected)

    def test_join_empty_bug(self):
        # generated an exception in 0.4.3
        x = DataFrame()
        x.join(DataFrame([3], index=[0], columns=['A']), how='outer')

    def test_join_unconsolidated(self):
        """Joins involving a frame with a column added after construction."""
        # GH #331
        wide = DataFrame(randn(30, 2), columns=['a', 'b'])
        extra = Series(randn(30))
        wide['c'] = extra
        narrow = DataFrame(randn(30, 1), columns=['q'])

        # it works!
        wide.join(narrow)
        narrow.join(wide)

    def test_join_multiindex(self):
        """Outer join of two frames whose MultiIndexes partly overlap."""
        names = ['first', 'second']
        index1 = MultiIndex.from_arrays(
            [['a', 'a', 'a', 'b', 'b', 'b'], [1, 2, 3, 1, 2, 3]],
            names=names)
        index2 = MultiIndex.from_arrays(
            [['b', 'b', 'b', 'c', 'c', 'c'], [1, 2, 3, 1, 2, 3]],
            names=names)

        df1 = DataFrame(data=np.random.randn(6),
                        index=index1,
                        columns=['var X'])
        df2 = DataFrame(data=np.random.randn(6),
                        index=index2,
                        columns=['var Y'])

        def build_expected(left, right):
            # reindex both sides on the union of the index tuples, then join
            ex_index = Index(index1.values).union(Index(index2.values))
            expected = left.reindex(ex_index).join(right.reindex(ex_index))
            expected.index.names = index1.names
            return expected

        # inputs sorted on the first level
        df1 = df1.sort_index(level=0)
        df2 = df2.sort_index(level=0)

        joined = df1.join(df2, how='outer')
        assert_frame_equal(joined, build_expected(df1, df2))
        assert joined.index.names == index1.names

        # inputs sorted on the second level; the result is re-sorted on the
        # first level before comparing
        df1 = df1.sort_index(level=1)
        df2 = df2.sort_index(level=1)

        joined = df1.join(df2, how='outer').sort_index(level=0)
        assert_frame_equal(joined, build_expected(df1, df2))
        assert joined.index.names == index1.names

    def test_join_inner_multiindex(self):
        """Inner join of two key columns against a two-level MultiIndex.

        The result of ``DataFrame.join(..., on=[...], how='inner')`` is
        compared with two equivalent ``merge`` calls: one with the frames in
        the opposite order (``left_index=True``) and one against the
        right-hand frame with its index reset to columns.
        """
        key1 = [
            'bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux', 'qux',
            'snap'
        ]
        key2 = [
            'two', 'one', 'three', 'one', 'two', 'one', 'two', 'two', 'three',
            'one'
        ]

        data = np.random.randn(len(key1))
        data = DataFrame({'key1': key1, 'key2': key2, 'data': data})

        index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                                   ['one', 'two', 'three']],
                           codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                                  [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                           names=['first', 'second'])
        to_join = DataFrame(np.random.randn(10, 3),
                            index=index,
                            columns=['j_one', 'j_two', 'j_three'])

        joined = data.join(to_join, on=['key1', 'key2'], how='inner')
        expected = merge(data,
                         to_join.reset_index(),
                         left_on=['key1', 'key2'],
                         right_on=['first', 'second'],
                         how='inner',
                         sort=False)

        # same join with the operands swapped; aligned to `joined` before
        # comparing because row and column order differ
        expected2 = merge(to_join,
                          data,
                          right_on=['key1', 'key2'],
                          left_index=True,
                          how='inner',
                          sort=False)
        assert_frame_equal(joined, expected2.reindex_like(joined))

        # the reset-index merge keeps the level columns and gets a fresh
        # index; drop the former and adopt the joined index to compare
        expected = expected.drop(['first', 'second'], axis=1)
        expected.index = joined.index

        # `is_monotonic` is a deprecated alias of `is_monotonic_increasing`
        assert joined.index.is_monotonic_increasing
        assert_frame_equal(joined, expected)

    def test_join_hierarchical_mixed(self):
        """Merging two-level columns with flat columns warns and keeps both."""
        # GH 2024
        base = DataFrame([(1, 2, 3), (4, 5, 6)], columns=['a', 'b', 'c'])
        aggregated = base.groupby(['a']).agg({'b': [np.mean, np.sum]})

        flat = DataFrame([(1, 2, 3), (7, 10, 6)], columns=['a', 'b', 'd'])
        flat = flat.set_index('a')

        # GH 9455, 12219
        with tm.assert_produces_warning(UserWarning):
            result = merge(aggregated, flat, left_index=True, right_index=True)

        for label in (('b', 'mean'), 'b'):
            assert label in result

    def test_join_float64_float32(self):
        """Column dtypes are preserved by ``join`` and by ``merge``."""
        wide = DataFrame(randn(10, 2), columns=['a', 'b'], dtype=np.float64)
        narrow = DataFrame(randn(10, 1), columns=['c'], dtype=np.float32)
        joined = wide.join(narrow)
        for column, dtype in (('a', 'float64'), ('b', 'float64'),
                              ('c', 'float32')):
            assert joined.dtypes[column] == dtype

        ints = np.random.randint(0, 5, 100).astype('int64')
        doubles = np.random.random(100).astype('float64')
        singles = np.random.random(100).astype('float32')
        df = DataFrame({'a': ints, 'b': doubles, 'c': singles})
        xpdf = DataFrame({'a': ints, 'b': doubles, 'c': singles})
        s = DataFrame(np.random.random(5).astype('float32'), columns=['md'])

        rs = df.merge(s, left_on='a', right_index=True)
        for column, dtype in (('a', 'int64'), ('b', 'float64'),
                              ('c', 'float32'), ('md', 'float32')):
            assert rs.dtypes[column] == dtype

        # an identically built second frame gives an identical merge
        xp = xpdf.merge(s, left_on='a', right_index=True)
        assert_frame_equal(rs, xp)

    def test_join_many_non_unique_index(self):
        """Joining a list of frames on non-unique indexes matches chained merges."""
        keys = ['a', 'b']

        # --- outer join ---
        df1 = DataFrame({"a": [1, 1], "b": [1, 1], "c": [10, 20]})
        df2 = DataFrame({"a": [1, 1], "b": [1, 2], "d": [100, 200]})
        df3 = DataFrame({"a": [1, 1], "b": [1, 2], "e": [1000, 2000]})
        idf1, idf2, idf3 = [frame.set_index(keys)
                            for frame in (df1, df2, df3)]

        result = idf1.join([idf2, idf3], how='outer').reset_index()

        first_two = merge(df1, df2, on=keys, how='outer')
        expected = merge(first_two, df3, on=keys, how='outer')
        expected = expected[result.columns]
        for key in keys:
            expected[key] = expected[key].astype('int64')
        assert_frame_equal(result, expected)

        # --- inner join ---
        df1 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 1], "c": [10, 20, 30]})
        df2 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "d": [100, 200, 300]})
        df3 = DataFrame({
            "a": [1, 1, 1],
            "b": [1, 1, 2],
            "e": [1000, 2000, 3000]
        })
        idf1, idf2, idf3 = [frame.set_index(keys)
                            for frame in (df1, df2, df3)]

        result = idf1.join([idf2, idf3], how='inner').reset_index()

        first_two = merge(df1, df2, on=keys, how='inner')
        expected = merge(first_two, df3, on=keys, how='inner')

        assert_frame_equal(result, expected.loc[:, result.columns])

        # GH 11519
        df = DataFrame({
            'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
            'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
            'C': np.random.randn(8),
            'D': np.random.randn(8)
        })
        # every label 0..7 appears twice in the series index
        s = Series(np.repeat(np.arange(8), 2),
                   index=np.repeat(np.arange(8), 2),
                   name='TEST')

        # all four join types give the same frame here
        inner = df.join(s, how='inner')
        others = [df.join(s, how=how) for how in ('outer', 'left', 'right')]
        for other in others:
            assert_frame_equal(inner, other)

    def test_join_sort(self):
        """``sort=True`` orders the result by join key; ``sort=False`` keeps order."""
        left = DataFrame({
            'key': ['foo', 'bar', 'baz', 'foo'],
            'value': [1, 2, 3, 4]
        })
        right = DataFrame({'value2': ['a', 'b', 'c']},
                          index=['bar', 'baz', 'foo'])

        sorted_join = left.join(right, on='key', sort=True)
        expected = DataFrame(
            {
                'key': ['bar', 'baz', 'foo', 'foo'],
                'value': [2, 3, 1, 4],
                'value2': ['a', 'b', 'c', 'c']
            },
            index=[1, 2, 0, 3])
        assert_frame_equal(sorted_join, expected)

        # smoke test
        unsorted_join = left.join(right, on='key', sort=False)
        tm.assert_index_equal(unsorted_join.index, pd.Index(list(range(4))))

    def test_join_mixed_non_unique_index(self):
        """Join with a mixed int/str index when one side has duplicate labels."""
        # GH 12814, unorderable types in py3 with a non-unique index

        # duplicates on the right-hand side
        df1 = DataFrame({'a': [1, 2, 3, 4]}, index=[1, 2, 3, 'a'])
        df2 = DataFrame({'b': [5, 6, 7, 8]}, index=[1, 3, 3, 4])
        expected = DataFrame(
            {'a': [1, 2, 3, 3, 4], 'b': [5, np.nan, 6, 7, np.nan]},
            index=[1, 2, 3, 3, 'a'])
        tm.assert_frame_equal(df1.join(df2), expected)

        # duplicates on the left-hand side
        df3 = DataFrame({'a': [1, 2, 3, 4]}, index=[1, 2, 2, 'a'])
        df4 = DataFrame({'b': [5, 6, 7, 8]}, index=[1, 2, 3, 4])
        expected = DataFrame(
            {'a': [1, 2, 3, 4], 'b': [5, 6, 6, np.nan]},
            index=[1, 2, 2, 'a'])
        tm.assert_frame_equal(df3.join(df4), expected)

    def test_join_non_unique_period_index(self):
        """Inner join against a frame whose PeriodIndex repeats every label."""
        # GH #16871
        index = pd.period_range('2016-01-01', periods=16, freq='M')
        df = DataFrame(list(range(len(index))),
                       index=index,
                       columns=['pnum'])
        df2 = concat([df, df])

        result = df.join(df2, how='inner', rsuffix='_df2')

        # each of 0..15 twice in a single column, tiled into two columns
        doubled = np.arange(16, dtype=np.int64).repeat(2).reshape(-1, 1)
        expected = DataFrame(np.tile(doubled, 2),
                             columns=['pnum', 'pnum_df2'],
                             index=df2.sort_index().index)
        tm.assert_frame_equal(result, expected)

    def test_mixed_type_join_with_suffix(self):
        """Joining group means with group counts using ``rsuffix`` must not raise."""
        # GH #916
        frame = DataFrame(np.random.randn(20, 6),
                          columns=['a', 'b', 'c', 'd', 'e', 'f'])
        frame.insert(0, 'id', 0)
        frame.insert(5, 'dt', 'foo')

        by_id = frame.groupby('id')
        means = by_id.mean()
        counts = by_id.count()

        # it works!
        means.join(counts, rsuffix='_right')

    def test_join_many(self):
        """Join a frame with a list of frames on the index."""
        df = DataFrame(np.random.randn(10, 6), columns=list('abcdef'))

        # identical indexes: the pieces reassemble the original frame
        pieces = [df[['a', 'b']], df[['c', 'd']], df[['e', 'f']]]
        joined = pieces[0].join(pieces[1:])
        tm.assert_frame_equal(joined, df)

        # pieces covering different row ranges
        pieces = [
            df[['a', 'b']][:-2], df[['c', 'd']][2:], df[['e', 'f']][1:9]
        ]
        head, rest = pieces[0], pieces[1:]

        def _check_diff_index(df_list, result, exp_index):
            # joining the pieces after reindexing them must give `result`
            aligned = [frame.reindex(exp_index) for frame in df_list]
            expected = aligned[0].join(aligned[1:])
            tm.assert_frame_equal(result, expected)

        # different join types, each with the index the result should have
        cases = [
            ({'how': 'outer'}, df.index),
            ({}, head.index),
            ({'how': 'inner'}, df.index[2:8]),
        ]
        for join_kwargs, exp_index in cases:
            joined = head.join(rest, **join_kwargs)
            _check_diff_index(pieces, joined, exp_index)

        msg = "Joining multiple DataFrames only supported for joining on index"
        with pytest.raises(ValueError, match=msg):
            head.join(rest, on='a')

    def test_join_many_mixed(self):
        df = DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D'])
        df['key'] = ['foo', 'bar'] * 4
        df1 = df.loc[:, ['A', 'B']]
        df2 = df.loc[:, ['C', 'D']]
        df3 = df.loc[:, ['key']]

        result = df1.join([df2, df3])
        assert_frame_equal(result, df)

    def test_join_dups(self):
        # joining dups
        floats = DataFrame(np.random.randn(10, 4),
                           columns=['A', 'A', 'B', 'B'])
        ints = DataFrame(np.random.randint(0, 10, size=20).reshape(10, 2),
                         columns=['A', 'C'])
        df = concat([floats, ints], axis=1)

        # Self-join: after relabelling, the result is df side by side twice.
        expected = concat([df, df], axis=1)
        result = df.join(df, rsuffix='_2')
        result.columns = expected.columns
        assert_frame_equal(result, expected)

        # GH 4975, invalid join on dups
        w, x, y, z = [
            DataFrame(np.random.randn(4, 2), columns=["x", "y"])
            for _ in range(4)
        ]

        dta = x.merge(y, left_index=True, right_index=True)
        dta = dta.merge(z, left_index=True, right_index=True, how="outer")
        dta = dta.merge(w, left_index=True, right_index=True)

        expected = concat([x, y, z, w], axis=1)
        expected.columns = ['x_x', 'y_x', 'x_y', 'y_y'] * 2
        assert_frame_equal(dta, expected)

    def test_join_multi_to_multi(self, join_type):
        # GH 20475
        left_idx = MultiIndex.from_product(
            [list('abc'), list('xy'), [1, 2]], names=['abc', 'xy', 'num'])
        left = DataFrame({'v1': range(12)}, index=left_idx)

        right_idx = MultiIndex.from_product(
            [list('abc'), list('xy')], names=['abc', 'xy'])
        right = DataFrame({'v2': [100 * i for i in range(1, 7)]},
                          index=right_idx)

        result = left.join(right, on=['abc', 'xy'], how=join_type)

        # The same join spelled as a column merge, then re-indexed.
        merged = left.reset_index().merge(right.reset_index(),
                                          on=['abc', 'xy'],
                                          how=join_type)
        expected = merged.set_index(['abc', 'xy', 'num'])
        assert_frame_equal(expected, result)

        # The number of `on` keys must match the levels of the other index.
        msg = (r'len\(left_on\) must equal the number of levels in the index'
               ' of "right"')
        with pytest.raises(ValueError, match=msg):
            left.join(right, on='xy', how=join_type)

        with pytest.raises(ValueError, match=msg):
            right.join(left, on=['abc', 'xy'], how=join_type)

    def test_join_on_tz_aware_datetimeindex(self):
        # GH 23931
        df1 = pd.DataFrame({
            'date':
            pd.date_range(start='2018-01-01', periods=5, tz='America/Chicago'),
            'vals':
            list('abcde')
        })

        df2 = pd.DataFrame({
            'date':
            pd.date_range(start='2018-01-03', periods=5, tz='America/Chicago'),
            'vals_2':
            list('tuvwx')
        })
        result = df1.join(df2.set_index('date'), on='date')
        expected = df1.copy()
        expected['vals_2'] = pd.Series([np.nan] * len(expected), dtype=object)
        assert_frame_equal(result, expected)

Example #42

0

Show file

class TestJoin(tm.TestCase):
    def setUp(self):
        # aggregate multiple columns
        left = DataFrame({
            'key1': get_test_data(),
            'key2': get_test_data(),
            'data1': np.random.randn(N),
            'data2': np.random.randn(N)
        })

        # exclude a couple keys for fun
        self.df = left[left['key2'] > 1]

        # A smaller right-hand frame, one fifth the size of the left one.
        small_n = N // 5
        self.df2 = DataFrame({
            'key1': get_test_data(n=small_n),
            'key2': get_test_data(ngroups=NGROUPS // 2, n=small_n),
            'value': np.random.randn(small_n)
        })

        index, data = tm.getMixedTypeDict()
        self.target = DataFrame(data, index=index)

        # Join on string value: `source` is indexed by the 'C' values.
        merged_cols = {'MergedA': data['A'], 'MergedD': data['D']}
        self.source = DataFrame(merged_cols, index=data['C'])

    def test_cython_left_outer_join(self):
        # Check the indexers returned by _join.left_outer_join against
        # hand-written expectations for two small integer key arrays.
        left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
        right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64)
        max_group = 5

        ls, rs = _join.left_outer_join(left, right, max_group)

        # Stable sort orders of each side; the expected indexers below are
        # positions into these orders.
        exp_ls = left.argsort(kind='mergesort')
        exp_rs = right.argsort(kind='mergesort')

        # -1 in exp_ri marks a left row that has no partner on the right.
        exp_li = a_(
            [0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 8, 8, 9, 10])
        exp_ri = a_(
            [0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 4, 5, 4, 5, -1, -1])

        # take(-1) would pick the last element, so re-stamp the missing
        # markers with -1 after translating positions to original row numbers.
        exp_ls = exp_ls.take(exp_li)
        exp_ls[exp_li == -1] = -1

        exp_rs = exp_rs.take(exp_ri)
        exp_rs[exp_ri == -1] = -1

        self.assert_numpy_array_equal(ls, exp_ls, check_dtype=False)
        self.assert_numpy_array_equal(rs, exp_rs, check_dtype=False)

    def test_cython_right_outer_join(self):
        # A right outer join is exercised by calling left_outer_join with the
        # two sides swapped and unpacking the results in swapped order.
        left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
        right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64)
        max_group = 5

        rs, ls = _join.left_outer_join(right, left, max_group)

        # Stable sort orders of each side; the expected indexers below are
        # positions into these orders.
        exp_ls = left.argsort(kind='mergesort')
        exp_rs = right.argsort(kind='mergesort')

        # The column comments inside the literal name the join key each run
        # of entries belongs to; the trailing -1 pairs with the right-hand
        # key that has no partner on the left.
        #            0        1        1        1
        exp_li = a_([
            0,
            1,
            2,
            3,
            4,
            5,
            3,
            4,
            5,
            3,
            4,
            5,
            #            2        2        4
            6,
            7,
            8,
            6,
            7,
            8,
            -1
        ])
        exp_ri = a_([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6])

        # take(-1) would pick the last element, so re-stamp the missing
        # markers with -1 after translating positions to original row numbers.
        exp_ls = exp_ls.take(exp_li)
        exp_ls[exp_li == -1] = -1

        exp_rs = exp_rs.take(exp_ri)
        exp_rs[exp_ri == -1] = -1

        self.assert_numpy_array_equal(ls, exp_ls, check_dtype=False)
        self.assert_numpy_array_equal(rs, exp_rs, check_dtype=False)

    def test_cython_inner_join(self):
        # Check the indexers returned by _join.inner_join against
        # hand-written expectations; no -1 entries are expected here.
        left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
        right = a_([1, 1, 0, 4, 2, 2, 1, 4], dtype=np.int64)
        max_group = 5

        ls, rs = _join.inner_join(left, right, max_group)

        # Stable sort orders of each side; the expected indexers below are
        # positions into these orders.
        exp_ls = left.argsort(kind='mergesort')
        exp_rs = right.argsort(kind='mergesort')

        exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 8, 8])
        exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 4, 5, 4, 5])

        # Same post-processing as the outer-join tests; the -1 masks are
        # no-ops because neither expected indexer contains -1.
        exp_ls = exp_ls.take(exp_li)
        exp_ls[exp_li == -1] = -1

        exp_rs = exp_rs.take(exp_ri)
        exp_rs[exp_ri == -1] = -1

        self.assert_numpy_array_equal(ls, exp_ls, check_dtype=False)
        self.assert_numpy_array_equal(rs, exp_rs, check_dtype=False)

    def test_left_outer_join(self):
        # Request a left join explicitly: merge() defaults to how='inner',
        # so without it the frames verified here as how='left' were inner
        # joins. The right/outer/inner siblings all pass `how` as well.
        joined_key2 = merge(self.df, self.df2, on='key2', how='left')
        _check_join(self.df, self.df2, joined_key2, ['key2'], how='left')

        # No `on`: the merge uses the default join keys.
        joined_both = merge(self.df, self.df2, how='left')
        _check_join(self.df,
                    self.df2,
                    joined_both, ['key1', 'key2'],
                    how='left')

    def test_right_outer_join(self):
        # Right join on a single key, then on the default join keys.
        how = 'right'
        by_key2 = merge(self.df, self.df2, on='key2', how=how)
        _check_join(self.df, self.df2, by_key2, ['key2'], how=how)

        by_default = merge(self.df, self.df2, how=how)
        _check_join(self.df, self.df2, by_default, ['key1', 'key2'], how=how)

    def test_full_outer_join(self):
        # Outer join on a single key, then on the default join keys.
        how = 'outer'
        by_key2 = merge(self.df, self.df2, on='key2', how=how)
        _check_join(self.df, self.df2, by_key2, ['key2'], how=how)

        by_default = merge(self.df, self.df2, how=how)
        _check_join(self.df, self.df2, by_default, ['key1', 'key2'], how=how)

    def test_inner_join(self):
        # Inner join on a single key, then on the default join keys.
        how = 'inner'
        by_key2 = merge(self.df, self.df2, on='key2', how=how)
        _check_join(self.df, self.df2, by_key2, ['key2'], how=how)

        by_default = merge(self.df, self.df2, how=how)
        _check_join(self.df, self.df2, by_default, ['key1', 'key2'], how=how)

    def test_handle_overlap(self):
        # The overlapping non-key column 'key1' picks up both suffixes.
        suffixes = ['.foo', '.bar']
        joined = merge(self.df, self.df2, on='key2', suffixes=suffixes)

        for name in ('key1.foo', 'key1.bar'):
            self.assertIn(name, joined)

    def test_handle_overlap_arbitrary_key(self):
        # Differently named keys on each side: left 'key2' against right
        # 'key1', with suffixes for the columns that collide.
        merge_kwargs = dict(left_on='key2',
                            right_on='key1',
                            suffixes=['.foo', '.bar'])
        joined = merge(self.df, self.df2, **merge_kwargs)

        self.assertIn('key1.foo', joined)
        self.assertIn('key2.bar', joined)

    def test_join_on(self):
        target, source = self.target, self.source

        # Joining on column 'C' brings over source's copies of A and D.
        merged = target.join(source, on='C')
        for merged_name, target_name in [('MergedA', 'A'), ('MergedD', 'D')]:
            self.assert_series_equal(merged[merged_name],
                                     target[target_name],
                                     check_names=False)

        # join with duplicates (fix regression from DataFrame/Matrix merge)
        keys = ['a', 'a', 'b', 'b', 'c']
        df = DataFrame({'key': keys})
        df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c'])
        joined = df.join(df2, on='key')
        expected = DataFrame({'key': keys, 'value': [0, 0, 1, 1, 2]})
        assert_frame_equal(joined, expected)

        # Test when some are missing
        df_a = DataFrame([[1], [2], [3]],
                         index=['a', 'b', 'c'],
                         columns=['one'])
        df_b = DataFrame([['foo'], ['bar']], index=[1, 2], columns=['two'])
        df_c = DataFrame([[1], [2]], index=[1, 2], columns=['three'])
        joined = df_a.join(df_b, on='one').join(df_c, on='one')
        for column in ('two', 'three'):
            self.assertTrue(np.isnan(joined[column]['c']))

        # merge column not present
        self.assertRaises(KeyError, target.join, source, on='E')

        # overlap
        source_copy = source.copy()
        source_copy['A'] = 0
        self.assertRaises(ValueError, target.join, source_copy, on='A')

    def test_join_on_fails_with_different_right_index(self):
        # A single left key merged against the index of df2, which is built
        # by tm.makeCustomIndex(10, 2) (presumably a two-level index).
        df = DataFrame({
            'a': np.random.choice(['m', 'f'], size=3),
            'b': np.random.randn(3)
        })
        df2 = DataFrame(
            {
                'a': np.random.choice(['m', 'f'], size=10),
                'b': np.random.randn(10)
            },
            index=tm.makeCustomIndex(10, 2))

        # The frames are built outside the context manager so that only the
        # merge call itself can satisfy the expected ValueError.
        with tm.assertRaises(ValueError):
            merge(df, df2, left_on='a', right_index=True)

    def test_join_on_fails_with_different_left_index(self):
        # The custom index must have as many entries as the data (3).
        # With 10 entries the DataFrame constructor itself raised the
        # ValueError, so the merge under test was never reached.
        df = DataFrame(
            {
                'a': np.random.choice(['m', 'f'], size=3),
                'b': np.random.randn(3)
            },
            index=tm.makeCustomIndex(3, 2))
        df2 = DataFrame({
            'a': np.random.choice(['m', 'f'], size=10),
            'b': np.random.randn(10)
        })

        # Only the merge call may satisfy the expected ValueError.
        with tm.assertRaises(ValueError):
            merge(df, df2, right_on='b', left_index=True)

    def test_join_on_fails_with_different_column_counts(self):
        # Two left keys against one right key must be rejected.
        df = DataFrame({
            'a': np.random.choice(['m', 'f'], size=3),
            'b': np.random.randn(3)
        })
        df2 = DataFrame(
            {
                'a': np.random.choice(['m', 'f'], size=10),
                'b': np.random.randn(10)
            },
            index=tm.makeCustomIndex(10, 2))

        # The frames are built outside the context manager so that only the
        # merge call itself can satisfy the expected ValueError.
        with tm.assertRaises(ValueError):
            merge(df, df2, right_on='a', left_on=['a', 'b'])

    def test_join_on_fails_with_wrong_object_type(self):
        # GH12081
        non_frames = [Series([0, 1]), 2, 'str', None, np.array([0, 1])]
        frame = DataFrame({'a': [1, 1]})

        for bad in non_frames:
            # The error message is matched against the offending type's repr.
            type_repr = str(type(bad))
            with tm.assertRaisesRegexp(ValueError, type_repr):
                merge(bad, frame, left_on='a', right_on='a')
            with tm.assertRaisesRegexp(ValueError, type_repr):
                merge(frame, bad, left_on='a', right_on='a')

    def test_join_on_pass_vector(self):
        target, source = self.target, self.source

        expected = target.join(source, on='C')
        del expected['C']

        # Pass the key values themselves instead of a column name.
        key_values = target.pop('C')
        result = target.join(source, on=key_values)
        assert_frame_equal(result, expected)

    def test_join_with_len0(self):
        # nothing to merge
        empty_source = self.source.reindex([])

        merged = self.target.join(empty_source, on='C')
        for col in self.source:
            self.assertIn(col, merged)
            self.assertTrue(merged[col].isnull().all())

        # An inner join keeps the same columns but no rows.
        merged2 = self.target.join(empty_source, on='C', how='inner')
        self.assert_index_equal(merged2.columns, merged.columns)
        self.assertEqual(len(merged2), 0)

    def test_join_on_inner(self):
        left = DataFrame({'key': ['a', 'a', 'd', 'b', 'b', 'c']})
        right = DataFrame({'value': [0, 1]}, index=['a', 'b'])

        joined = left.join(right, on='key', how='inner')

        # Expected: the default join with the unmatched rows removed.
        default_join = left.join(right, on='key')
        expected = default_join[default_join['value'].notnull()]
        for column in ('key', 'value'):
            self.assert_series_equal(joined[column],
                                     expected[column],
                                     check_dtype=False)
        self.assert_index_equal(joined.index, expected.index)

    def test_join_on_singlekey_list(self):
        left = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']})
        right = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c'])

        # corner cases: a one-element list must behave like the bare name
        via_list = left.join(right, on=['key'])
        via_name = left.join(right, on='key')

        assert_frame_equal(via_list, via_name)

    def test_join_on_series(self):
        # Joining a Series must match joining the one-column frame.
        target, source = self.target, self.source
        from_series = target.join(source['MergedA'], on='C')
        from_frame = target.join(source[['MergedA']], on='C')
        assert_frame_equal(from_series, from_frame)

    def test_join_on_series_buglet(self):
        # GH #638
        frame = DataFrame({'a': [1, 1]})
        named = Series([2], index=[1], name='b')

        result = frame.join(named, on='a')

        # Both rows carry key 1, so both pick up the single Series value.
        expected = DataFrame({'a': [1, 1], 'b': [2, 2]}, index=frame.index)
        tm.assert_frame_equal(result, expected)

    def test_join_index_mixed(self):
        letters = ['A', 'B', 'C', 'D']

        def _mixed_frame(index):
            # One scalar-filled column each of float, int, str and bool.
            return DataFrame({'A': 1., 'B': 2, 'C': 'foo', 'D': True},
                             index=index,
                             columns=letters)

        df1 = _mixed_frame(np.arange(10))
        self.assertEqual(df1['B'].dtype, np.int64)
        self.assertEqual(df1['D'].dtype, np.bool_)

        df2 = _mixed_frame(np.arange(0, 10, 2))

        # overlap
        joined = df1.join(df2, lsuffix='_one', rsuffix='_two')
        expected_columns = ([name + '_one' for name in letters] +
                            [name + '_two' for name in letters])
        df1.columns = expected_columns[:4]
        df2.columns = expected_columns[4:]
        expected = _join_by_hand(df1, df2)
        assert_frame_equal(joined, expected)

        # no overlapping blocks
        df1 = DataFrame(index=np.arange(10))
        df1['bool'] = True
        df1['string'] = 'foo'

        df2 = DataFrame(index=np.arange(5, 15))
        df2['int'] = 1
        df2['float'] = 1.

        # Every join type, in both directions.
        for kind in ['inner', 'outer', 'left', 'right']:
            for first, second in [(df1, df2), (df2, df1)]:
                joined = first.join(second, how=kind)
                expected = _join_by_hand(first, second, how=kind)
                assert_frame_equal(joined, expected)

    def test_join_empty_bug(self):
        # generated an exception in 0.4.3
        empty = DataFrame()
        other = DataFrame([3], index=[0], columns=['A'])
        empty.join(other, how='outer')

    def test_join_unconsolidated(self):
        # GH #331
        a = DataFrame(randn(30, 2), columns=['a', 'b'])
        # Column 'c' is added after construction.
        a['c'] = Series(randn(30))
        d = DataFrame(randn(30, 1), columns=['q'])

        # it works! (in both directions)
        a.join(d)
        d.join(a)

    def test_join_multiindex(self):
        names = ['first', 'second']
        index1 = MultiIndex.from_arrays(
            [['a', 'a', 'a', 'b', 'b', 'b'], [1, 2, 3, 1, 2, 3]],
            names=names)
        index2 = MultiIndex.from_arrays(
            [['b', 'b', 'b', 'c', 'c', 'c'], [1, 2, 3, 1, 2, 3]],
            names=names)

        df1 = DataFrame(data=np.random.randn(6),
                        index=index1,
                        columns=['var X'])
        df2 = DataFrame(data=np.random.randn(6),
                        index=index2,
                        columns=['var Y'])

        def _outer_expected(left, right):
            # Reindex both sides to the union of the tuple indexes and join.
            ex_index = index1._tuple_index.union(index2._tuple_index)
            frame = left.reindex(ex_index).join(right.reindex(ex_index))
            frame.index.names = index1.names
            return frame

        # Inputs sorted on the first level.
        df1 = df1.sort_index(level=0)
        df2 = df2.sort_index(level=0)

        joined = df1.join(df2, how='outer')
        assert_frame_equal(joined, _outer_expected(df1, df2))
        self.assertEqual(joined.index.names, index1.names)

        # Inputs sorted on the second level; the result is re-sorted on the
        # first level before comparing.
        df1 = df1.sort_index(level=1)
        df2 = df2.sort_index(level=1)

        joined = df1.join(df2, how='outer').sort_index(level=0)
        assert_frame_equal(joined, _outer_expected(df1, df2))
        self.assertEqual(joined.index.names, index1.names)

    def test_join_inner_multiindex(self):
        # Inner join of two key columns against a two-level MultiIndex,
        # checked against the equivalent merge() spellings.
        key1 = [
            'bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux', 'qux',
            'snap'
        ]
        key2 = [
            'two', 'one', 'three', 'one', 'two', 'one', 'two', 'two', 'three',
            'one'
        ]

        data = np.random.randn(len(key1))
        data = DataFrame({'key1': key1, 'key2': key2, 'data': data})

        index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                                   ['one', 'two', 'three']],
                           labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                                   [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                           names=['first', 'second'])
        to_join = DataFrame(np.random.randn(10, 3),
                            index=index,
                            columns=['j_one', 'j_two', 'j_three'])

        joined = data.join(to_join, on=['key1', 'key2'], how='inner')
        expected = merge(data,
                         to_join.reset_index(),
                         left_on=['key1', 'key2'],
                         right_on=['first', 'second'],
                         how='inner',
                         sort=False)

        # Same join driven from the other side; computed once (the original
        # repeated this identical merge a second time without using it).
        expected2 = merge(to_join,
                          data,
                          right_on=['key1', 'key2'],
                          left_index=True,
                          how='inner',
                          sort=False)
        assert_frame_equal(joined, expected2.reindex_like(joined))

        # The column merge keeps the index level columns and its own row
        # labels; align both with `joined` before comparing.
        expected = expected.drop(['first', 'second'], axis=1)
        expected.index = joined.index

        self.assertTrue(joined.index.is_monotonic)
        assert_frame_equal(joined, expected)

        # _assert_same_contents(expected, expected2.loc[:, expected.columns])

    def test_join_hierarchical_mixed(self):
        # GH 2024
        df = DataFrame([(1, 2, 3), (4, 5, 6)], columns=['a', 'b', 'c'])
        # Aggregating with a list of functions gives two-level columns.
        new_df = df.groupby(['a']).agg({'b': [np.mean, np.sum]})
        other_df = DataFrame([(1, 2, 3), (7, 10, 6)],
                             columns=['a', 'b', 'd']).set_index('a')

        # GH 9455, 12219
        with tm.assert_produces_warning(UserWarning):
            result = merge(new_df, other_df, left_index=True, right_index=True)

        for column in (('b', 'mean'), 'b'):
            self.assertTrue(column in result)

    def test_join_float64_float32(self):
        # Joining must not upcast float32 columns to float64.
        wide = DataFrame(randn(10, 2), columns=['a', 'b'], dtype=np.float64)
        narrow = DataFrame(randn(10, 1), columns=['c'], dtype=np.float32)
        joined = wide.join(narrow)
        for column, dtype in [('a', 'float64'), ('b', 'float64'),
                              ('c', 'float32')]:
            self.assertEqual(joined.dtypes[column], dtype)

        a = np.random.randint(0, 5, 100).astype('int64')
        b = np.random.random(100).astype('float64')
        c = np.random.random(100).astype('float32')
        df = DataFrame({'a': a, 'b': b, 'c': c})
        xpdf = DataFrame({'a': a, 'b': b, 'c': c})
        s = DataFrame(np.random.random(5).astype('float32'), columns=['md'])

        # Same check for a merge of column 'a' against the index of `s`.
        rs = df.merge(s, left_on='a', right_index=True)
        for column, dtype in [('a', 'int64'), ('b', 'float64'),
                              ('c', 'float32'), ('md', 'float32')]:
            self.assertEqual(rs.dtypes[column], dtype)

        xp = xpdf.merge(s, left_on='a', right_index=True)
        assert_frame_equal(rs, xp)

    def test_join_many_non_unique_index(self):
        # Outer join of three frames whose (a, b) index repeats entries,
        # compared with two chained column merges.
        df1 = DataFrame({"a": [1, 1], "b": [1, 1], "c": [10, 20]})
        df2 = DataFrame({"a": [1, 1], "b": [1, 2], "d": [100, 200]})
        df3 = DataFrame({"a": [1, 1], "b": [1, 2], "e": [1000, 2000]})
        idf1, idf2, idf3 = [
            frame.set_index(["a", "b"]) for frame in (df1, df2, df3)
        ]

        result = idf1.join([idf2, idf3], how='outer').reset_index()

        partial = merge(df1, df2, on=['a', 'b'], how='outer')
        expected = merge(partial, df3, on=['a', 'b'], how='outer')
        expected = expected[result.columns]
        expected['a'] = expected.a.astype('int64')
        expected['b'] = expected.b.astype('int64')
        assert_frame_equal(result, expected)

        # Same comparison for an inner join with three rows per frame.
        df1 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 1], "c": [10, 20, 30]})
        df2 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "d": [100, 200, 300]})
        df3 = DataFrame({
            "a": [1, 1, 1],
            "b": [1, 1, 2],
            "e": [1000, 2000, 3000]
        })
        idf1, idf2, idf3 = [
            frame.set_index(["a", "b"]) for frame in (df1, df2, df3)
        ]

        result = idf1.join([idf2, idf3], how='inner').reset_index()

        partial = merge(df1, df2, on=['a', 'b'], how='inner')
        expected = merge(partial, df3, on=['a', 'b'], how='inner')
        assert_frame_equal(result, expected.loc[:, result.columns])

        # GH 11519
        df = DataFrame({
            'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
            'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
            'C':
            np.random.randn(8),
            'D':
            np.random.randn(8)
        })
        repeated = np.repeat(np.arange(8), 2)
        s = Series(repeated, index=np.repeat(np.arange(8), 2), name='TEST')

        # Every join type is expected to give the same frame here.
        inner = df.join(s, how='inner')
        for other_how in ['outer', 'left', 'right']:
            assert_frame_equal(inner, df.join(s, how=other_how))

    def test_join_sort(self):
        left = DataFrame({
            'key': ['foo', 'bar', 'baz', 'foo'],
            'value': [1, 2, 3, 4]
        })
        right = DataFrame({'value2': ['a', 'b', 'c']},
                          index=['bar', 'baz', 'foo'])

        # With sort=True the rows come back ordered by key and keep their
        # original row labels.
        expected = DataFrame(
            {
                'key': ['bar', 'baz', 'foo', 'foo'],
                'value': [2, 3, 1, 4],
                'value2': ['a', 'b', 'c', 'c']
            },
            index=[1, 2, 0, 3])
        assert_frame_equal(left.join(right, on='key', sort=True), expected)

        # smoke test
        unsorted = left.join(right, on='key', sort=False)
        self.assert_index_equal(unsorted.index, pd.Index(lrange(4)))

    def test_join_mixed_non_unique_index(self):
        # GH 12814, unorderable types in py3 with a non-unique index

        # Duplicates on the right-hand index.
        left = DataFrame({'a': [1, 2, 3, 4]}, index=[1, 2, 3, 'a'])
        right = DataFrame({'b': [5, 6, 7, 8]}, index=[1, 3, 3, 4])
        expected = DataFrame(
            {
                'a': [1, 2, 3, 3, 4],
                'b': [5, np.nan, 6, 7, np.nan]
            },
            index=[1, 2, 3, 3, 'a'])
        tm.assert_frame_equal(left.join(right), expected)

        # Duplicates on the left-hand index.
        left = DataFrame({'a': [1, 2, 3, 4]}, index=[1, 2, 2, 'a'])
        right = DataFrame({'b': [5, 6, 7, 8]}, index=[1, 2, 3, 4])
        expected = DataFrame({
            'a': [1, 2, 3, 4],
            'b': [5, 6, 6, np.nan]
        },
                             index=[1, 2, 2, 'a'])
        tm.assert_frame_equal(left.join(right), expected)

    def test_mixed_type_join_with_suffix(self):
        # GH #916
        frame = DataFrame(np.random.randn(20, 6), columns=list('abcdef'))
        frame.insert(0, 'id', 0)
        frame.insert(5, 'dt', 'foo')

        by_id = frame.groupby('id')
        means = by_id.mean()
        counts = by_id.count()

        # it works!
        means.join(counts, rsuffix='_right')

    def test_join_many(self):
        # Joining a list of frames on the index, for each supported `how`.
        full = DataFrame(np.random.randn(10, 6), columns=list('abcdef'))
        pieces = [full[['a', 'b']], full[['c', 'd']], full[['e', 'f']]]

        tm.assert_frame_equal(pieces[0].join(pieces[1:]), full)

        # Trim every piece differently so the indexes only partly overlap.
        pieces = [
            full[['a', 'b']][:-2], full[['c', 'd']][2:], full[['e', 'f']][1:9]
        ]
        first, rest = pieces[0], pieces[1:]

        def _check_diff_index(df_list, result, exp_index):
            # Reindex every piece to the expected index, then plain-join.
            aligned = [piece.reindex(exp_index) for piece in df_list]
            tm.assert_frame_equal(result, aligned[0].join(aligned[1:]))

        # different join types
        cases = [
            ({'how': 'outer'}, full.index),
            ({}, first.index),
            ({'how': 'inner'}, full.index[2:8]),
        ]
        for join_kwargs, exp_index in cases:
            _check_diff_index(pieces, first.join(rest, **join_kwargs),
                              exp_index)

        # Joining several frames at once is rejected when `on` is given.
        self.assertRaises(ValueError, first.join, rest, on='a')

    def test_join_many_mixed(self):
        # Joining the float pieces and the object piece back together must
        # reproduce the original mixed-dtype frame.
        df = DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D'])
        df['key'] = ['foo', 'bar'] * 4

        left = df.loc[:, ['A', 'B']]
        others = [df.loc[:, ['C', 'D']], df.loc[:, ['key']]]

        assert_frame_equal(left.join(others), df)

    def test_join_dups(self):
        # joining dups
        floats = DataFrame(np.random.randn(10, 4),
                           columns=['A', 'A', 'B', 'B'])
        ints = DataFrame(np.random.randint(0, 10, size=20).reshape(10, 2),
                         columns=['A', 'C'])
        df = concat([floats, ints], axis=1)

        # Self-join: after relabelling, the result is df side by side twice.
        expected = concat([df, df], axis=1)
        result = df.join(df, rsuffix='_2')
        result.columns = expected.columns
        assert_frame_equal(result, expected)

        # GH 4975, invalid join on dups
        w, x, y, z = [
            DataFrame(np.random.randn(4, 2), columns=["x", "y"])
            for _ in range(4)
        ]

        dta = x.merge(y, left_index=True, right_index=True)
        dta = dta.merge(z, left_index=True, right_index=True, how="outer")
        dta = dta.merge(w, left_index=True, right_index=True)

        expected = concat([x, y, z, w], axis=1)
        expected.columns = ['x_x', 'y_x', 'x_y', 'y_y'] * 2
        assert_frame_equal(dta, expected)

    def test_panel_join(self):
        panel = tm.makePanel()
        tm.add_nans(panel)

        # Two slices of the same panel that overlap only partly.
        p1 = panel.iloc[:2, :10, :3]
        p2 = panel.iloc[2:, 5:, 2:]

        # left join
        expected = p1.copy()
        expected['ItemC'] = p2['ItemC']
        tm.assert_panel_equal(p1.join(p2), expected)

        # right join
        expected = p2.copy()
        for item in ('ItemA', 'ItemB'):
            expected[item] = p1[item]
        expected = expected.reindex(items=['ItemA', 'ItemB', 'ItemC'])
        tm.assert_panel_equal(p1.join(p2, how='right'), expected)

        # inner join
        tm.assert_panel_equal(p1.join(p2, how='inner'),
                              panel.iloc[:, 5:10, 2:3])

        # outer join
        full_axes = dict(major=panel.major_axis, minor=panel.minor_axis)
        expected = p1.reindex(**full_axes).join(p2.reindex(**full_axes))
        tm.assert_panel_equal(p1.join(p2, how='outer'), expected)

    def test_panel_join_overlap(self):
        panel = tm.makePanel()
        tm.add_nans(panel)

        p1 = panel.loc[['ItemA', 'ItemB', 'ItemC']]
        p2 = panel.loc[['ItemB', 'ItemC']]

        joined = p1.join(p2, lsuffix='_p1', rsuffix='_p2')

        # Expected items, in order:
        #
        # ItemA, ItemB_p1, ItemC_p1, ItemB_p2, ItemC_p2
        left_suffixed = p1.loc[['ItemB', 'ItemC']].add_suffix('_p1')
        right_suffixed = p2.loc[['ItemB', 'ItemC']].add_suffix('_p2')
        only_left = panel.loc[['ItemA']]
        expected = only_left.join(left_suffixed.join(right_suffixed))
        tm.assert_panel_equal(joined, expected)

    def test_panel_join_many(self):
        # Temporarily raise the module-level tm.K (presumably it sizes the
        # generated panel; confirm in the testing module). It is restored in
        # a finally block to its previous value, so a failure in makePanel
        # cannot leak the changed setting into other tests.
        original_k = tm.K
        tm.K = 10
        try:
            panel = tm.makePanel()
        finally:
            tm.K = original_k

        panels = [panel.iloc[:2], panel.iloc[2:6], panel.iloc[6:]]

        joined = panels[0].join(panels[1:])
        tm.assert_panel_equal(joined, panel)

        # Trim the major axis differently for each piece.
        panels = [
            panel.iloc[:2, :-5], panel.iloc[2:6, 2:], panel.iloc[6:, 5:-7]
        ]

        # Item name -> frame across all pieces, used to build the expected
        # panels with Panel.from_dict.
        data_dict = {}
        for p in panels:
            data_dict.update(p.iteritems())

        joined = panels[0].join(panels[1:], how='inner')
        expected = pd.Panel.from_dict(data_dict, intersect=True)
        tm.assert_panel_equal(joined, expected)

        joined = panels[0].join(panels[1:], how='outer')
        expected = pd.Panel.from_dict(data_dict, intersect=False)
        tm.assert_panel_equal(joined, expected)

        # edge cases: suffixes and how='right' are rejected for a list join
        self.assertRaises(ValueError,
                          panels[0].join,
                          panels[1:],
                          how='outer',
                          lsuffix='foo',
                          rsuffix='bar')
        self.assertRaises(ValueError, panels[0].join, panels[1:], how='right')

Example #43

0

Show file

runtime_yf.reset_index(inplace=True)
runtime_yf = runtime_yf.rename(columns = {'index':'number of stocks'})
runtime_yf['number of stocks'] += 1


# Runtimes using CSV files.

# For this project, we assume that the CSV files are in a "data/"
# subdirectory relative to where the script is run.

# Cumulative time needed to load the first 1, 2, ..., len(djia) CSV files.
results = []
elapsed_total = 0.0
for ticker in djia:
    startTime = perf_counter()
    filename = "data/" + ticker + ".csv"
    df = pd.read_csv(filename, encoding='utf-8')
    endTime = perf_counter()
    # Keep a running total so every entry includes all earlier loads
    # (the first load was previously left out of the later totals).
    elapsed_total += endTime - startTime
    results.append(elapsed_total)

runtime_csv = DataFrame(results, columns=['runtime'])
runtime_csv.reset_index(inplace=True)
runtime_csv = runtime_csv.rename(columns={'index': 'number of stocks'})
# The positional index is 0-based; the stock count is 1-based.
runtime_csv['number of stocks'] += 1

# Put both runtime columns side by side, keeping one copy of the count.
runtimes = runtime_yf.join(runtime_csv, lsuffix='_yf', rsuffix='_csv')
runtimes = runtimes.rename(columns={'number of stocks_yf': 'number of stocks'})
# drop() returns a new frame, so the result has to be assigned back.
runtimes = runtimes.drop(columns=['number of stocks_csv'])

Example #44

0

Show file

File: project_draft.py Project: afcarl/TS_Project

invalid_times = ['09:31:00', '09:32:00', '09:33:00', '09:34:00']

# Collect the offending labels first and drop them in one call.  Dropping
# rows in place while walking range(len(gdata)) shifts the remaining rows,
# which skips the row after each dropped one and eventually indexes past
# the end of the shrunken frame.
stale_labels = [
    label for label in gdata.index if str(label)[-8:] in invalid_times
]
for label in stale_labels:
    # Single parenthesised argument: prints the same under Python 2 and 3.
    print("Dropping row at index " + str(label) + ' at ' + time.ctime())
gdata.drop(stale_labels, inplace=True)
'''
Index and join generated image data to clean financial data
===========================================================

After getting correct DTI in place, inner join the two DFs on the index
'''

test = clean.join(gdata, how='inner')
'''
Generate target data for model training
=======================================

NB: targets being generated from forward data means we will lose a few
train / test examples on the near-term end of the time series
'''

# stupidly simple binary loop; flexible to whatever is specified in mins_ahead:
ahead = []

for i in range(len(clean) - max(mins_ahead)):
    current_row = [
        1 if clean.iloc[i + mins_ahead[j], 0] > clean.iloc[i, 0] else 0
        for j in range(len(mins_ahead))

Example #45

0

Show file

class TestJoin:
    def setup_method(self, method):
        """Build the frames shared by the merge/join tests in this class."""
        # Left frame: two key columns plus two float payload columns.
        left = DataFrame({
            "key1": get_test_data(),
            "key2": get_test_data(),
            "data1": np.random.randn(N),
            "data2": np.random.randn(N),
        })
        # exclude a couple keys for fun
        self.df = left[left["key2"] > 1]

        # Right frame: a fifth of the rows, with its own "value" column.
        small_n = N // 5
        self.df2 = DataFrame({
            "key1": get_test_data(n=small_n),
            "key2": get_test_data(ngroups=NGROUPS // 2, n=small_n),
            "value": np.random.randn(small_n),
        })

        index, data = tm.getMixedTypeDict()
        self.target = DataFrame(data, index=index)

        # Join on string value: the source frame is indexed by the values of
        # column "C", so ``target.join(source, on="C")`` lines the rows up.
        self.source = DataFrame(
            {"MergedA": data["A"], "MergedD": data["D"]},
            index=data["C"],
        )

    def test_left_outer_join(self):
        """Left merges on ``key2`` and on all shared keys pass ``_check_join``.

        ``merge`` defaults to ``how="inner"``, so the join type is passed
        explicitly; otherwise this test would only exercise inner-join
        output under a left-join name.
        """
        joined_key2 = merge(self.df, self.df2, on="key2", how="left")
        _check_join(self.df, self.df2, joined_key2, ["key2"], how="left")

        joined_both = merge(self.df, self.df2, how="left")
        _check_join(self.df,
                    self.df2,
                    joined_both, ["key1", "key2"],
                    how="left")

    def test_right_outer_join(self):
        """Right merges on ``key2`` and on all shared keys pass _check_join."""
        left, right = self.df, self.df2

        by_key2 = merge(left, right, on="key2", how="right")
        _check_join(left, right, by_key2, ["key2"], how="right")

        by_all_keys = merge(left, right, how="right")
        _check_join(left, right, by_all_keys, ["key1", "key2"], how="right")

    def test_full_outer_join(self):
        """Outer merges on ``key2`` and on all shared keys pass _check_join."""
        how = "outer"
        # (extra merge keywords, join columns handed to the checker)
        cases = [
            ({"on": "key2"}, ["key2"]),
            ({}, ["key1", "key2"]),
        ]
        for merge_kwargs, join_cols in cases:
            joined = merge(self.df, self.df2, how=how, **merge_kwargs)
            _check_join(self.df, self.df2, joined, join_cols, how=how)

    def test_inner_join(self):
        """Inner merges on ``key2`` and on all shared keys pass _check_join."""
        frames = (self.df, self.df2)

        on_key2 = merge(*frames, on="key2", how="inner")
        _check_join(*frames, on_key2, ["key2"], how="inner")

        on_shared = merge(*frames, how="inner")
        _check_join(*frames, on_shared, ["key1", "key2"], how="inner")

    def test_handle_overlap(self):
        """A non-key column present in both frames gets the custom suffixes."""
        suffixes = (".foo", ".bar")
        joined = merge(self.df, self.df2, on="key2", suffixes=suffixes)

        for column in ("key1.foo", "key1.bar"):
            assert column in joined

    def test_handle_overlap_arbitrary_key(self):
        """Suffixes are applied when the two sides use different key columns."""
        merge_kwargs = {
            "left_on": "key2",
            "right_on": "key1",
            "suffixes": (".foo", ".bar"),
        }
        joined = merge(self.df, self.df2, **merge_kwargs)

        assert "key1.foo" in joined
        assert "key2.bar" in joined

    def test_join_on(self):
        """Cover ``DataFrame.join`` with ``on=`` naming a column of the caller.

        Checks a plain column join, duplicated key values, keys missing from
        the other frame, a missing ``on`` column and a dtype-mismatched key.
        """
        target, source = self.target, self.source

        merged = target.join(source, on="C")
        for merged_col, target_col in (("MergedA", "A"), ("MergedD", "D")):
            tm.assert_series_equal(merged[merged_col],
                                   target[target_col],
                                   check_names=False)

        # join with duplicates (fix regression from DataFrame/Matrix merge)
        keys = ["a", "a", "b", "b", "c"]
        lookup = DataFrame({"value": [0, 1, 2]}, index=["a", "b", "c"])
        joined = DataFrame({"key": keys}).join(lookup, on="key")
        expected = DataFrame({"key": keys, "value": [0, 0, 1, 1, 2]})
        tm.assert_frame_equal(joined, expected)

        # Test when some are missing: row "c" holds 3, which is not in the
        # index of either right-hand frame.
        df_a = DataFrame([[1], [2], [3]],
                         index=["a", "b", "c"],
                         columns=["one"])
        df_b = DataFrame([["foo"], ["bar"]], index=[1, 2], columns=["two"])
        df_c = DataFrame([[1], [2]], index=[1, 2], columns=["three"])
        joined = df_a.join(df_b, on="one").join(df_c, on="one")
        for column in ("two", "three"):
            assert np.isnan(joined[column]["c"])

        # merge column not present
        with pytest.raises(KeyError, match="^'E'$"):
            target.join(source, on="E")

        # overlap
        source_copy = source.copy()
        source_copy["A"] = 0
        msg = ("You are trying to merge on float64 and object columns. If "
               "you wish to proceed you should use pd.concat")
        with pytest.raises(ValueError, match=msg):
            target.join(source_copy, on="A")

    def test_join_on_fails_with_different_right_index(self):
        """``merge`` rejects ``left_on`` whose length differs from the number
        of levels in the right index."""
        left = DataFrame({
            "a": np.random.choice(["m", "f"], size=3),
            "b": np.random.randn(3),
        })
        right_values = {
            "a": np.random.choice(["m", "f"], size=10),
            "b": np.random.randn(10),
        }
        right = DataFrame(right_values, index=tm.makeCustomIndex(10, 2))

        msg = r'len\(left_on\) must equal the number of levels in the index of "right"'
        with pytest.raises(ValueError, match=msg):
            merge(left, right, left_on="a", right_index=True)

    def test_join_on_fails_with_different_left_index(self):
        """``merge`` rejects ``right_on`` whose length differs from the number
        of levels in the left index."""
        left_values = {
            "a": np.random.choice(["m", "f"], size=3),
            "b": np.random.randn(3),
        }
        left = DataFrame(left_values, index=tm.makeCustomIndex(3, 2))
        right = DataFrame({
            "a": np.random.choice(["m", "f"], size=10),
            "b": np.random.randn(10),
        })

        msg = r'len\(right_on\) must equal the number of levels in the index of "left"'
        with pytest.raises(ValueError, match=msg):
            merge(left, right, right_on="b", left_index=True)

    def test_join_on_fails_with_different_column_counts(self):
        """``merge`` rejects ``left_on``/``right_on`` of different lengths."""
        left = DataFrame({
            "a": np.random.choice(["m", "f"], size=3),
            "b": np.random.randn(3),
        })
        right_values = {
            "a": np.random.choice(["m", "f"], size=10),
            "b": np.random.randn(10),
        }
        right = DataFrame(right_values, index=tm.makeCustomIndex(10, 2))

        msg = r"len\(right_on\) must equal len\(left_on\)"
        with pytest.raises(ValueError, match=msg):
            merge(left, right, right_on="a", left_on=["a", "b"])

    @pytest.mark.parametrize("wrong_type", [2, "str", None, np.array([0, 1])])
    def test_join_on_fails_with_wrong_object_type(self, wrong_type):
        """``merge`` raises ``TypeError`` when either operand is neither a
        Series nor a DataFrame."""
        # GH12081 - original issue

        # GH21220 - merging of Series and DataFrame is now allowed
        # Edited test to remove the Series object from test parameters

        frame = DataFrame({"a": [1, 1]})
        msg = ("Can only merge Series or DataFrame objects, "
               f"a {type(wrong_type)} was passed")

        # The bad object is tried first as the left, then as the right side.
        for left, right in ((wrong_type, frame), (frame, wrong_type)):
            with pytest.raises(TypeError, match=msg):
                merge(left, right, left_on="a", right_on="a")

    def test_join_on_pass_vector(self):
        """Passing the key values themselves as ``on`` matches joining on the
        column name, minus the key column."""
        target, source = self.target, self.source

        expected = target.join(source, on="C")
        del expected["C"]

        # pop() removes "C" from the shared target frame and hands back its
        # values, which are then used directly as the join key.
        key_values = target.pop("C")
        result = target.join(source, on=key_values)
        tm.assert_frame_equal(result, expected)

    def test_join_with_len0(self):
        """Joining against an empty frame keeps its columns (all missing);
        an inner join against it yields no rows."""
        # nothing to merge
        empty_source = self.source.reindex([])

        merged = self.target.join(empty_source, on="C")
        for column in self.source:
            assert column in merged
            assert merged[column].isna().all()

        inner = self.target.join(empty_source, on="C", how="inner")
        tm.assert_index_equal(inner.columns, merged.columns)
        assert len(inner) == 0

    def test_join_on_inner(self):
        """An inner ``join(on=...)`` equals the default join with unmatched
        rows removed."""
        left = DataFrame({"key": ["a", "a", "d", "b", "b", "c"]})
        right = DataFrame({"value": [0, 1]}, index=["a", "b"])

        joined = left.join(right, on="key", how="inner")

        default_join = left.join(right, on="key")
        matched = default_join["value"].notna()
        expected = default_join[matched]

        tm.assert_series_equal(joined["key"], expected["key"])
        tm.assert_series_equal(joined["value"],
                               expected["value"],
                               check_dtype=False)
        tm.assert_index_equal(joined.index, expected.index)

    def test_join_on_singlekey_list(self):
        """``on=["key"]`` behaves the same as ``on="key"``."""
        left = DataFrame({"key": ["a", "a", "b", "b", "c"]})
        right = DataFrame({"value": [0, 1, 2]}, index=["a", "b", "c"])

        # corner cases
        from_list = left.join(right, on=["key"])
        from_scalar = left.join(right, on="key")

        tm.assert_frame_equal(from_list, from_scalar)

    def test_join_on_series(self):
        """Joining a Series gives the same frame as joining the equivalent
        one-column DataFrame."""
        series_side = self.source["MergedA"]
        frame_side = self.source[["MergedA"]]

        result = self.target.join(series_side, on="C")
        expected = self.target.join(frame_side, on="C")
        tm.assert_frame_equal(result, expected)

    def test_join_on_series_buglet(self):
        """A one-element named Series joins onto every row with that key."""
        # GH #638
        frame = DataFrame({"a": [1, 1]})
        lookup = Series([2], index=[1], name="b")

        result = frame.join(lookup, on="a")

        expected = DataFrame({"a": [1, 1], "b": [2, 2]}, index=frame.index)
        tm.assert_frame_equal(result, expected)

    def test_join_index_mixed(self, join_type):
        """Index joins of frames with disjoint column dtypes match
        ``_join_by_hand`` in both directions."""
        # no overlapping blocks
        bool_str = DataFrame(index=np.arange(10))
        bool_str["bool"] = True
        bool_str["string"] = "foo"

        int_float = DataFrame(index=np.arange(5, 15))
        int_float["int"] = 1
        int_float["float"] = 1.0

        for left, right in ((bool_str, int_float), (int_float, bool_str)):
            joined = left.join(right, how=join_type)
            expected = _join_by_hand(left, right, how=join_type)
            tm.assert_frame_equal(joined, expected)

    def test_join_index_mixed_overlap(self):
        """Frames with identical mixed-dtype columns are joined using the
        given suffixes on both sides."""
        cols = ["A", "B", "C", "D"]
        values = {"A": 1.0, "B": 2, "C": "foo", "D": True}

        df1 = DataFrame(values, index=np.arange(10), columns=cols)
        assert df1["B"].dtype == np.int64
        assert df1["D"].dtype == np.bool_

        df2 = DataFrame(values, index=np.arange(0, 10, 2), columns=cols)

        # overlap
        joined = df1.join(df2, lsuffix="_one", rsuffix="_two")

        # Rename the inputs to the suffixed names, then join them by hand.
        df1.columns = [name + "_one" for name in cols]
        df2.columns = [name + "_two" for name in cols]
        expected = _join_by_hand(df1, df2)
        tm.assert_frame_equal(joined, expected)

    def test_join_empty_bug(self):
        """Smoke test: outer-joining onto an empty frame must not raise."""
        # generated an exception in 0.4.3
        empty = DataFrame()
        other = DataFrame([3], index=[0], columns=["A"])
        empty.join(other, how="outer")

    def test_join_unconsolidated(self):
        """Smoke test: join works after a column was added to a frame."""
        # GH #331
        wide = DataFrame(np.random.randn(30, 2), columns=["a", "b"])
        extra = Series(np.random.randn(30))
        wide["c"] = extra
        narrow = DataFrame(np.random.randn(30, 1), columns=["q"])

        # it works!
        wide.join(narrow)
        narrow.join(wide)

    def test_join_multiindex(self):
        """Outer join of two MultiIndexed frames, sorted by either level,
        matches a reindex onto the union index and keeps the level names."""
        names = ["first", "second"]
        index1 = MultiIndex.from_arrays(
            [["a", "a", "a", "b", "b", "b"], [1, 2, 3, 1, 2, 3]],
            names=names,
        )
        index2 = MultiIndex.from_arrays(
            [["b", "b", "b", "c", "c", "c"], [1, 2, 3, 1, 2, 3]],
            names=names,
        )

        df1 = DataFrame(data=np.random.randn(6),
                        index=index1,
                        columns=["var X"])
        df2 = DataFrame(data=np.random.randn(6),
                        index=index2,
                        columns=["var Y"])

        def outer_expected(left, right):
            # Reindex both sides onto the union of the original indexes.
            union = Index(index1.values).union(Index(index2.values))
            frame = left.reindex(union).join(right.reindex(union))
            frame.index.names = index1.names
            return frame

        # Inputs sorted on the first level.
        df1 = df1.sort_index(level=0)
        df2 = df2.sort_index(level=0)

        joined = df1.join(df2, how="outer")
        tm.assert_frame_equal(joined, outer_expected(df1, df2))
        assert joined.index.names == index1.names

        # Inputs sorted on the second level; the result is re-sorted on the
        # first level before comparing.
        df1 = df1.sort_index(level=1)
        df2 = df2.sort_index(level=1)

        joined = df1.join(df2, how="outer").sort_index(level=0)
        tm.assert_frame_equal(joined, outer_expected(df1, df2))
        assert joined.index.names == index1.names

    def test_join_inner_multiindex(self):
        """Inner ``join`` on two key columns against a two-level MultiIndex
        matches the equivalent ``merge`` calls and has a monotonic index."""
        key1 = [
            "bar", "bar", "bar", "foo", "foo", "baz", "baz", "qux", "qux",
            "snap"
        ]
        key2 = [
            "two",
            "one",
            "three",
            "one",
            "two",
            "one",
            "two",
            "two",
            "three",
            "one",
        ]

        data = np.random.randn(len(key1))
        data = DataFrame({"key1": key1, "key2": key2, "data": data})

        index = MultiIndex(
            levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]],
            codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                   [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
            names=["first", "second"],
        )
        to_join = DataFrame(np.random.randn(10, 3),
                            index=index,
                            columns=["j_one", "j_two", "j_three"])

        joined = data.join(to_join, on=["key1", "key2"], how="inner")

        # Same join expressed as a column-to-column merge ...
        expected = merge(
            data,
            to_join.reset_index(),
            left_on=["key1", "key2"],
            right_on=["first", "second"],
            how="inner",
            sort=False,
        )

        # ... and as an index-to-column merge with the operands swapped.
        expected2 = merge(
            to_join,
            data,
            right_on=["key1", "key2"],
            left_index=True,
            how="inner",
            sort=False,
        )
        tm.assert_frame_equal(joined, expected2.reindex_like(joined))

        # The column merge keeps the right-hand key columns and produces its
        # own index; align both with the join result before comparing.
        expected = expected.drop(["first", "second"], axis=1)
        expected.index = joined.index

        # is_monotonic_increasing is the non-deprecated spelling of
        # Index.is_monotonic.
        assert joined.index.is_monotonic_increasing
        tm.assert_frame_equal(joined, expected)

    def test_join_hierarchical_mixed(self):
        """Merging a frame with two-level columns and a flat-column frame
        warns and keeps both kinds of column key."""
        # GH 2024
        base = DataFrame([(1, 2, 3), (4, 5, 6)], columns=["a", "b", "c"])
        aggregated = base.groupby(["a"]).agg({"b": [np.mean, np.sum]})

        flat = DataFrame([(1, 2, 3), (7, 10, 6)], columns=["a", "b", "d"])
        flat = flat.set_index("a")

        # GH 9455, 12219
        with tm.assert_produces_warning(FutureWarning):
            result = merge(aggregated, flat, left_index=True, right_index=True)

        assert ("b", "mean") in result
        assert "b" in result

    def test_join_float64_float32(self):
        """Joins and merges keep each column's own dtype instead of
        upcasting float32 or int64 columns."""
        wide = DataFrame(np.random.randn(10, 2),
                         columns=["a", "b"],
                         dtype=np.float64)
        narrow = DataFrame(np.random.randn(10, 1),
                           columns=["c"],
                           dtype=np.float32)
        joined = wide.join(narrow)
        for column, dtype in (("a", "float64"), ("b", "float64"),
                              ("c", "float32")):
            assert joined.dtypes[column] == dtype

        ints = np.random.randint(0, 5, 100).astype("int64")
        doubles = np.random.random(100).astype("float64")
        singles = np.random.random(100).astype("float32")
        df = DataFrame({"a": ints, "b": doubles, "c": singles})
        xpdf = DataFrame({"a": ints, "b": doubles, "c": singles})
        s = DataFrame(np.random.random(5).astype("float32"), columns=["md"])

        rs = df.merge(s, left_on="a", right_index=True)
        merged_dtypes = {
            "a": "int64",
            "b": "float64",
            "c": "float32",
            "md": "float32",
        }
        for column, dtype in merged_dtypes.items():
            assert rs.dtypes[column] == dtype

        xp = xpdf.merge(s, left_on="a", right_index=True)
        tm.assert_frame_equal(rs, xp)

    def test_join_many_non_unique_index(self):
        """Joining a list of frames on non-unique indexes matches chained
        ``merge`` calls; also covers GH 11519 (Series with repeated index)."""
        keys = ["a", "b"]

        # --- outer join -----------------------------------------------------
        df1 = DataFrame({"a": [1, 1], "b": [1, 1], "c": [10, 20]})
        df2 = DataFrame({"a": [1, 1], "b": [1, 2], "d": [100, 200]})
        df3 = DataFrame({"a": [1, 1], "b": [1, 2], "e": [1000, 2000]})
        idf1 = df1.set_index(keys)
        idf2 = df2.set_index(keys)
        idf3 = df3.set_index(keys)

        result = idf1.join([idf2, idf3], how="outer").reset_index()

        first_two = merge(df1, df2, on=keys, how="outer")
        expected = merge(first_two, df3, on=keys, how="outer")
        expected = expected[result.columns]
        expected["a"] = expected.a.astype("int64")
        expected["b"] = expected.b.astype("int64")
        tm.assert_frame_equal(result, expected)

        # --- inner join -----------------------------------------------------
        df1 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 1], "c": [10, 20, 30]})
        df2 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "d": [100, 200, 300]})
        df3 = DataFrame({
            "a": [1, 1, 1],
            "b": [1, 1, 2],
            "e": [1000, 2000, 3000],
        })
        idf1 = df1.set_index(keys)
        idf2 = df2.set_index(keys)
        idf3 = df3.set_index(keys)

        result = idf1.join([idf2, idf3], how="inner").reset_index()

        first_two = merge(df1, df2, on=keys, how="inner")
        expected = merge(first_two, df3, on=keys, how="inner")
        tm.assert_frame_equal(result, expected.loc[:, result.columns])

        # GH 11519
        df = DataFrame({
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
            "C": np.random.randn(8),
            "D": np.random.randn(8),
        })
        s = Series(np.repeat(np.arange(8), 2),
                   index=np.repeat(np.arange(8), 2),
                   name="TEST")

        # All four join types are expected to give the same frame here.
        joined = {
            how: df.join(s, how=how)
            for how in ("inner", "outer", "left", "right")
        }
        for how in ("outer", "left", "right"):
            tm.assert_frame_equal(joined["inner"], joined[how])

    def test_join_sort(self):
        """``sort=True`` orders the result by join key; ``sort=False`` keeps
        the caller's row order."""
        left = DataFrame({
            "key": ["foo", "bar", "baz", "foo"],
            "value": [1, 2, 3, 4],
        })
        right = DataFrame({"value2": ["a", "b", "c"]},
                          index=["bar", "baz", "foo"])

        sorted_join = left.join(right, on="key", sort=True)
        expected_values = {
            "key": ["bar", "baz", "foo", "foo"],
            "value": [2, 3, 1, 4],
            "value2": ["a", "b", "c", "c"],
        }
        expected = DataFrame(expected_values, index=[1, 2, 0, 3])
        tm.assert_frame_equal(sorted_join, expected)

        # smoke test
        unsorted_join = left.join(right, on="key", sort=False)
        tm.assert_index_equal(unsorted_join.index, Index(range(4)), exact=True)

    def test_join_mixed_non_unique_index(self):
        """Default join works with a mixed int/str index when either side
        has repeated labels."""
        # GH 12814, unorderable types in py3 with a non-unique index

        # Repeated label (3) on the right-hand side.
        left = DataFrame({"a": [1, 2, 3, 4]}, index=[1, 2, 3, "a"])
        right = DataFrame({"b": [5, 6, 7, 8]}, index=[1, 3, 3, 4])
        result = left.join(right)
        expected = DataFrame(
            {"a": [1, 2, 3, 3, 4], "b": [5, np.nan, 6, 7, np.nan]},
            index=[1, 2, 3, 3, "a"],
        )
        tm.assert_frame_equal(result, expected)

        # Repeated label (2) on the left-hand side.
        left = DataFrame({"a": [1, 2, 3, 4]}, index=[1, 2, 2, "a"])
        right = DataFrame({"b": [5, 6, 7, 8]}, index=[1, 2, 3, 4])
        result = left.join(right)
        expected = DataFrame(
            {"a": [1, 2, 3, 4], "b": [5, 6, 6, np.nan]},
            index=[1, 2, 2, "a"],
        )
        tm.assert_frame_equal(result, expected)

    def test_join_non_unique_period_index(self):
        """Inner join against a frame whose PeriodIndex repeats every label
        twice yields each row twice."""
        # GH #16871
        periods = pd.period_range("2016-01-01", periods=16, freq="M")
        single = DataFrame(list(range(len(periods))),
                           index=periods,
                           columns=["pnum"])
        doubled = concat([single, single])

        result = single.join(doubled, how="inner", rsuffix="_df2")

        # Values 0..15, each repeated twice, in both columns.
        repeated = np.arange(16, dtype=np.int64).repeat(2).reshape(-1, 1)
        expected = DataFrame(
            np.tile(repeated, 2),
            columns=["pnum", "pnum_df2"],
            index=doubled.sort_index().index,
        )
        tm.assert_frame_equal(result, expected)

    def test_mixed_type_join_with_suffix(self):
        """Smoke test: joining groupby mean and count results with a suffix
        must not raise."""
        # GH #916
        frame = DataFrame(np.random.randn(20, 6),
                          columns=["a", "b", "c", "d", "e", "f"])
        frame.insert(0, "id", 0)
        frame.insert(5, "dt", "foo")

        by_id = frame.groupby("id")
        means = by_id.mean()
        counts = by_id.count()

        # it works!
        means.join(counts, rsuffix="_right")

    def test_join_many(self):
        """Joining a list of frames on the index works for the default,
        outer and inner modes, and rejects ``on=``."""
        df = DataFrame(np.random.randn(10, 6), columns=list("abcdef"))
        ab, cd, ef = df[["a", "b"]], df[["c", "d"]], df[["e", "f"]]

        joined = ab.join([cd, ef])
        tm.assert_frame_equal(joined, df)

        # Give each piece a different row range.
        pieces = [ab[:-2], cd[2:], ef[1:9]]

        def check_against(result, exp_index):
            # Expected result: every piece reindexed to exp_index, then joined.
            aligned = [piece.reindex(exp_index) for piece in pieces]
            tm.assert_frame_equal(result, aligned[0].join(aligned[1:]))

        # different join types
        joined = pieces[0].join(pieces[1:], how="outer")
        check_against(joined, df.index)

        joined = pieces[0].join(pieces[1:])
        check_against(joined, pieces[0].index)

        joined = pieces[0].join(pieces[1:], how="inner")
        check_against(joined, df.index[2:8])

        msg = "Joining multiple DataFrames only supported for joining on index"
        with pytest.raises(ValueError, match=msg):
            pieces[0].join(pieces[1:], on="a")

    def test_join_many_mixed(self):
        """Joining float and string column slices rebuilds the original
        frame."""
        df = DataFrame(np.random.randn(8, 4), columns=["A", "B", "C", "D"])
        df["key"] = ["foo", "bar"] * 4

        column_groups = (["A", "B"], ["C", "D"], ["key"])
        first, *others = [df.loc[:, group] for group in column_groups]

        result = first.join(others)
        tm.assert_frame_equal(result, df)

    def test_join_dups(self):
        """Joins and merges work when column names are duplicated."""
        # joining dups
        floats = DataFrame(np.random.randn(10, 4),
                           columns=["A", "A", "B", "B"])
        ints = DataFrame(np.random.randint(0, 10, size=20).reshape(10, 2),
                         columns=["A", "C"])
        df = concat([floats, ints], axis=1)

        expected = concat([df, df], axis=1)
        result = df.join(df, rsuffix="_2")
        # Only the values are compared: the labels are overwritten first.
        result.columns = expected.columns
        tm.assert_frame_equal(result, expected)

        # GH 4975, invalid join on dups
        w = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
        x = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
        y = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
        z = DataFrame(np.random.randn(4, 2), columns=["x", "y"])

        xy = x.merge(y, left_index=True, right_index=True)
        dta = xy.merge(z, left_index=True, right_index=True, how="outer")
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            dta = dta.merge(w, left_index=True, right_index=True)

        expected = concat([x, y, z, w], axis=1)
        expected.columns = [
            "x_x", "y_x", "x_y", "y_y", "x_x", "y_x", "x_y", "y_y"
        ]
        tm.assert_frame_equal(dta, expected)

    def test_join_multi_to_multi(self, join_type):
        """Join a three-level frame to a two-level frame on the shared level
        names; mismatched key counts raise ``ValueError``."""
        # GH 20475
        shared = ["abc", "xy"]

        leftindex = MultiIndex.from_product(
            [list("abc"), list("xy"), [1, 2]], names=["abc", "xy", "num"])
        left = DataFrame({"v1": range(12)}, index=leftindex)

        rightindex = MultiIndex.from_product(
            [list("abc"), list("xy")], names=["abc", "xy"])
        right = DataFrame({"v2": [100 * i for i in range(1, 7)]},
                          index=rightindex)

        result = left.join(right, on=shared, how=join_type)

        # Equivalent merge on the flattened frames, re-indexed afterwards.
        flat = left.reset_index().merge(right.reset_index(),
                                        on=shared,
                                        how=join_type)
        expected = flat.set_index(["abc", "xy", "num"])
        tm.assert_frame_equal(expected, result)

        msg = r'len\(left_on\) must equal the number of levels in the index of "right"'
        with pytest.raises(ValueError, match=msg):
            left.join(right, on="xy", how=join_type)

        with pytest.raises(ValueError, match=msg):
            right.join(left, on=shared, how=join_type)

    def test_join_on_tz_aware_datetimeindex(self):
        """Join a tz-aware date column against a tz-aware DatetimeIndex whose
        range starts two days later."""
        # GH 23931, 26335
        tz = "America/Chicago"
        left_dates = pd.date_range(start="2018-01-01", periods=5, tz=tz)
        df1 = DataFrame({"date": left_dates, "vals": list("abcde")})

        right_dates = pd.date_range(start="2018-01-03", periods=5, tz=tz)
        df2 = DataFrame({"date": right_dates, "vals_2": list("tuvwx")})

        result = df1.join(df2.set_index("date"), on="date")

        # The first two left dates have no match on the right.
        expected = df1.copy()
        expected["vals_2"] = Series([np.nan] * 2 + list("tuv"), dtype=object)
        tm.assert_frame_equal(result, expected)

    def test_join_datetime_string(self):
        """Join on a (datetime, string) column pair against a two-level
        index built from the same kind of columns."""
        # GH 5647
        rows_a = [
            ["2012-08-02", "L", 10],
            ["2012-08-02", "J", 15],
            ["2013-04-06", "L", 20],
            ["2013-04-06", "J", 25],
        ]
        dfa = DataFrame(rows_a, columns=["x", "y", "a"])
        dfa["x"] = pd.to_datetime(dfa["x"])

        rows_b = [["2012-08-02", "J", 1], ["2013-04-06", "L", 2]]
        dfb = DataFrame(rows_b, columns=["x", "y", "z"], index=[2, 4])
        dfb["x"] = pd.to_datetime(dfb["x"])

        keys = ["x", "y"]
        result = dfb.join(dfa.set_index(keys), on=keys)

        expected_rows = [
            [Timestamp("2012-08-02 00:00:00"), "J", 1, 15],
            [Timestamp("2013-04-06 00:00:00"), "L", 2, 20],
        ]
        expected = DataFrame(expected_rows,
                             index=[2, 4],
                             columns=["x", "y", "z", "a"])
        tm.assert_frame_equal(result, expected)

Example #46

0

Show file

File: Weathercondition_ANN_longterm.py Project: sunfeifeier/Classfication-ANN-model

# Load the cleaned weather data and plot how many rows fall under each
# weather condition.
df = read_csv("Phoneix_Finalclean.csv")
# value_counts() gives one count per distinct Conditions_Name value.
aa=df.Conditions_Name.value_counts()
# NOTE(review): aa is a Series, so the x=/y= names passed here are probably
# not used to select data -- confirm and drop them if so.
ax=aa.plot(x='Conditions_Name', y='Amount',kind='bar',color="blue", figsize=(15,8),fontsize=16)
#plt.title("Twelve years' weather condition summary",size=30)
#ax.set_title("2004-2016 Phoenix weather condition summary",size=30)
ax.set_xlabel('Weather Condition',size=20) 
ax.set_ylabel('Total Amount/hour',size=20)                    
plt.show()


# Numeric weather features that are min-max scaled into [0, 1].
feature_cols = ["Sea_Level_PressureIn_N", "Humidity_N", "Dew_PointF_N",
                "Wind_Speed_mps", "Temperature_C_N"]

scaler = MinMaxScaler(feature_range=(0, 1))
scaled_d = df.loc[:, feature_cols]
# fit_transform returns a bare array; rebuild the frame with the original
# column names and df's own index so the join below lines up row for row.
scaled = DataFrame(scaler.fit_transform(scaled_d),
                   columns=feature_cols,
                   index=df.index)

x = scaled.join(df.loc[:, ["Hour", "Conditions_Name"]])
# Keep only the five sky-condition classes used by the model.
x = x.loc[x['Conditions_Name'].isin(["Clear","Mostly Cloudy","Partly Cloudy","Scattered Clouds",'Overcast'])]
x = x.dropna()
count = x.Conditions_Name.value_counts()
print(count)

def encode(data, col, max_val):
    """Add sine and cosine encodings of a cyclical column to ``data``.

    Two columns, ``<col>_sin`` and ``<col>_cos``, are written into ``data``
    in place, computed from the angle ``2 * pi * data[col] / max_val``.

    :param data: frame holding the column; modified in place.
    :param col: name of the column to encode.
    :param max_val: value of ``data[col]`` that maps to a full turn.
    :return: the same ``data`` object, for convenience.
    """
    angle = 2 * np.pi * data[col]/max_val
    for suffix, func in (('_sin', np.sin), ('_cos', np.cos)):
        data[col + suffix] = func(angle)
    return data
# Replace the raw hour with its sine/cosine encoding.
# NOTE(review): with max_val=23, hour 0 and hour 23 map to the same point;
# if Hour runs 0..23 the period should probably be 24 -- confirm before
# changing, since it alters the model inputs.
x = encode(x, 'Hour', 23)
x = x.drop(["Hour"], axis=1)
# set_aspect() returns None, so keep the Axes handle from scatter() itself.
ax = x.plot.scatter('Hour_sin', 'Hour_cos')
ax.set_aspect('equal')
"""
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):

Example #47

0

Show file

File: train_classifier.py Project: ESA-PhiLab/hypernet

def train_and_eval_classifier(dataframe: pd.DataFrame,
                              label_name: str,
                              train_fraction: float,
                              model_name: str,
                              seed: int,
                              verbose: int = 1,
                              n_jobs: int = 4) -> Tuple[pd.DataFrame, Dict]:
    """
    Train and evaluate the classifier given dataset as a dataframe.
    The dataset is a design matrix, where in rows each new observations
    are placed, and the columns denote the explanatory variables.
    The process of finding the best parameters is done by leave one
    out cross validation method, utilizing the accuracy score.

    The label column is one-hot encoded into ``class_*`` columns, the
    training set is drawn per label value (stratified), and every remaining
    row forms the test set.

    :param dataframe: Data collected for the classification problem.
    :param label_name: Name of the label i.e., the dependent variable.
    :param train_fraction: Fraction of samples for
        each class for stratified sampling.
    :param model_name: Name of the utilized model.
    :param seed: Seed used for reproduction of the experiment results.
    :param verbose: Verbosity mode.
    :param n_jobs: Number of jobs utilized for the parallel computing.
    :return: Tuple of the report over the test set as a dataframe
        and the best parameters found as a dictionary.
    """
    # One-hot encode the label; the indicator columns are the model targets.
    dataframe = dataframe.join(
        pd.get_dummies(dataframe[label_name], prefix='class'))
    class_names = [col_name for col_name in dataframe
                   if col_name.startswith('class')]

    # Stratified sampling: group by the column the caller named. (The column
    # used to be hard-coded as 'label', ignoring ``label_name``.)
    train = dataframe.groupby(label_name, group_keys=False).apply(
        lambda class_group: class_group.sample(
            n=ceil(train_fraction * len(class_group)),
            random_state=seed)).drop(columns=label_name)

    # Everything that was not sampled for training is used for testing.
    test = dataframe.drop(train.index).drop(columns=label_name)

    X_train, y_train = train.drop(columns=class_names), train[class_names]
    X_test, y_test = test.drop(columns=class_names), test[class_names]

    model = GridSearchCV(
        estimator=ML_MODELS[model_name](random_state=seed),
        param_grid=ML_MODELS_GRID[model_name],
        cv=LeaveOneOut().split(X_train, y_train),
        scoring=make_scorer(accuracy_score),
        verbose=verbose,
        n_jobs=n_jobs,
        refit=True).fit(X_train, y_train)

    y_test_pred = model.predict(X_test)

    # Collapse the one-hot rows back to class positions for the metrics.
    y_test_true_argmax = y_test.values.argmax(axis=1)
    y_test_pred_argmax = y_test_pred.argmax(axis=1)
    # Map each one-hot column name to its position in ``y_test``.
    class_indices = {class_name: i
                     for i, class_name in enumerate(list(y_test))}

    test_report = pd.DataFrame(
        confusion_matrix(y_true=y_test_true_argmax,
                         y_pred=y_test_pred_argmax,
                         labels=list(class_indices.values())),
        index=['true_' + class_name for class_name in class_indices],
        columns=['pred_' + class_name for class_name in class_indices])

    # The scalar scores go into the first row only; the remaining rows of
    # those two columns are padded with None.
    placeholder = [None] * (len(class_indices) - 1)
    test_report['test_oa_acc'] = [accuracy_score(
        y_true=y_test_true_argmax, y_pred=y_test_pred_argmax)] + placeholder
    test_report['test_avg_acc'] = [balanced_accuracy_score(
        y_true=y_test_true_argmax, y_pred=y_test_pred_argmax)] + placeholder
    return test_report, model.best_params_

Example #48

0

Show file

    def filtered_summaries(
        self,
        start_time,
        end_time,
        interval,
        filter_expression,
        summary_types,
        calculation_basis=None,
        filter_evaluation=None,
        filter_interval=None,
        time_type=None,
    ):
        """filtered_summaries

        Return one or more summary values for each interval within a time
        range, restricted by a filter expression.

        Args:
            start_time (str): Date, and possibly time, from which to retrieve
                the values. Parsed together with `end_time` using
                :afsdk:`AF.Time.AFTimeRange <M_OSIsoft_AF_Time_AFTimeRange__ctor_1.htm>`.
            end_time (str): Date, and possibly time, until which to retrieve
                values. Parsed together with `start_time` using
                :afsdk:`AF.Time.AFTimeRange <M_OSIsoft_AF_Time_AFTimeRange__ctor_1.htm>`.
            interval (str): Interval at which to extract data. Parsed using
                :afsdk:`AF.Time.AFTimeSpan.Parse <M_OSIsoft_AF_Time_AFTimeSpan_Parse_1.htm>`.
            filter_expression (str): Query on which data to include in the
                results. See :ref:`filtering_values` for more information on
                filter queries.
            summary_types (int or PIConsts.SummaryType): Type(s) of summaries
                of the data within the requested time range.
            calculation_basis (int or PIConsts.CalculationBasis, optional):
                Event weighting within an interval. See :ref:`event_weighting`
                and :any:`CalculationBasis` for more information. Defaults to
                CalculationBasis.TIME_WEIGHTED.
            filter_evaluation (int or PIConsts.ExpressionSampleType, optional):
                Determines whether the filter is applied to the raw events in
                the database, or to an interpolated series with a regular
                interval. Defaults to
                ExpressionSampleType.EXPRESSION_RECORDED_VALUES.
            filter_interval (str, optional): Interval at which to apply the
                filter. It is always passed to
                :afsdk:`AF.Time.AFTimeSpan.Parse <M_OSIsoft_AF_Time_AFTimeSpan_Parse_1.htm>`,
                also when left at its default.
            time_type (int or PIConsts.TimestampCalculation, optional):
                Timestamp to return for each of the requested summaries. See
                :ref:`summary_timestamps` and :any:`TimestampCalculation` for
                more information. Defaults to TimestampCalculation.AUTO.

        Returns:
            pandas.DataFrame: Dataframe with the unique timestamps as row index
                and the summary name as column name.
        """
        # Convert the user-facing arguments into their SDK counterparts.
        af_range = AF.Time.AFTimeRange(start_time, end_time)
        af_interval = AF.Time.AFTimeSpan.Parse(interval)
        expression = self._normalize_filter_expression(filter_expression)
        basis = get_enumerated_value(
            enumeration=CalculationBasis,
            value=calculation_basis,
            default=CalculationBasis.TIME_WEIGHTED,
        )
        sample_type = get_enumerated_value(
            enumeration=ExpressionSampleType,
            value=filter_evaluation,
            default=ExpressionSampleType.EXPRESSION_RECORDED_VALUES,
        )
        timestamp_mode = get_enumerated_value(
            enumeration=TimestampCalculation,
            value=time_type,
            default=TimestampCalculation.AUTO,
        )
        af_filter_interval = AF.Time.AFTimeSpan.Parse(filter_interval)

        summaries = self._filtered_summaries(
            af_range,
            af_interval,
            expression,
            summary_types,
            basis,
            sample_type,
            af_filter_interval,
            timestamp_mode,
        )

        # One column per summary type; the outer join keeps every timestamp
        # that occurs in any of the summaries.
        result = DataFrame()
        for summary in summaries:
            column = SummaryType(summary.Key).name
            pairs = [
                (PISeries.timestamp_to_index(item.Timestamp.UtcTime),
                 item.Value)
                for item in summary.Value
            ]
            timestamps, values = zip(*pairs)
            column_frame = DataFrame(data={column: values}, index=timestamps)
            result = result.join(column_frame, how="outer")
        return result

Example #49

0

Show file

# Align the frames in `dfs1` on their 时间 ("time") column.
# First keep only the rows of dfs1[0] whose time also occurs in dfs1[2]; the
# same boolean mask is applied to dfs1[1].
# NOTE(review): assumes dfs1[1] has the same rows, in the same order, as
# dfs1[0] -- confirm.
i = [t in dfs1[2].时间.values for t in dfs1[0].时间.values]
dfs1[0] = dfs1[0][i]
dfs1[1] = dfs1[1][i]

# Then keep only the rows of dfs1[2] whose time occurs in the (already
# filtered) dfs1[0]; the same mask is applied to dfs1[2] .. dfs1[14].
# NOTE(review): assumes those frames share dfs1[2]'s row layout -- confirm.
i = [t in dfs1[0].时间.values for t in dfs1[2].时间.values]
for j in range(2, 15):
    dfs1[j] = dfs1[j][i]

# Use the time column as the index of every frame so they can be joined on it.
for j in range(len(dfs1)):
    dfs1[j] = dfs1[j].set_index("时间")

#dfs1[0] = dfs1[0][t in dfs1[2].时间.values for t in dfs1[0].时间.values]
# tr_data: the first remaining column of every frame, converted to float and
# joined side by side on the time index.
# NOTE(review): joining Series this way presumably needs each one to carry a
# distinct name -- confirm against the input frames.
tr_data = dfs1[0].iloc[:, 0].apply(float)
tr_data = DataFrame(tr_data)
for i in range(1, len(dfs1)):
    tr_data = tr_data.join(dfs1[i].iloc[:, 0].apply(float))

# tr_data1: same construction for the second column of every frame.
tr_data1 = dfs1[0].iloc[:, 1].apply(float)
tr_data1 = DataFrame(tr_data1)
for i in range(1, len(dfs1)):
    tr_data1 = tr_data1.join(dfs1[i].iloc[:, 1].apply(float))

# tr_data2: same construction for the third column of every frame.
tr_data2 = dfs1[0].iloc[:, 2].apply(float)
tr_data2 = DataFrame(tr_data2)
for i in range(1, len(dfs1)):
    tr_data2 = tr_data2.join(dfs1[i].iloc[:, 2].apply(float))

# Plot the correlation matrix of the third-column data as a colour grid.
# Only tr_data2 is used below; tr_data and tr_data1 are not read again in this
# part of the script.
corMat = DataFrame(tr_data2.corr())
plot.pcolor(corMat)
plot.show()

Example #50

0

Show file

File: data.py Project: luispedraza/gasole

    def get_forward_data(self, months, call=True, put=False):
        """
        Gets either call, put, or both data for months starting in the current
        month and going out in the future a specified amount of time.

        Months for which no data can be retrieved or parsed are skipped.

        Parameters
        ----------
        months: number, int
            How many months to go out in the collection of the data. This is
            inclusive.

        call: bool, optional (default=True)
            Whether or not to collect data for call options

        put: bool, optional (default=False)
            Whether or not to collect data for put options.

        Returns
        -------
        all_calls: DataFrame
            If asked for, a DataFrame containing call data from the current
            month to the current month plus months.

        all_puts: DataFrame
            If asked for, a DataFrame containing put data from the current
            month to the current month plus months.

        When both are requested they are returned as the list
        ``[all_calls, all_puts]``; otherwise the single frame is returned.

        Raises
        ------
        ValueError
            If neither ``call`` nor ``put`` is requested.
        """
        if not (call or put):
            raise ValueError("At least one of `call` or `put` must be True.")

        # Materialise the range: the month numbers are wrapped in place below
        # and a bare range() object does not support item assignment.
        in_months = list(range(cur_month, cur_month + months + 1))
        in_years = [cur_year] * months

        # Figure out how many items in in_months go past 12
        to_change = 0
        for i in range(months):
            if in_months[i] > 12:
                in_months[i] -= 12
                to_change += 1

        # Change the corresponding items in the in_years list.
        for i in range(1, to_change + 1):
            in_years[-i] += 1

        def collect(fetch):
            """Fetch one frame per month with ``fetch`` and stack them."""
            collected = DataFrame()
            for mon in range(months):
                try:
                    frame = fetch(in_months[mon], in_years[mon])

                    # The expiry date is encoded as YYMMDD right after the
                    # symbol in the ticker, read from the first row, second
                    # column (positional access replaces the removed ``.ix``).
                    tick = str(frame.iloc[0, 1])
                    start = len(self.symbol)
                    year = tick[start:start + 2]
                    month = tick[start + 2:start + 4]
                    day = tick[start + 4:start + 6]
                    frame['Expiry'] = '-'.join([month, day, year])

                    if mon == 0:
                        collected = collected.join(frame, how='right')
                    else:
                        collected = concat([collected, frame])
                except Exception:
                    # Deliberate best effort: a month without (usable) data is
                    # skipped. Unlike a bare ``except`` this no longer
                    # swallows KeyboardInterrupt / SystemExit.
                    continue
            return collected

        results = []
        if call:
            results.append(collect(self.get_call_data))
        if put:
            results.append(collect(self.get_put_data))

        if call and put:
            return results
        return results[0]

Example #51

0

Show file

File: daily.py Project: suryanuj/stock-comparisons

    def _read_one_data(self, url, params):
        """Read the price history (and optionally actions) for one symbol.

        ``params["symbol"]`` is formatted into ``url`` and removed from
        ``params`` in place; the remaining entries are sent as request
        parameters. The price data is taken from the JSON object embedded in
        the response text.

        Returns a DataFrame indexed by date with the columns High, Low, Open,
        Close, Volume and Adj Close, plus Ret_Index, Dividends and Splits when
        the corresponding options are enabled and data is present.

        Raises RemoteDataError when the embedded price data cannot be found
        or decoded.
        """

        symbol = params["symbol"]
        del params["symbol"]
        url = url.format(symbol)

        resp = self._get_response(url, params=params)
        ptrn = r"root\.App\.main = (.*?);\n}\(this\)\);"
        try:
            j = json.loads(re.search(ptrn, resp.text, re.DOTALL).group(1))
            data = j["context"]["dispatcher"]["stores"]["HistoricalPriceStore"]
        except (AttributeError, KeyError, TypeError, ValueError):
            # AttributeError: the pattern did not match (re.search gave None).
            # ValueError: the matched text is not valid JSON.
            # KeyError / TypeError: the JSON lacks the expected nesting.
            # All of these mean "no usable data in the response".
            msg = "No data fetched for symbol {} using {}"
            raise RemoteDataError(msg.format(symbol, self.__class__.__name__))

        # price data
        prices = DataFrame(data["prices"])
        prices.columns = [col.capitalize() for col in prices.columns]
        # Convert epoch seconds to timestamps truncated to the calendar date.
        prices["Date"] = to_datetime(
            to_datetime(prices["Date"], unit="s").dt.date)

        # Keep only the rows whose "Data" entry is empty.
        # NOTE(review): presumably the other rows describe dividend/split
        # events mixed into the price list -- confirm.
        if "Data" in prices.columns:
            prices = prices[prices["Data"].isnull()]
        prices = prices[[
            "Date", "High", "Low", "Open", "Close", "Volume", "Adjclose"
        ]]
        prices = prices.rename(columns={"Adjclose": "Adj Close"})

        prices = prices.set_index("Date")
        prices = prices.sort_index().dropna(how="all")

        if self.ret_index:
            prices["Ret_Index"] = _calc_return_index(prices["Adj Close"])
        if self.adjust_price:
            prices = _adjust_prices(prices)

        # dividends & splits data
        if self.get_actions and data["eventsData"]:

            actions = DataFrame(data["eventsData"])
            actions.columns = [col.capitalize() for col in actions.columns]
            actions["Date"] = to_datetime(
                to_datetime(actions["Date"], unit="s").dt.date)

            types = actions["Type"].unique()
            if "DIVIDEND" in types:
                divs = actions[actions.Type == "DIVIDEND"].copy()
                divs = divs[["Date", "Amount"]].reset_index(drop=True)
                divs = divs.set_index("Date")
                divs = divs.rename(columns={"Amount": "Dividends"})
                prices = prices.join(divs, how="outer")

            if "SPLIT" in types:

                def split_ratio(row):
                    """Return the split factor for one SPLIT row (1 if none)."""
                    if float(row["Numerator"]) > 0:
                        if ":" in row["Splitratio"]:
                            n, m = row["Splitratio"].split(':')
                            return float(m) / float(n)
                        else:
                            # SECURITY(review): this evaluates text taken from
                            # the remote response; consider parsing the ratio
                            # explicitly instead of using eval().
                            return eval(row["Splitratio"])
                    else:
                        return 1

                splits = actions[actions.Type == "SPLIT"].copy()
                splits["SplitRatio"] = splits.apply(split_ratio, axis=1)
                splits = splits.reset_index(drop=True)
                splits = splits.set_index("Date")
                splits["Splits"] = splits["SplitRatio"]
                prices = prices.join(splits["Splits"], how="outer")

                if "DIVIDEND" in types and not self.adjust_dividends:
                    # dividends are adjusted automatically by Yahoo
                    # Undo that: divide by the cumulative split factor,
                    # accumulated from the most recent date backwards.
                    adj = (prices["Splits"].sort_index(
                        ascending=False).fillna(1).cumprod())
                    prices["Dividends"] = prices["Dividends"] / adj

        return prices

Example #52

0

Show file

File: path.py Project: daniel-code/Access_Control_Patterns_Detection

    def data_encoding(self, raw_data: pd.DataFrame, building_num: int,
                      gates_code_table: dict) -> tuple:
        """
        Encode raw record data from database

        :param raw_data: raw data from DataTable.get_raw_record_data(); the
            columns 'datetime', 'building', 'floor' and 'IO' are read here
        :param building_num: total buildings
        :param gates_code_table: gate code table for mapping code; keys have
            the form '<building>-<floor>-<IO>', unknown keys map to 0

        :return: tuple ``(data_list, target_list)``
            - data_list : Feature of encode data (2-D array)
            - target_list : gate label of encode data, i.e. the gate code of
              the following record (1-D array)
        """
        raw_data = raw_data.join(
            raw_data['datetime'].dt.weekday.rename('week'))
        raw_data = raw_data.reset_index().drop(columns=['index'])
        #############################
        # Feature Encoding          #
        #############################
        # One encoder per categorical feature, each fitted on the full range
        # of codes it can see.
        gate_encoder = OneHotEncoder()
        gate_encoder.fit(np.arange(len(gates_code_table)).reshape(-1, 1))

        week_encoder = OneHotEncoder()
        week_encoder.fit(np.arange(7).reshape(-1, 1))

        building_encoder = OneHotEncoder()
        building_encoder.fit(
            np.arange(1, building_num + 1).reshape(-1, 1))

        # gate one hot encoding
        gate_keys = raw_data['building'].str.cat(
            [raw_data['floor'], raw_data['IO']], sep='-')
        gatecode = gate_keys.apply(
            lambda key: gates_code_table.get(key, 0)
        ).rename('gate').astype(int)
        raw_data['gate'] = gatecode
        # The target of each record is the gate of the next record; the last
        # record therefore has no target (NaN).
        raw_data['next_gate'] = gatecode.shift(-1)

        gatecode_onehotcode = gate_encoder.transform(
            gatecode.values.reshape(-1, 1)).toarray()
        gatecode_onehotcode = pd.DataFrame(
            gatecode_onehotcode, dtype='int').add_prefix('gate_')

        # weekday one hot encoding
        weekdaycode = week_encoder.transform(
            raw_data['week'].values.reshape(-1, 1)).toarray()
        weekdaycode = pd.DataFrame(
            weekdaycode, dtype='int').add_prefix('weekday_')

        # building one hot encoding
        buildingcode = raw_data['building'].astype(int)
        buildingcode_onehotcode = building_encoder.transform(
            buildingcode.values.reshape(-1, 1)).toarray()
        buildingcode_onehotcode = pd.DataFrame(
            buildingcode_onehotcode, dtype='int').add_prefix('building_')

        # Time feature, each component scaled to [0, 1)
        timestamps = raw_data['datetime']
        data_list = pd.DataFrame({
            'hour': timestamps.dt.hour / 24,
            'minute': timestamps.dt.minute / 60,
            'second': timestamps.dt.second / 60,
        })

        # IO code
        IOcode = raw_data['IO'].apply(convert_IOcode)
        # join feature
        data_list = data_list.join(
            other=[IOcode, weekdaycode, buildingcode_onehotcode,
                   gatecode_onehotcode])
        # match order
        # NOTE(review): rows dropped here are not removed from target_list,
        # so the two arrays can differ in length -- confirm callers cope.
        data_list = data_list.dropna(how='any')
        target_list = raw_data['next_gate']

        data_list = data_list.values
        target_list = target_list.values.flatten()
        return data_list, target_list

Example #53

0

Show file

 def test_join_empty_bug(self):
     # generated an exception in 0.4.3
     x = DataFrame()
     x.join(DataFrame([3], index=[0], columns=['A']), how='outer')

Example #54

0

Show file

File: encoders.py Project: rbilleci/pandora

 def fit_transform(self, df: pd.DataFrame) -> pd.DataFrame:
     """Return ``df`` joined (on the index) with ``self.transform(df)``."""
     return df.join(self.transform(df))

Example #55

0

Show file

def align_srt_tshark_stats(stats: pd.DataFrame, rcv_tshark_csv: str):
    """
    Align SRT statistics and tshark data.

    The UMSG_ACK measurements taken from the tshark dump are interpolated
    onto the timepoints of ``stats``; ``stats`` itself is not modified.

    Args:
        stats:
            Aligned SRT statistics collected both at the receiver
            and sender sides, the output from align_srt_stats function.
        rcv_tshark_csv:
            Filepath to .csv tshark data collected at the receiver side.

    Returns:
        Dataframe with one row per ``stats`` timepoint, holding the SRT
        statistics columns together with the interpolated ``*_tshark``
        columns, in a fixed column order.
    """
    print('\nMerging tshark data with SRT statistics')

    # Extract SRT packets from .csv tshark dump file collected at the receiver side
    srt_packets = extract_srt_packets(rcv_tshark_csv)

    print('\nSRT packets extracted from receiver tshark dump')
    print(srt_packets.head(10))

    # Extract UMSG_ACK packets from SRT packets srt_packets that
    # contain receiving speed and bandwidth estimations reported by
    # receiver each 10 ms
    umsg_ack_packets = extract_umsg_ack_packets(srt_packets)

    print('\nUMSG_ACK packets extracted from SRT packets')
    print(umsg_ack_packets.head(10))

    # From umsg_ack_packets dataframe, extract features valuable
    # for further analysis, do some data cleaning and timezone correction
    TSHARK_FEATURES = [
        'ws.no',
        'frame.time',
        'srt.rtt',
        'srt.rttvar',
        'srt.rate',
        'srt.bw',
        'srt.rcvrate'
    ]
    umsg_ack_packets = umsg_ack_packets[TSHARK_FEATURES]
    umsg_ack_packets = umsg_ack_packets.set_index('frame.time')
    # Drop the timezone information from the index.
    umsg_ack_packets.index = umsg_ack_packets.index.tz_convert(None)
    # Scale the RTT columns by 1/1000 (they are renamed to *.ms below).
    umsg_ack_packets['srt.rtt'] = umsg_ack_packets['srt.rtt'] / 1000
    umsg_ack_packets['srt.rttvar'] = umsg_ack_packets['srt.rttvar'] / 1000
    umsg_ack_packets = umsg_ack_packets.rename(
        columns={
            'srt.rtt': 'srt.rtt.ms',
            'srt.rttvar': 'srt.rttvar.ms',
            'srt.rate': 'srt.rate.pkts',
            'srt.bw': 'srt.bw.pkts',
            'srt.rcvrate': 'srt.rate.Bps'
        }
    )
    umsg_ack_packets['srt.rate.Mbps'] = convert_bytesps_in_mbps(
        umsg_ack_packets['srt.rate.Bps']
    )
    umsg_ack_packets['srt.bw.Mbps'] = convert_bytesps_in_mbps(
        convert_pktsps_in_bytesps(umsg_ack_packets['srt.bw.pkts'])
    )
    umsg_ack_packets = umsg_ack_packets[
        [
            'ws.no',
            'srt.rtt.ms',
            'srt.rttvar.ms',
            'srt.rate.pkts',
            'srt.rate.Mbps',
            'srt.bw.pkts',
            'srt.bw.Mbps'
        ]
    ]

    print('\nAdjusted UMSG_ACK packets')
    print(umsg_ack_packets.head(10))
    print(umsg_ack_packets.tail(10))

    # Combine stats dataframe (with SRT statistics) and adjusted
    # umsg_ack_packets dataframe. stats dataframe timepoints will be
    # further used as the timepoints for result dataframe
    start_timestamp = stats.index[0]
    end_timestamp = stats.index[-1]

    cols = ['srt.rtt.ms', 'srt.rttvar.ms', 'srt.rate.Mbps', 'srt.bw.Mbps']
    # assign() marks the stats rows on a copy, so the caller's frame does not
    # gain an 'isStats' column as a side effect.
    df = stats.assign(isStats=True).join(
        umsg_ack_packets[cols].add_suffix('_tshark'), how='outer')
    # Rows that came only from tshark have no marker yet.
    df['isStats'] = df['isStats'].fillna(False).astype(bool)

    df = df[(df.index >= start_timestamp) & (df.index <= end_timestamp)]
    # Sanity check: the window must start and end on a stats timepoint.
    # Positional access (.iloc) is used because the index holds timestamps.
    assert df['isStats'].iloc[0]
    assert df['isStats'].iloc[-1]

    print('\nJoined SRT stats and tshark statistics')
    print(df.head(10))
    print(df.tail(10))

    # Do interpolation (only the tshark columns are touched): fill the gaps
    # at the stats timepoints, back-fill what precedes the first tshark
    # sample, and round to two decimals.
    cols_to_interpolate = [f'{col}_tshark' for col in cols]
    df[cols_to_interpolate] = (
        df[cols_to_interpolate].interpolate().bfill().round(2)
    )

    print('\nInterpolated tshark statistics')
    print(df.head(10))
    print(df.tail(10))

    # Extract only stats dataframe timepoints (aligned SRT stats timepoints)
    df = df.loc[df['isStats'], df.columns != 'isStats']

    cols_to_int = [
        'pktSent_snd',
        'pktSndLoss_snd',
        'pktRecv_rcv',
        'pktRcvLoss_rcv',
    ]
    # Cast the packet counters in one step; astype() returns a new frame, so
    # nothing is assigned into the sliced frame above.
    df = df.astype({col: 'int32' for col in cols_to_int})

    print('\nOnly SRT stats timepoints')
    print(df.head(10))
    print(df.tail(10))

    # Rearrange the columns
    cols_rearranged = [
        'pktSent_snd',
        'pktRecv_rcv',
        'pktSndLoss_snd',
        'pktRcvLoss_rcv',
        'msRTT_snd',
        'msRTT_rcv',
        'srt.rtt.ms_tshark',
        'srt.rttvar.ms_tshark',
        'mbpsBandwidth_snd',
        'mbpsBandwidth_rcv',
        'srt.bw.Mbps_tshark',
        # 'srt.rate.Mbps_tshark'
    ]
    df = df[cols_rearranged]

    return df

Example #56

0

Show file

File: encoders.py Project: rbilleci/pandora

 def transform(self, df: pd.DataFrame) -> pd.DataFrame:
     """Encode the column ``self.name`` and join the result onto ``df``.

     The internal encoder's output loses its 'intercept' column (when it has
     one) and is passed through ``self.update_column_names`` before joining.
     """
     encoded = self._internal_encoder.transform(df[self.name])
     encoded = self.update_column_names(
         encoded.drop(columns=['intercept'], errors='ignore'))
     return df.join(encoded)

Example #57

0

Show file

    def test_left_join_index_multi_match_multiindex(self):
        left = DataFrame(
            [
                ["X", "Y", "C", "a"],
                ["W", "Y", "C", "e"],
                ["V", "Q", "A", "h"],
                ["V", "R", "D", "i"],
                ["X", "Y", "D", "b"],
                ["X", "Y", "A", "c"],
                ["W", "Q", "B", "f"],
                ["W", "R", "C", "g"],
                ["V", "Y", "C", "j"],
                ["X", "Y", "B", "d"],
            ],
            columns=["cola", "colb", "colc", "tag"],
            index=[3, 2, 0, 1, 7, 6, 4, 5, 9, 8],
        )

        right = DataFrame(
            [
                ["W", "R", "C", 0],
                ["W", "Q", "B", 3],
                ["W", "Q", "B", 8],
                ["X", "Y", "A", 1],
                ["X", "Y", "A", 4],
                ["X", "Y", "B", 5],
                ["X", "Y", "C", 6],
                ["X", "Y", "C", 9],
                ["X", "Q", "C", -6],
                ["X", "R", "C", -9],
                ["V", "Y", "C", 7],
                ["V", "R", "D", 2],
                ["V", "R", "D", -1],
                ["V", "Q", "A", -3],
            ],
            columns=["col1", "col2", "col3", "val"],
        ).set_index(["col1", "col2", "col3"])

        result = left.join(right, on=["cola", "colb", "colc"], how="left")

        expected = DataFrame(
            [
                ["X", "Y", "C", "a", 6],
                ["X", "Y", "C", "a", 9],
                ["W", "Y", "C", "e", np.nan],
                ["V", "Q", "A", "h", -3],
                ["V", "R", "D", "i", 2],
                ["V", "R", "D", "i", -1],
                ["X", "Y", "D", "b", np.nan],
                ["X", "Y", "A", "c", 1],
                ["X", "Y", "A", "c", 4],
                ["W", "Q", "B", "f", 3],
                ["W", "Q", "B", "f", 8],
                ["W", "R", "C", "g", 0],
                ["V", "Y", "C", "j", 7],
                ["X", "Y", "B", "d", 5],
            ],
            columns=["cola", "colb", "colc", "tag", "val"],
            index=[3, 3, 2, 0, 1, 1, 7, 6, 6, 4, 4, 5, 9, 8],
        )

        tm.assert_frame_equal(result, expected)

        result = left.join(right,
                           on=["cola", "colb", "colc"],
                           how="left",
                           sort=True)

        expected = expected.sort_values(["cola", "colb", "colc"],
                                        kind="mergesort")

        tm.assert_frame_equal(result, expected)

Example #58

0

Show file

    def test_join_multi_levels(self):

        # GH 3662
        # merge multi-levels
        # `household` has a single-level index (household_id); `portfolio` has
        # a two-level index (household_id, asset_id). The joins below match on
        # the shared level name.
        household = DataFrame(
            dict(
                household_id=[1, 2, 3],
                male=[0, 1, 0],
                wealth=[196087.3, 316478.7, 294750],
            ),
            columns=["household_id", "male", "wealth"],
        ).set_index("household_id")
        portfolio = DataFrame(
            dict(
                household_id=[1, 2, 2, 3, 3, 3, 4],
                asset_id=[
                    "nl0000301109",
                    "nl0000289783",
                    "gb00b03mlx29",
                    "gb00b03mlx29",
                    "lu0197800237",
                    "nl0000289965",
                    np.nan,
                ],
                name=[
                    "ABN Amro",
                    "Robeco",
                    "Royal Dutch Shell",
                    "Royal Dutch Shell",
                    "AAB Eastern Europe Equity Fund",
                    "Postbank BioTech Fonds",
                    np.nan,
                ],
                share=[1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0],
            ),
            columns=["household_id", "asset_id", "name", "share"],
        ).set_index(["household_id", "asset_id"])
        # Inner join: household 4 exists only in `portfolio`, so it is absent
        # from the expected result.
        result = household.join(portfolio, how="inner")
        expected = (DataFrame(
            dict(
                male=[0, 1, 1, 0, 0, 0],
                wealth=[
                    196087.3, 316478.7, 316478.7, 294750.0, 294750.0, 294750.0
                ],
                name=[
                    "ABN Amro",
                    "Robeco",
                    "Royal Dutch Shell",
                    "Royal Dutch Shell",
                    "AAB Eastern Europe Equity Fund",
                    "Postbank BioTech Fonds",
                ],
                share=[1.00, 0.40, 0.60, 0.15, 0.60, 0.25],
                household_id=[1, 2, 2, 3, 3, 3],
                asset_id=[
                    "nl0000301109",
                    "nl0000289783",
                    "gb00b03mlx29",
                    "gb00b03mlx29",
                    "lu0197800237",
                    "nl0000289965",
                ],
            )).set_index([
                "household_id", "asset_id"
            ]).reindex(columns=["male", "wealth", "name", "share"]))
        tm.assert_frame_equal(result, expected)

        # equivalency
        # The same result must come out of merge() on the reset frames.
        result = merge(
            household.reset_index(),
            portfolio.reset_index(),
            on=["household_id"],
            how="inner",
        ).set_index(["household_id", "asset_id"])
        tm.assert_frame_equal(result, expected)

        # Outer join: the inner-join expectation plus one extra row for
        # household 4, which only carries a `share` value.
        result = household.join(portfolio, how="outer")
        expected = concat(
            [
                expected,
                (DataFrame(
                    dict(share=[1.00]),
                    index=MultiIndex.from_tuples(
                        [(4, np.nan)], names=["household_id", "asset_id"]),
                )),
            ],
            axis=0,
            sort=True,
        ).reindex(columns=expected.columns)
        tm.assert_frame_equal(result, expected)

        # invalid cases
        # After renaming, the two indexes no longer share a level name.
        household.index.name = "foo"

        with pytest.raises(
                ValueError,
                match="cannot join with no overlapping index names"):
            household.join(portfolio, how="inner")

        portfolio2 = portfolio.copy()
        # NOTE(review): the return value of set_names() is discarded here, so
        # portfolio2 presumably keeps portfolio's index names -- confirm
        # whether an assignment (or inplace=True) was intended.
        portfolio2.index.set_names(["household_id", "foo"])

        with pytest.raises(ValueError,
                           match="columns overlap but no suffix specified"):
            portfolio2.join(portfolio, how="inner")

Example #59

0

Show file

File: main.py Project: konumaru/moa_prediction

def add_dummies(data: pd.DataFrame, column: str):
    """Replace ``column`` in ``data`` with one indicator column per value.

    The indicator columns come from ``pd.get_dummies`` and are named
    ``"<column>_<value>"``. They are joined (on the index) onto a copy of
    ``data`` that no longer contains ``column``; the frame passed in is not
    modified.
    """
    indicators = pd.get_dummies(data[column]).add_prefix(f'{column}_')
    return data.drop(column, axis=1).join(indicators)

Example #60

0

Show file

 def insert_timeseries(self, df: pd.DataFrame, columns: list, timeseries: dict, interpolate=None, plot=False, title=None, columns_i: list=None, minType=None):
     """Build a regular time series from ``timeseries`` and join it onto ``df``.

     One row is generated for every step from ``self.START_DATE`` to
     ``self.END_DATE`` inclusive (hourly when ``self.TIMESTEP == "hourly"``,
     otherwise daily). Each step is looked up in ``timeseries`` under the key
     ``strftime("%Y-%m-%d %H")``; steps without an entry, and values whose
     integer part is -9998 or -9999, become NaN. The gaps are then filled
     according to ``interpolate`` and the result is outer-joined to ``df``.

     Parameters
     ----------
     df : frame the new columns are joined onto (join is on the index).
     columns : names of the columns to create.
     timeseries : mapping of datestamp string -> sequence of raw values.
     interpolate : gap-filling method; falls back to ``self.INTERPOLATE``
         when falsy. ``"linear"``, ``"slinear"``, ``"quadratic"``,
         ``"cubic"``, ``"values"``, ``"polynomial"`` and ``"spline"`` go
         through ``Series.interpolate``; ``"gaussian"`` delegates to
         ``self.random_gaussian(df, columns)``; anything else is passed to
         ``Series.fillna(method=...)``.
     plot : when true, plot the raw and the filled series with matplotlib.
         Requires ``year``, ``month``, ``day`` and ``hour`` columns in ``df``.
     title : text used in the plot title.
     columns_i : optional indices/keys selecting which entries of each
         ``timeseries`` record to use (in the order of ``columns``).
     minType : optional ``strptime`` format, only used with ``columns_i``.

     Returns
     -------
     ``df`` with the new columns joined in.
     """
     if not interpolate:
         interpolate = self.INTERPOLATE
     step = timedelta(hours=1) if self.TIMESTEP == "hourly" else timedelta(days=1)  # hourly or daily
     width = len(columns_i) if columns_i else len(columns)
     missing_markers = (-9998, -9999)

     rows = []
     i_date = copy.copy(self.START_DATE)
     while i_date <= self.END_DATE:
         datestamp = i_date.strftime("%Y-%m-%d %H")
         if datestamp in timeseries:
             record = timeseries[datestamp]
             raw_items = [record[i] for i in columns_i] if columns_i else record
             values = []
             for item in raw_items:
                 if columns_i and minType:
                     # NOTE(review): a datetime cannot pass the int() check
                     # below nor be stored as float64, so this branch raises
                     # TypeError as written. Intended conversion is unclear;
                     # behaviour kept unchanged -- confirm with the author.
                     v = datetime.strptime(item, minType)
                 else:
                     v = float(item)
                 values.append(np.nan if int(v) in missing_markers else v)
         else:
             values = [np.nan] * width
         rows.append(np.asarray(values, dtype=np.float64))
         i_date = i_date + step

     data_df = pd.DataFrame(rows, columns=columns, dtype=np.float64)
     # Snapshot of the un-interpolated values as a 2-D array for plotting.
     # (A plain list copy cannot be indexed with [:, i].)
     raw_values = data_df.to_numpy(copy=True) if plot else None

     use_gaussian = interpolate == "gaussian"
     if not use_gaussian:
         for c in columns:
             if interpolate in ["linear", "slinear", "quadratic", "cubic", "values"]:
                 data_df[c] = data_df[c].interpolate(method=interpolate).ffill().bfill()
             elif interpolate in ["polynomial", "spline"]:
                 data_df[c] = data_df[c].interpolate(method=interpolate, order=4).ffill().bfill()
             else:
                 data_df[c] = data_df[c].fillna(method=interpolate).ffill().bfill()

     # Join exactly once: joining inside the per-column loop raises
     # "columns overlap but no suffix specified" from the second column on.
     df = df.join(data_df, how='outer')
     if use_gaussian and len(columns) > 0:
         df = self.random_gaussian(df, columns)

     if plot:
         plot_data = pd.DataFrame()
         # Work on a copy so the caller's ``columns`` list is not extended.
         plot_columns = list(columns)
         for i, c in enumerate(columns):
             c0 = c + "_0"
             plot_data[c] = raw_values[:, i]   # raw series, gaps left as NaN
             plot_data[c0] = df[c]             # series after gap filling
             plot_columns.append(c0)
         plot_data["datetime"] = pd.to_datetime(df[["year", "month", "day", "hour"]])
         colors = ['b', 'm', 'g', 'c', 'y', 'k']
         # All series share one axes; the first call creates the figure.
         ax = plot_data.plot(x='datetime', y=plot_columns[0], linewidth=1.0, label=plot_columns[0], color=colors[0], figsize=(16, 8))
         for k in range(1, len(plot_columns)):
             # Cycle through the palette so more than six series still plot.
             plot_data.plot(x='datetime', y=plot_columns[k], linewidth=0.5, label=plot_columns[k], color=colors[k % len(colors)], ax=ax)
         ax.set_title("{} - {} interpolation".format(title, interpolate))
         plt.show()
     return df