def concat(objs):
    """Concatenate dask_gdf objects

    Parameters
    ----------
    objs : sequence of DataFrame, Series, Index
        A sequence of objects to be concatenated.
    """
    objs = [_daskify(x) for x in objs]
    meta = gd.concat(_extract_meta(objs))

    name = "concat-" + uuid4().hex
    dsk = {}
    divisions = [0]
    base = 0
    lastdiv = 0
    for obj in objs:
        # Alias each input partition under the new task name,
        # offsetting partition numbers by those already consumed.
        for k, i in obj.__dask_keys__():
            dsk[name, base + i] = k, i
        base += obj.npartitions
        divisions.extend([d + lastdiv for d in obj.divisions[1:]])
        lastdiv = obj.divisions[-1]

    dasks = [o.dask for o in objs]
    dsk = merge(dsk, *dasks)
    return new_dd_object(dsk, name, meta, divisions)
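# A minimal usage sketch for the concat above. Assumptions: the function is
# exported as dgd.concat, and pygdf / dask_gdf are aliased as gd / dgd (the
# same aliases the tests below use). Concatenation is lazy: partitions are
# relabeled under a new task name and only materialized on compute().
import numpy as np
import pygdf as gd
import dask_gdf as dgd

left = gd.DataFrame()
left['x'] = np.arange(10)
right = gd.DataFrame()
right['x'] = np.arange(10, 20)

dleft = dgd.from_pygdf(left, npartitions=2)
dright = dgd.from_pygdf(right, npartitions=3)

combined = dgd.concat([dleft, dright])  # 2 + 3 = 5 partitions, no compute yet
assert combined.npartitions == 5
assert len(combined.compute()) == 20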
def create_12_mon_features(joined_df, **kwargs):
    testdfs = []
    n_months = 12
    for y in range(1, n_months + 1):
        tmpdf = joined_df[['loan_id', 'timestamp_year', 'timestamp_month',
                           'delinquency_12', 'upb_12']]
        tmpdf['josh_months'] = (tmpdf['timestamp_year'] * 12
                                + tmpdf['timestamp_month'])
        tmpdf['josh_mody_n'] = ((tmpdf['josh_months'].astype('float64')
                                 - 24000 - y) / 12).floor()
        tmpdf = tmpdf.groupby(['loan_id', 'josh_mody_n'],
                              method='hash').agg({'delinquency_12': 'max',
                                                  'upb_12': 'min'})
        tmpdf['delinquency_12'] = (tmpdf['max_delinquency_12'] > 3).astype('int32')
        tmpdf['delinquency_12'] += (tmpdf['min_upb_12'] == 0).astype('int32')
        tmpdf.drop_column('max_delinquency_12')
        tmpdf['upb_12'] = tmpdf['min_upb_12']
        tmpdf.drop_column('min_upb_12')
        tmpdf['timestamp_year'] = (((tmpdf['josh_mody_n'] * n_months)
                                    + 24000 + (y - 1)) / 12).floor().astype('int16')
        tmpdf['timestamp_month'] = np.int8(y)
        tmpdf.drop_column('josh_mody_n')
        testdfs.append(tmpdf)
        del tmpdf
    del joined_df
    return pygdf.concat(testdfs)
def test_dataframe_to_delayed():
    nelem = 100

    df = gd.DataFrame()
    df['x'] = np.arange(nelem)
    df['y'] = np.random.randint(nelem, size=nelem)

    ddf = dgd.from_pygdf(df, npartitions=5)

    delays = ddf.to_delayed()
    assert len(delays) == 5

    # Concat the delayed partitions
    got = gd.concat([d.compute() for d in delays])
    assert_frame_equal(got.to_pandas(), df.to_pandas())

    # Check individual partitions
    divs = ddf.divisions
    assert len(divs) == len(delays) + 1
    for i, part in enumerate(delays):
        s = divs[i]
        # The last division is the last index (inclusive), so the final
        # partition must slice to the end.
        e = None if i + 1 == len(delays) else divs[i + 1]
        expect = df[s:e].to_pandas()
        got = part.compute().to_pandas()
        assert_frame_equal(got, expect)
def test_series_to_delayed():
    nelem = 100

    sr = gd.Series(np.random.randint(nelem, size=nelem))

    dsr = dgd.from_pygdf(sr, npartitions=5)

    delays = dsr.to_delayed()
    assert len(delays) == 5

    # Concat the delayed partitions
    got = gd.concat([d.compute() for d in delays])
    assert isinstance(got, gd.Series)
    np.testing.assert_array_equal(got.to_pandas(), sr.to_pandas())

    # Check individual partitions
    divs = dsr.divisions
    assert len(divs) == len(delays) + 1
    for i, part in enumerate(delays):
        s = divs[i]
        # The last division is the last index (inclusive), so the final
        # partition must slice to the end.
        e = None if i + 1 == len(delays) else divs[i + 1]
        expect = sr[s:e].to_pandas()
        got = part.compute().to_pandas()
        np.testing.assert_array_equal(got, expect)
def concat(*frames):
    frames = list(filter(len, frames))
    if len(frames) > 1:
        return gd.concat(frames)
    elif len(frames) == 1:
        return frames[0]
    else:
        return None
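# Quick sanity sketch of the helper above: empty frames are filtered out,
# a single survivor is returned as-is (no copy), and all-empty input yields
# None rather than an empty frame. Assumes pygdf aliased as gd, as elsewhere.
import pygdf as gd

a = gd.DataFrame()
a['x'] = [1, 2]
empty = gd.DataFrame()
empty['x'] = []

assert concat(a, empty) is a         # the lone non-empty frame passes through
assert concat(empty, empty) is None  # nothing left after filtering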
def test_series_from_delayed():
    delays = [get_combined_column(load_data(10 * i, i))
              for i in range(1, 3)]
    out = dgd.from_delayed(delays)
    res = out.compute()
    assert isinstance(res, gd.Series)

    expected = gd.concat([d.compute() for d in delays])
    np.testing.assert_array_equal(res.to_pandas(), expected.to_pandas())
def test_dataframe_from_delayed():
    delays = [load_data(10 * i, i) for i in range(1, 3)]
    out = dgd.from_delayed(delays)
    res = out.compute()
    assert isinstance(res, gd.DataFrame)

    expected = gd.concat([d.compute() for d in delays])
    assert_frame_equal(res.to_pandas(), expected.to_pandas())
def test_concat(index):
    df, df2, gdf, gdf2 = make_frames(index)

    # DataFrame
    res = gd.concat([gdf, gdf2, gdf]).to_pandas()
    sol = pd.concat([df, df2, df])
    pd.util.testing.assert_frame_equal(res, sol, check_names=False)

    # Series
    for c in [i for i in ('x', 'y', 'z') if i != index]:
        res = gd.concat([gdf[c], gdf2[c], gdf[c]]).to_pandas()
        sol = pd.concat([df[c], df2[c], df[c]])
        pd.util.testing.assert_series_equal(res, sol, check_names=False)

    # Index
    res = gd.concat([gdf.index, gdf2.index]).to_pandas()
    sol = df.index.append(df2.index)
    pd.util.testing.assert_index_equal(res, sol, check_names=False)
def test_concat_misordered_columns():
    df, df2, gdf, gdf2 = make_frames(False)
    gdf2 = gdf2[['z', 'x', 'y']]
    df2 = df2[['z', 'x', 'y']]

    res = gd.concat([gdf, gdf2]).to_pandas()
    sol = pd.concat([df, df2])

    pd.util.testing.assert_frame_equal(res, sol, check_names=False)
def test_dataframe_empty_concat():
    gdf1 = DataFrame()
    gdf1['a'] = []
    gdf1['b'] = []

    gdf2 = gdf1.copy()

    gdf3 = gd.concat([gdf1, gdf2])
    assert len(gdf3) == 0
    assert len(gdf3.columns) == 2
def take(indices, depends):
    first = min(indices)
    last = max(indices)
    others = []
    for d in depends:
        # TODO: this can be replaced with searchsorted
        # Normalize to index data in range before selection.
        firstindex = d.index[0]
        lastindex = d.index[-1]
        s = max(first, firstindex)
        e = min(last, lastindex)
        others.append(d.loc[s:e])
    return gd.concat(others)
def _compare_frame(a, b, max_part_size, by):
    if a is not None and b is not None:
        joint = pygdf.concat([a, b])
        sorten = joint.sort_values(by=by)
        # Split the sorted frame using the *max_part_size*
        lhs, rhs = sorten[:max_part_size], sorten[max_part_size:]
        # Replace empty frame with None
        return lhs or None, rhs or None
    elif a is None and b is None:
        return None, None
    elif a is None:
        return b.sort_values(by=by), None
    else:
        return a.sort_values(by=by), None
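# Sketch of the compare-split step above, the building block of a
# distributed merge sort: two partitions are concatenated, sorted, and
# split so lhs holds the smallest *max_part_size* rows. Assumes pygdf
# frames are falsy when empty (which `lhs or None` already relies on).
import pygdf

a = pygdf.DataFrame()
a['k'] = [5, 1, 3]
b = pygdf.DataFrame()
b['k'] = [4, 2]

lhs, rhs = _compare_frame(a, b, max_part_size=3, by='k')
# lhs['k'] -> 1, 2, 3 and rhs['k'] -> 4, 5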
def shuffle(sr, prefixes, divs, *deps):
    # Map each global index to its target partition.
    idxs = sr.to_array()
    parts = np.asarray(get_parts(idxs, divs))
    partdfs = []
    for p, df in zip(sorted(frozenset(parts)), deps):
        # Rows destined for partition *p*: convert their global indices
        # to local offsets, remembering the original positions so the
        # result can be re-sorted after concatenation.
        cond = parts == p
        valididxs = idxs[cond]
        ordering = np.arange(len(idxs))[cond]
        selected = valididxs - prefixes[p]
        sel = df.take(selected).set_index(ordering)
        partdfs.append(sel)
    # Reassemble in the original order.
    joined = gd.concat(partdfs).sort_index()
    return joined
def test_query_splitted_combine():
    np.random.seed(0)
    df = pd.DataFrame({'x': np.random.randint(0, 5, size=10),
                       'y': np.random.normal(size=10)})
    gdf = DataFrame.from_pandas(df)

    # Split the GDF
    s1 = gdf[:5]
    s2 = gdf[5:]

    # Do the query
    expr = 'x > 2'
    q1 = s1.query(expr)
    q2 = s2.query(expr)

    # Combine
    got = pygdf.concat([q1, q2]).to_pandas()

    # Should equal just querying the original GDF
    expect = gdf.query(expr).to_pandas()
    assert_frame_equal(got, expect)
def test_dataframe_basic():
    np.random.seed(0)
    df = DataFrame()

    # Populate with cuda memory
    df['keys'] = cuda.to_device(np.arange(10, dtype=np.float64))
    np.testing.assert_equal(df['keys'].to_array(), np.arange(10))
    assert len(df) == 10

    # Populate with numpy array
    rnd_vals = np.random.random(10)
    df['vals'] = rnd_vals
    np.testing.assert_equal(df['vals'].to_array(), rnd_vals)
    assert len(df) == 10
    assert df.columns == ('keys', 'vals')

    # Make another dataframe
    df2 = DataFrame()
    df2['keys'] = np.array([123], dtype=np.float64)
    df2['vals'] = np.array([321], dtype=np.float64)

    # Concat
    df = gd.concat([df, df2])
    assert len(df) == 11

    hkeys = np.asarray(np.arange(10, dtype=np.float64).tolist() + [123])
    hvals = np.asarray(rnd_vals.tolist() + [321])

    np.testing.assert_equal(df['keys'].to_array(), hkeys)
    np.testing.assert_equal(df['vals'].to_array(), hvals)

    # As matrix
    mat = df.as_matrix()

    expect = np.vstack([hkeys, hvals]).T

    print(expect)
    print(mat)
    np.testing.assert_equal(mat, expect)
def nlargest_agg(x, **kwargs):
    return gd.concat(x).nlargest(**kwargs)
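# The combiner pattern above (concat the per-partition candidates, then
# re-apply the same reduction) is how tree reductions merge intermediate
# results; the unique_k_agg and nsmallest_agg helpers later in this
# section follow the same shape. A hedged sketch, assuming pygdf Series
# supports nlargest(n=...) as the combiner itself requires:
import numpy as np
import pygdf as gd

part1 = gd.Series(np.array([9, 7, 5]))   # local top-3 of one partition
part2 = gd.Series(np.array([8, 6, 4]))   # local top-3 of another

top3 = nlargest_agg([part1, part2], n=3)  # global top-3: 9, 8, 7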
def do_combine(dfs):
    return combine(pygdf.concat(dfs).groupby(by=by))
def join(df, other, keys):
    # Gather the rows of *other* matching each key (*by* is captured
    # from the enclosing scope), then append them to *df*.
    others = [other.query('{by}==@k'.format(by=by))
              for k in sorted(keys)]
    return gd.concat([df] + others)
def test_concat_errors():
    df, df2, gdf, gdf2 = make_frames()

    # No objs
    with pytest.raises(ValueError):
        gd.concat([])

    # Mismatched types
    with pytest.raises(ValueError):
        gd.concat([gdf, gdf.x])

    # Unknown type
    with pytest.raises(ValueError):
        gd.concat(['bar', 'foo'])

    # Mismatched column dtypes
    with pytest.raises(ValueError):
        gd.concat([gdf.x, gdf.y])
    with pytest.raises(ValueError):
        gd.concat([gdf.x, gdf.z])

    # Mismatched index dtypes
    gdf3 = gdf2.set_index('z')
    gdf2.drop_column('z')
    with pytest.raises(ValueError):
        gd.concat([gdf2, gdf3])

    # Mismatched columns
    with pytest.raises(ValueError):
        gd.concat([gdf, gdf2])
def finalize(results):
    return gd.concat(results)
def unique_k_agg(x, **kwargs):
    return gd.concat(x).unique_k(**kwargs)
def nsmallest_agg(x, **kwargs):
    return gd.concat(x).nsmallest(**kwargs)