Example #1
def test_errors_for_merge_on_frame_columns():
    a = pd.DataFrame({'x': [1, 2, 3, 4, 5]}, index=[1, 2, 3, 4, 5])
    b = pd.DataFrame({'y': [1, 2, 3, 4, 5]}, index=[5, 4, 3, 2, 1])

    aa = dd.from_pandas(a, npartitions=3, sort=False)
    bb = dd.from_pandas(b, npartitions=2)

    with pytest.raises(NotImplementedError):
        dd.merge(aa, bb, left_on='x', right_on=bb.y)

    with pytest.raises(NotImplementedError):
        dd.merge(aa, bb, left_on=aa.x, right_on=bb.y)
Example #2
def test_merge():
    A = pd.DataFrame({'x': [1, 2, 3, 4, 5, 6], 'y': [1, 1, 2, 2, 3, 4]})
    a = dd.repartition(A, [0, 4, 5])

    B = pd.DataFrame({'y': [1, 3, 4, 4, 5, 6], 'z': [6, 5, 4, 3, 2, 1]})
    b = dd.repartition(B, [0, 2, 5])

    list_eq(dd.merge(a, b, left_index=True, right_index=True),
            pd.merge(A, B, left_index=True, right_index=True))

    list_eq(dd.merge(a, b, on='y'),
            pd.merge(A, B, on='y'))

    list_eq(dd.merge(a, b, left_on='x', right_on='z'),
            pd.merge(A, B, left_on='x', right_on='z'))

    list_eq(dd.merge(a, b),
            pd.merge(A, B))

    list_eq(dd.merge(a, B),
            pd.merge(A, B))

    list_eq(dd.merge(A, b),
            pd.merge(A, B))

    list_eq(dd.merge(A, B),
            pd.merge(A, B))

    list_eq(dd.merge(a, b, left_index=True, right_index=True),
            pd.merge(A, B, left_index=True, right_index=True))
Example #3
def execute_cross_join(op, left, right, **kwargs):
    """Execute a cross join in dask.

    Notes
    -----
    We create a dummy column of all :data:`True` instances and use that as the
    join key. This results in the desired Cartesian product behavior guaranteed
    by cross join.

    """
    # generate a unique name for the temporary join key
    key = "cross_join_{}".format(ibis.util.guid())
    join_key = {key: True}
    new_left = left.assign(**join_key)
    new_right = right.assign(**join_key)

    # inner/outer doesn't matter because every row matches every other row
    result = dd.merge(
        new_left,
        new_right,
        how='inner',
        on=key,
        suffixes=constants.JOIN_SUFFIXES,
    )

    # remove the generated key
    del result[key]

    return result
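
The dummy-key trick above also works with plain dask, outside ibis. A minimal sketch under assumed toy frames (the frame names and contents here are hypothetical):

import pandas as pd
import dask.dataframe as dd

left = dd.from_pandas(pd.DataFrame({'x': [1, 2]}), npartitions=1)
right = dd.from_pandas(pd.DataFrame({'y': ['a', 'b']}), npartitions=1)

# every row carries the same key value, so an inner join on it yields the
# Cartesian product of the two frames
key = '_cross_key'
crossed = dd.merge(left.assign(**{key: True}),
                   right.assign(**{key: True}),
                   how='inner', on=key)
del crossed[key]
print(crossed.compute())  # four rows: (1, a), (1, b), (2, a), (2, b)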
Example #4
def combine_data(
    songwriter_df_path=paths["songwriter_df_path"],
    compressed_genre_path=paths["compressed_genre_path"],
    pitch_timbre_df_path=paths["segment_path"],
    song_features_path=paths["song_features_path"],
):
    """Creates dask dataframe of songs with features ready for modeling"""

    list_of_paths = [
        songwriter_df_path,
        compressed_genre_path,
        pitch_timbre_df_path,
        song_features_path,
    ]
    latest_file_list = list(map(find_latest_file, list_of_paths))

    songwriter_df = (
        dd.read_csv(
            latest_file_list[0],
            # should be pd.Int32Dtype() but running into error
            dtype={"IPI": np.float64},
        )
        .rename(columns={"Unnamed: 0": "index"})
        .set_index("index")
    )
    compressed_genre_df = (
        dd.read_csv(latest_file_list[1])
        .rename(columns={"Unnamed: 0": "index"})
        .set_index("index")
    )
    pitch_timbre_df = dd.read_csv(latest_file_list[2]).rename(
        columns={"Unnamed: 0": "track_id"})
    song_features_df = (
        dd.read_csv(latest_file_list[3])
        .rename(columns={"Unnamed: 0": "index"})
        .set_index("index")
    )

    songwriter_and_genres = dd.merge(songwriter_df,
                                     compressed_genre_df,
                                     on="track_id")
    songwriter_genres_and_pt = dd.merge(songwriter_and_genres,
                                        pitch_timbre_df,
                                        on="track_id")
    ready_for_modeling_df = dd.merge(songwriter_genres_and_pt,
                                     song_features_df,
                                     on="track_id")
    return ready_for_modeling_df
Example #5
def process_introns(data_dir, num_samples, num_threads=4):
    def chunks(l, n):
        """Yield successive n-sized chunks from l."""
        for i in range(0, len(l), n):
            yield l[i:i + n]

    dfs = []
    for i in range(num_samples):
        columns = ["chromosome", "start", "end", f"{i+1}_count", "strand"]
        if os.path.exists(data_dir / f'sample_{i+1}.splice.gz'):
            filename = data_dir / f'sample_{i+1}.splice.gz'
            _df = dd.read_csv(filename,
                              sep=' ',
                              blocksize=None,
                              names=columns,
                              usecols=[0, 1, 2, 3, 4],
                              compression='gzip')
        elif os.path.exists(data_dir / f'sample_{i+1}.splice'):
            filename = data_dir / f'sample_{i+1}.splice'
            _df = dd.read_csv(filename,
                              sep=' ',
                              blocksize=None,
                              names=columns,
                              usecols=[0, 1, 2, 3, 4])
        else:
            raise FileNotFoundError(f"Splice file for sample {i+1} doesn't exist!")

        # drop the negative read counts if any
        _df = _df[_df[f"{i+1}_count"] >= 0]
        dfs.append(_df)

    while len(dfs) > 1:
        _list = []
        for chunk in chunks(dfs, 5):
            df = delayed(reduce)(
                lambda x, y: dd.merge(x, y, how='outer',
                                      on=['chromosome', 'start', 'end', 'strand']),
                chunk)
            _list.append(df)
        dfs = _list

    df = compute(*dfs, num_workers=num_threads)[0]
    df.fillna(0, inplace=True)

    if num_samples > 10:
        column_names = list(
            set(df.columns.values) -
            set(['chromosome', 'start', 'end', 'strand']))
        df = df[(df[column_names] > 3).any(axis=1)]

    coord_columns = ['chromosome', 'strand', 'start', 'end']
    index_df = df[coord_columns].copy()
    index_df['index'] = df[coord_columns].apply(lambda x: tuple(x), axis=1)
    index_df.set_index(coord_columns, inplace=True)

    df['index'] = df[coord_columns].apply(lambda x: tuple(x), axis=1)
    df.drop(coord_columns, axis=1, inplace=True)
    df.set_index('index', inplace=True)

    return df, index_df
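
The while-loop above merges the frames in a tree, at most five at a time, so no single task ends up merging everything at once. The same chunked tree-reduction pattern, sketched with plain integers and addition standing in for dd.merge:

from functools import reduce
from dask import compute, delayed

def chunks(items, n):
    """Yield successive n-sized chunks from items."""
    for i in range(0, len(items), n):
        yield items[i:i + n]

vals = list(range(12))
while len(vals) > 1:
    # each chunk of up to five values collapses into one delayed result
    vals = [delayed(reduce)(lambda x, y: x + y, c) for c in chunks(vals, 5)]
print(compute(*vals)[0])  # 66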
Example #6
def count_cross_feat_hour(df, feat_1, feat_2):
    cname = feat_1 + "_" + feat_2 + "hour"
    add = df.groupby(
        [feat_1, feat_2, "hour"],
        sort=False).size().reset_index().rename(columns={0: cname})
    df = dd.merge(df, add, 'left', on=[feat_1, feat_2, "hour"])
    df[cname] = df[cname].astype(np.int32)
    return df
Example #7
def gen_is_first_feat(train_data, feat):
    train_data_2 = train_data.sort_values(by=["user_id", feat, "context_timestamp"], ascending=True)
    first = train_data_2.drop_duplicates(["user_id", feat])
    first['is_first_user_' + feat] = 1
    first = first[["user_id", feat, "context_timestamp", 'is_first_user_' + feat]]
    train_data = dd.merge(train_data, first, how="left", on=["user_id", feat, "context_timestamp"])
    train_data = train_data.fillna({'is_first_user_' + feat: 0})
    first = first.rename(columns={"context_timestamp": "is_first_time_gap_" + feat})[
        ["user_id", feat, "is_first_time_gap_" + feat]]
    train_data = dd.merge(train_data, first, on=["user_id", feat], how="left")
    train_data["is_first_time_gap_" + feat] = (
    train_data["is_first_time_gap_" + feat] - train_data["context_timestamp"]).dt.total_seconds()

    train_data["is_first_time_gap_" + feat] = train_data["is_first_time_gap_" + feat].astype(np.int32)
    train_data['is_first_user_' + feat] = train_data['is_first_user_' + feat].astype(np.int32)
    del train_data_2, first
    return train_data
Example #8
def add_category(wallets, df):
    wallet_owners = wallets[['owner', 'category']].drop_duplicates(
        subset='owner', keep='last').reset_index(drop=True)

    sender = dd.merge(df,
                      wallet_owners,
                      left_on='sender_name',
                      right_on='owner',
                      how='left')
    columns = [
        'receiver_name', 'receiver_category', 'sender_name', 'sender_category'
    ]
    sender = sender.drop(columns, axis=1)
    sender = sender.rename(columns={
        "owner": "sender_name",
        "category": "sender_category"
    })

    receiver = dd.merge(df,
                        wallet_owners,
                        left_on='receiver_name',
                        right_on='owner',
                        how='left')
    columns = [
        'sender_name', 'sender_category', 'receiver_name', 'receiver_category'
    ]
    receiver = receiver.drop(columns, axis=1)
    receiver = receiver.rename(columns={
        "owner": "receiver_name",
        "category": "receiver_category"
    })

    tnx_category = dd.merge(sender,
                            receiver,
                            how='inner',
                            on=[
                                'hash', 'block_timestamp', 'sender',
                                'receiver', 'date', 'btc', 'dollar',
                                'percent_marketcap', 'PriceUSD'
                            ])
    tnx_category = tnx_category[[
        'hash', 'block_timestamp', 'sender', 'receiver', 'btc', 'dollar',
        'PriceUSD', 'percent_marketcap', 'sender_name', 'sender_category',
        'receiver_name', 'receiver_category'
    ]]
    return tnx_category
Example #9
def main():
    # create two dask dataframes (optimized for large datasets)
    df1 = dd.read_csv(FILE1)
    df2 = dd.read_csv(FILE2)

    # merge them by doing inner join operation
    df3 = dd.merge(df1, df2, on=['name', 'age', 'email'], how='inner')

    print(df3.head())
Example #10
def getClustersIndex(clusters, users_genres):
    clusters = dd.from_dask_array(clusters)
    clusters = clusters.reset_index().rename(columns={0: 'cluster'})
    users_genres = users_genres.reset_index()
    clusters_index = dd.merge(users_genres,
                              clusters,
                              left_index=True,
                              right_on='index')
    return clusters_index[['userId', 'cluster']]
Example #11
def test_join(how, left, right, df1, df2):
    expr = left.join(right, left.key == right.key,
                     how=how)[left, right.other_value, right.key3]
    result = expr.compile()
    expected = dd.merge(df1, df2, how=how, on='key')
    tm.assert_frame_equal(
        result[expected.columns].compute(scheduler='single-threaded'),
        expected.compute(scheduler='single-threaded'),
    )
Example #12
def execute_grouped_window_op(
    op,
    data,
    window,
    scope,
    timecontext,
    aggcontext,
    clients,
    **kwargs,
):
    # extract the parent
    (root, ) = op.root_tables()
    root_expr = root.to_expr()

    root_data = execute(
        root_expr,
        scope=scope,
        timecontext=timecontext,
        clients=clients,
        aggcontext=aggcontext,
        **kwargs,
    )

    group_by = window._group_by
    grouping_keys = [
        key_op.name for key_op in map(operator.methodcaller('op'), group_by)
    ]

    grouped_root_data = root_data.groupby(grouping_keys)
    scope = scope.merge_scopes(
        [
            Scope({t: grouped_root_data}, timecontext)
            for t in op.expr.op().root_tables()
        ],
        overwrite=True,
    )

    result = execute_with_scope(
        expr=op.expr,
        scope=scope,
        timecontext=timecontext,
        aggcontext=aggcontext,
        clients=clients,
        **kwargs,
    )
    # If the grouped operation we performed is not an analytic UDF we have to
    # realign the output to the input.
    if not isinstance(op.expr._arg, ops.AnalyticVectorizedUDF):
        result = dd.merge(
            root_data[result.index.name].to_frame(),
            result.to_frame(),
            left_on=result.index.name,
            right_index=True,
        )[result.name]
        result.divisions = root_data.divisions

    return result
Example #13
def test_join_project_left_table(how, left, right, df1, df2):
    expr = left.join(right, left.key == right.key, how=how)[left, right.key3]
    result = expr.compile()
    expected = dd.merge(df1, df2, how=how,
                        on='key')[list(left.columns) + ['key3']]
    tm.assert_frame_equal(
        result[expected.columns].compute(scheduler='single-threaded'),
        expected.compute(scheduler='single-threaded'),
    )
Example #14
def mergeCSV(csv1, csv2, csv1_key, csv2_key, type_join):
    """Join two CSV tables read from files.

    csv1_key is the join key for csv1 and csv2_key the join key for csv2;
    type_join specifies the type of join to perform.
    """
    table1 = dd.read_csv(csv1, low_memory=False)
    table2 = dd.read_csv(csv2, low_memory=False)
    merged_table = dd.merge(table1,
                            table2,
                            left_on=csv1_key,
                            right_on=csv2_key,
                            how=type_join)
    return merged_table
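
A hypothetical call (the file names and key columns are assumptions, not from the original source):

users_orders = mergeCSV('users.csv', 'orders.csv',
                        csv1_key='id', csv2_key='user_id',
                        type_join='left')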
Example #15
def test_cross_join_project_left_table(left, right, df1, df2):
    expr = left.cross_join(right)[left, right.key3]
    result = expr.compile()
    expected = dd.merge(
        df1.assign(dummy=1), df2.assign(dummy=1), how='inner', on='dummy'
    ).rename(columns={'key_x': 'key'})[list(left.columns) + ['key3']]
    tm.assert_frame_equal(
        result[expected.columns].compute(scheduler='single-threaded'),
        expected.compute(scheduler='single-threaded'),
    )
Example #16
def write_out_combined_data():
    state_id = STATE
    households = dd.read_csv(
        r'output/state_{}_puma_*_households.csv'.format(state_id))
    people = dd.read_csv(r'output/state_{}_puma_*_people.csv'.format(state_id))
    combined = dd.merge(people, households, on=[inputs.HOUSEHOLD_ID.name])
    cdf = combined.compute()
    cdf.sort_values('household_id', axis=0, inplace=True)
    cdf.loc[:, 'num_people'] = cdf.num_people.replace('4+', 4).astype(int)
    cdf.to_csv(r'output/state_{}_combined_data_full.csv'.format(STATE))
Example #17
def test_join_with_post_expression_selection(how, left, right, df1, df2):
    join = left.join(right, left.key == right.key, how=how)
    expr = join[left.key, left.value, right.other_value]
    result = expr.compile()
    expected = dd.merge(df1, df2, on='key',
                        how=how)[['key', 'value', 'other_value']]
    tm.assert_frame_equal(
        result[expected.columns].compute(scheduler='single-threaded'),
        expected.compute(scheduler='single-threaded'),
    )
Example #18
def test_merge_by_multiple_columns():

    pdf1l = pd.DataFrame({'a': list('abcdefghij'),
                          'b': list('abcdefghij'),
                          'c': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
                         index=list('abcdefghij'))
    pdf1r = pd.DataFrame({'d': list('abcdefghij'),
                          'e': list('abcdefghij'),
                          'f': [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]},
                         index=list('abcdefghij'))

    pdf2l = pd.DataFrame({'a': list('abcdeabcde'),
                          'b': list('abcabcabca'),
                          'c': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
                         index=list('abcdefghij'))
    pdf2r = pd.DataFrame({'d': list('edcbaedcba'),
                          'e': list('aaabbbcccd'),
                          'f': [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]},
                         index=list('fghijklmno'))

    pdf3l = pd.DataFrame({'a': list('aaaaaaaaaa'),
                          'b': list('aaaaaaaaaa'),
                          'c': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
                         index=list('abcdefghij'))
    pdf3r = pd.DataFrame({'d': list('aaabbbccaa'),
                          'e': list('abbbbbbbbb'),
                          'f': [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]},
                         index=list('ABCDEFGHIJ'))

    for pdl, pdr in [(pdf1l, pdf1r), (pdf2l, pdf2r), (pdf3l, pdf3r)]:

        for lpart, rpart in [(2, 2), (3, 2), (2, 3)]:

            ddl = dd.from_pandas(pdl, lpart)
            ddr = dd.from_pandas(pdr, rpart)

            for how in ['inner', 'outer', 'left', 'right']:
                eq(ddl.join(ddr, how=how), pdl.join(pdr, how=how))
                eq(ddr.join(ddl, how=how), pdr.join(pdl, how=how))

                eq(dd.merge(ddl, ddr, how=how, left_index=True, right_index=True),
                   pd.merge(pdl, pdr, how=how, left_index=True, right_index=True))
                eq(dd.merge(ddr, ddl, how=how, left_index=True, right_index=True),
                   pd.merge(pdr, pdl, how=how, left_index=True, right_index=True))

                # hash join
                list_eq(dd.merge(ddl, ddr, how=how, left_on='a', right_on='d'),
                        pd.merge(pdl, pdr, how=how, left_on='a', right_on='d'))
                list_eq(dd.merge(ddl, ddr, how=how, left_on='b', right_on='e'),
                        pd.merge(pdl, pdr, how=how, left_on='b', right_on='e'))

                list_eq(dd.merge(ddr, ddl, how=how, left_on='d', right_on='a'),
                        pd.merge(pdr, pdl, how=how, left_on='d', right_on='a'))
                list_eq(dd.merge(ddr, ddl, how=how, left_on='e', right_on='b'),
                        pd.merge(pdr, pdl, how=how, left_on='e', right_on='b'))

                list_eq(dd.merge(ddl, ddr, how=how, left_on=['a', 'b'], right_on=['d', 'e']),
                        pd.merge(pdl, pdr, how=how, left_on=['a', 'b'], right_on=['d', 'e']))
Example #19
def _find_objects(ndim, df1, df2):
    """Main utility function for find_objects."""
    meta = dd.utils.make_meta([(i, object) for i in range(ndim)])
    if isinstance(df1, Delayed):
        df1 = dd.from_delayed(df1, meta=meta)
    if isinstance(df2, Delayed):
        df2 = dd.from_delayed(df2, meta=meta)
    ddf = dd.merge(df1, df2, how="outer", left_index=True, right_index=True)
    result = ddf.apply(_merge_bounding_boxes, ndim=ndim, axis=1, meta=meta)
    return result
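
make_meta above builds an empty frame describing the expected column names and dtypes, which dd.from_delayed needs because it cannot peek inside a Delayed object. A minimal sketch of that handoff, with hypothetical columns and data:

import pandas as pd
import dask.dataframe as dd
from dask import delayed

# empty prototype: one int column 'a', one object column 'b' (assumed names)
meta = dd.utils.make_meta([('a', int), ('b', object)])
parts = [delayed(pd.DataFrame)({'a': [1], 'b': ['x']})]
ddf = dd.from_delayed(parts, meta=meta)
print(ddf.compute())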
Example #20
def test_merge_by_multiple_columns(how):

    pdf1l = pd.DataFrame({'a': list('abcdefghij'),
                          'b': list('abcdefghij'),
                          'c': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
                         index=list('abcdefghij'))
    pdf1r = pd.DataFrame({'d': list('abcdefghij'),
                          'e': list('abcdefghij'),
                          'f': [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]},
                         index=list('abcdefghij'))

    pdf2l = pd.DataFrame({'a': list('abcdeabcde'),
                          'b': list('abcabcabca'),
                          'c': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
                         index=list('abcdefghij'))
    pdf2r = pd.DataFrame({'d': list('edcbaedcba'),
                          'e': list('aaabbbcccd'),
                          'f': [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]},
                         index=list('fghijklmno'))

    pdf3l = pd.DataFrame({'a': list('aaaaaaaaaa'),
                          'b': list('aaaaaaaaaa'),
                          'c': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
                         index=list('abcdefghij'))
    pdf3r = pd.DataFrame({'d': list('aaabbbccaa'),
                          'e': list('abbbbbbbbb'),
                          'f': [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]},
                         index=list('ABCDEFGHIJ'))

    for pdl, pdr in [(pdf1l, pdf1r), (pdf2l, pdf2r), (pdf3l, pdf3r)]:

        for lpart, rpart in [(2, 2), (3, 2), (2, 3)]:

            ddl = dd.from_pandas(pdl, lpart)
            ddr = dd.from_pandas(pdr, rpart)

            eq(ddl.join(ddr, how=how), pdl.join(pdr, how=how))
            eq(ddr.join(ddl, how=how), pdr.join(pdl, how=how))

            eq(dd.merge(ddl, ddr, how=how, left_index=True, right_index=True),
               pd.merge(pdl, pdr, how=how, left_index=True, right_index=True))
            eq(dd.merge(ddr, ddl, how=how, left_index=True, right_index=True),
               pd.merge(pdr, pdl, how=how, left_index=True, right_index=True))

            # hash join
            list_eq(dd.merge(ddl, ddr, how=how, left_on='a', right_on='d'),
                    pd.merge(pdl, pdr, how=how, left_on='a', right_on='d'))
            list_eq(dd.merge(ddl, ddr, how=how, left_on='b', right_on='e'),
                    pd.merge(pdl, pdr, how=how, left_on='b', right_on='e'))

            list_eq(dd.merge(ddr, ddl, how=how, left_on='d', right_on='a'),
                    pd.merge(pdr, pdl, how=how, left_on='d', right_on='a'))
            list_eq(dd.merge(ddr, ddl, how=how, left_on='e', right_on='b'),
                    pd.merge(pdr, pdl, how=how, left_on='e', right_on='b'))

            list_eq(dd.merge(ddl, ddr, how=how, left_on=['a', 'b'], right_on=['d', 'e']),
                    pd.merge(pdl, pdr, how=how, left_on=['a', 'b'], right_on=['d', 'e']))
Example #21
def test_cross_join(left, right, df1, df2):
    expr = left.cross_join(right)[left, right.other_value, right.key3]
    result = expr.compile()
    expected = dd.merge(
        df1.assign(dummy=1), df2.assign(dummy=1), how='inner', on='dummy'
    ).rename(columns={'key_x': 'key'})
    del expected['dummy'], expected['key_y']
    tm.assert_frame_equal(
        result[expected.columns].compute(scheduler='single-threaded'),
        expected.compute(scheduler='single-threaded'),
    )
Example #22
def test_join_with_multiple_predicates(how, left, right, df1, df2):
    expr = left.join(
        right, [left.key == right.key, left.key2 == right.key3], how=how
    )[left, right.key3, right.other_value]
    result = expr.compile()
    expected = dd.merge(
        df1, df2, how=how, left_on=['key', 'key2'], right_on=['key', 'key3']
    ).reset_index(drop=True)
    tm.assert_frame_equal(
        result[expected.columns].compute(scheduler='single-threaded'),
        expected.compute(scheduler='single-threaded'),
    )
Example #23
def merge_columns(df1, df2, cols):
    """
    Function to merge datasets into one based on their common column
    df1 = first dataframe
    df2 = second dataframe
    cols = column or columns to merge on
    we want to check and remove the Unnamed:0 column
    """
    data = dd.merge(df1, df2, on=cols)
    if 'Unnamed: 0' in data:
        data = data.drop('Unnamed: 0', axis=1)
    return data
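
A hypothetical call (the file names and the 'id' key are assumptions); the 'Unnamed: 0' check matters because frames written to CSV with a default pandas index come back with that extra column on read:

df1 = dd.read_csv('left.csv')
df2 = dd.read_csv('right.csv')
merged = merge_columns(df1, df2, 'id')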
Example #24
def combine_vitals_stays(df_1, df_2):
    """combine the dataframes"""
    if not os.path.exists('data/vitals_stays.csv'):
        vitals_stays = dd.merge(df_1,
                                df_2,
                                how='inner',
                                on=['SUBJECT_ID', 'HADM_ID'])

        vitals_stays = vitals_stays.compute()
        vitals_stays.to_csv('data/vitals_stays.csv', index=False)

    return dd.read_csv('data/vitals_stays.csv',
                       parse_dates=['ADMITTIME', 'DISCHTIME'])
Example #25
def add_y_sum_counts(df, cfg):

    meta = pd.Series(
        [],
        name="y_sum_total",
        index=pd.Index([], name="tax_id", dtype=int),
        dtype=int,
    )

    ds = df.groupby("tax_id").apply(compute_y_sum_total, cfg, meta=meta)
    ds = ds.reset_index()
    df = dd.merge(df, ds, on=["tax_id"])
    return df
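
The groupby-apply plus merge above is the usual broadcast-back pattern: aggregate per group, then merge the aggregate onto the original rows so every row carries its group's total. A minimal sketch with hypothetical data, using a plain sum in place of compute_y_sum_total:

import pandas as pd
import dask.dataframe as dd

df = dd.from_pandas(
    pd.DataFrame({'tax_id': [1, 1, 2], 'y': [10, 20, 30]}), npartitions=2)
totals = df.groupby('tax_id').y.sum().to_frame('y_sum_total').reset_index()
# every row gains its group's total
df = dd.merge(df, totals, on=['tax_id'])
print(df.compute())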
Example #26
def test_half_indexed_dataframe_avoids_shuffle():
    a = pd.DataFrame({"x": np.random.randint(100, size=1000)})
    b = pd.DataFrame({"y": np.random.randint(100, size=100)}, index=np.random.randint(100, size=100))

    aa = dd.from_pandas(a, npartitions=100)
    bb = dd.from_pandas(b, npartitions=2)

    c = pd.merge(a, b, left_index=True, right_on="y")
    cc = dd.merge(aa, bb, left_index=True, right_on="y", shuffle="tasks")

    list_eq(c, cc)

    assert len(cc.dask) < 500
Example #27
def filterdata_long_dask(inputdf, threshold=None, nr_of_partitions=None):
    #this function was implemented with help of Jose A. Jimenez
    #https://stackoverflow.com/questions/62957110/pandas-selecting-multiple-rows-based-on-column-pair/

    import dask.dataframe as dd

    initialmean = inputdf.loc[inputdf["timepoint"] == 0].mean().array[-1]
    initialsd = inputdf.loc[inputdf["timepoint"] == 0].std().array[-1]

    if threshold is None:
        threshold = initialmean + initialsd
    pre_activated_t0 = inputdf[(inputdf['timepoint'] == 0)
                               & (inputdf['value'] > threshold)]

    pre_activated = inputdf.merge(pre_activated_t0[["measurement", "roi"]],
                                  how="inner",
                                  on=["measurement", "roi"])

    if nr_of_partitions is None:
        nr_of_partitions = 30

    input_dd = dd.from_pandas(inputdf, npartitions=nr_of_partitions)
    preactivated_dd = dd.from_pandas(pre_activated,
                                     npartitions=nr_of_partitions)

    merger = dd.merge(input_dd,
                      preactivated_dd,
                      how="left",
                      on=["timepoint", "measurement", "roi", "value"])
    filtereddf = merger.compute()

    filtereddf = filtereddf[pd.isna(filtereddf["group_y"])]
    filtereddf.drop("group_y", axis=1, inplace=True)
    filtereddf.columns = list(inputdf.columns)

    length_input = len(inputdf[inputdf["timepoint"] == 0])
    length_filtered = len(filtereddf[filtereddf["timepoint"] == 0])
    delta = length_input - length_filtered

    print('Initial Mean: ' + str(initialmean) + '. Initial SD: ' +
          str(initialsd))
    print('Threshold: ' + str(threshold))
    print('Dataframe was filtered')
    print('Total cells: ' + str(length_input))
    print(str(delta) + ' cells were removed')
    print('\n')

    return filtereddf, pre_activated
Example #28
def reading_data(diffuse,data_size1):
    # Import data in h5py
    gammas = h5.File("../../data/3_gen/gammas.hdf5","r")
    # Converting to pandas
    gamma_array_df = pd.DataFrame(data=dict(gammas['array_events']))
    gamma_runs_df = pd.DataFrame(data=dict(gammas['runs']))
    gamma_telescope_df = pd.DataFrame(data=dict(gammas['telescope_events']))

    gamma_array_dd = dd.from_pandas(gamma_array_df,chunksize=1000000)
    gamma_telescope_dd = dd.from_pandas(gamma_telescope_df,chunksize=1000000)


    # merge array and telescope data, shuffle proton and gamma
    gamma_merge = dd.merge(gamma_telescope_dd, gamma_array_dd)
    # there are some NaNs in width that need to be dropped
    gamma_merge = gamma_merge.dropna()
    max_size = gamma_merge.shape[0]
    if(data_size1 < 0):
        data_size = max_size-1
    else:
        data_size = data_size1
    data = gamma_merge[:data_size]

    if(diffuse):
        gammas_diffuse = h5.File("../data/3_gen/gammas_diffuse.hdf5","r")

        gamma_diffuse_array_df = pd.DataFrame(data=dict(gammas_diffuse['array_events']))
        gamma_diffuse_runs_df = pd.DataFrame(data=dict(gammas_diffuse['runs']))
        gamma_diffuse_telescope_df = pd.DataFrame(data=dict(gammas_diffuse['telescope_events']))

        max_size_diffuse = gamma_diffuse_telescope_df.shape[0]
        if(data_size1 < 0):
            data_size = max_size_diffuse-1
        else:
            data_size = data_size1

        gamma_diffuse_array_df = gamma_diffuse_array_df.iloc[:data_size]
        gamma_diffuse_runs_df = gamma_diffuse_runs_df.iloc[:data_size]
        gamma_diffuse_telescope_df = gamma_diffuse_telescope_df.iloc[:data_size]
        gamma_diffuse_merge = pd.merge(gamma_diffuse_array_df,gamma_diffuse_telescope_df,on=list(['array_event_id','run_id']))
        gamma_diffuse_merge = gamma_diffuse_merge.set_index(['run_id','array_event_id'])
        gamma_diffuse_merge = gamma_diffuse_merge.dropna(axis=0)
        gamma_diffuse_merge = gamma_diffuse_merge.reset_index()
        gamma_merge = gamma_merge.reset_index()
        data = pd.concat([gamma_merge,gamma_diffuse_merge])
        data = data.set_index(['run_id','array_event_id'])
        data = data.dropna(axis=1)
        print("Using diffused data...")

    return data
Example #29
def test_half_indexed_dataframe_avoids_shuffle():
    a = pd.DataFrame({'x': np.random.randint(100, size=1000)})
    b = pd.DataFrame({'y': np.random.randint(100, size=100)},
                     index=np.random.randint(100, size=100))

    aa = dd.from_pandas(a, npartitions=100)
    bb = dd.from_pandas(b, npartitions=2)

    c = pd.merge(a, b, left_index=True, right_on='y')
    cc = dd.merge(aa, bb, left_index=True, right_on='y', shuffle='tasks')

    list_eq(c, cc)

    assert len(cc.dask) < 500
Example #30
def test_merge_maintains_columns():
    lhs = pd.DataFrame({'A': [1, 2, 3],
                        'B': list('abc'),
                        'C': 'foo',
                        'D': 1.0},
                       columns=list('DCBA'))
    rhs = pd.DataFrame({'G': [4, 5],
                        'H': 6.0,
                        'I': 'bar',
                        'B': list('ab')},
                       columns=list('GHIB'))
    ddf = dd.from_pandas(lhs, npartitions=1)
    merged = dd.merge(ddf, rhs, on='B').compute()
    assert tuple(merged.columns) == ('D', 'C', 'B', 'A', 'G', 'H', 'I')
Example #31
def gen_is_last(train_data):
    train_data_2 = train_data.sort_values(by=["user_id", "context_timestamp"],
                                          ascending=False)
    last = train_data_2.drop_duplicates(["user_id"])
    last['is_last'] = 1
    last = last[["user_id", "context_timestamp", "is_last"]]
    train_data = dd.merge(train_data,
                          last,
                          how="left",
                          on=["user_id", "context_timestamp"])
    train_data = train_data.fillna({"is_last": 0})
    last = last.rename(columns={"context_timestamp": "is_last_time_gap"})[[
        "user_id", "is_last_time_gap"
    ]]
    train_data = dd.merge(train_data, last, on=["user_id"], how="left")
    train_data["is_last_time_gap"] = (
        train_data["is_last_time_gap"] -
        train_data["context_timestamp"]).dt.total_seconds()
    train_data["is_last_time_gap"] = train_data["is_last_time_gap"].astype(
        np.int32)
    train_data['is_last'] = train_data['is_last'].astype(np.int32)
    del train_data_2, last
    return train_data
Example #32
def test_join_with_project_right_duplicate_column(client, how, left, df1, df3):
    # also test that the order of operands in the predicate doesn't matter
    right = client.table('df3')
    join = left.join(right, ['key'], how=how)
    expr = join[left.key, right.key2, right.other_value]
    result = expr.compile()

    expected = (dd.merge(df1, df3, on='key', how=how).drop(
        ['key2_x', 'key3', 'value'],
        axis=1).rename(columns={'key2_y': 'key2'}))
    tm.assert_frame_equal(
        result[expected.columns].compute(scheduler='single-threaded'),
        expected.compute(scheduler='single-threaded'),
    )
Example #34
def add_diagnosis(vitals_stays):
    """Add diagnosis to dataset"""
    admission = dd.read_csv('data/ADMISSIONS.csv')
    diagnosis = dd.read_csv('data/DIAGNOSES_ICD.csv')

    # combine the data frames
    admission_diag = dd.merge(admission,
                              diagnosis,
                              on=['SUBJECT_ID', 'HADM_ID'],
                              how='outer')
    admission_diag = admission_diag.compute()

    # mask for only the patients in our data
    admission_diag = admission_diag[admission_diag.HADM_ID.isin(
        vitals_stays.HADM_ID.compute().values)]

    # convert ICD9 codes
    e_mask = admission_diag.ICD9_CODE.str.startswith('E')
    # codes starting with 'E' keep their first 4 characters
    admission_diag.loc[e_mask, 'ICD9_CODE'] = (
        admission_diag.loc[e_mask, 'ICD9_CODE'].str[:4])

    # all other codes keep their first 3 characters
    admission_diag.loc[~e_mask, 'ICD9_CODE'] = (
        admission_diag.loc[~e_mask, 'ICD9_CODE'].str[:3])

    # use crosstab to convert to binary matrix
    admission_diag = admission_diag[['HADM_ID', 'ICD9_CODE']]
    admission_diag = np.clip(
        pd.crosstab(admission_diag.HADM_ID, admission_diag.ICD9_CODE), 0, 1)
    admission_diag['HADM_ID'] = admission_diag.index

    final_df = dd.merge(vitals_stays, admission_diag, on='HADM_ID')

    return final_df.compute()
Example #35
def constructDictFromCSVFiles(csv1, csv2, csv1_key, csv2_key, object_name, attribute_name, type_join):
    if csv1 != csv2 or type_join.lower() != 'none':
        print('2 different csvs')
        table1 = dd.read_csv(csv1, header=0, usecols=[csv1_key, object_name], low_memory=False)
        table2 = dd.read_csv(csv2, header=0, usecols=[csv2_key, attribute_name], low_memory=False)
        merged_table = dd.merge(table1, table2, left_on=csv1_key, right_on=csv2_key, how=type_join)
    else:
        print('no joins')
        merged_table = dd.read_csv(csv1, header=0, usecols=[csv1_key, object_name, attribute_name], low_memory=False)
    dico = collections.defaultdict(list)
    for row in merged_table.itertuples():
        key = getattr(row, object_name)
        att = getattr(row, attribute_name)
        dico[key].append(att)
    return dico
Example #36
def test_merge_maintains_columns():
    lhs = pd.DataFrame({"A": [1, 2, 3], "B": list("abc"), "C": "foo", "D": 1.0}, columns=list("DCBA"))
    rhs = pd.DataFrame({"G": [4, 5], "H": 6.0, "I": "bar", "B": list("ab")}, columns=list("GHIB"))
    ddf = dd.from_pandas(lhs, npartitions=1)
    merged = dd.merge(ddf, rhs, on="B").compute()
    assert tuple(merged.columns) == ("D", "C", "B", "A", "G", "H", "I")
Example #37
def test_merge_maintains_columns(lhs, rhs):
    ddf = dd.from_pandas(lhs, npartitions=1)
    merged = dd.merge(ddf, rhs, on='B').compute()
    assert tuple(merged.columns) == ('D', 'C', 'B', 'A', 'G', 'H', 'I')
Example #38
import dask.dataframe as dd

repeats = dd.read_csv("repeats_hg19.csv")

anno = dd.read_table("RRBS_NormalBCD19pCD27pcell1_22_TAGGCATG.CATGAC.dan.anno")

# join annotations to repeats by chromosome, keeping repeat columns distinct
df1 = dd.merge(anno, repeats, on="chr", how="outer", suffixes=("", "_repeat"))
df1.to_csv("find_repeatsTESToutput-*.csv", index=False)

# keep rows whose annotation start falls inside the repeat interval
df1 = df1[(df1.start >= df1.begin) & (df1.start <= df1.end)]
df1 = dd.merge(anno, df1, on=["chr"])
df1.to_csv("find_repeatsTEST2-*.csv", index=False, compute_kwargs={"num_workers": 20})
Example #39
def test_merge_by_index_patterns(how):

    pdf1l = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7],
                          'b': [7, 6, 5, 4, 3, 2, 1]})
    pdf1r = pd.DataFrame({'c': [1, 2, 3, 4, 5, 6, 7],
                          'd': [7, 6, 5, 4, 3, 2, 1]})

    pdf2l = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7],
                          'b': [7, 6, 5, 4, 3, 2, 1]},
                          index=list('abcdefg'))
    pdf2r = pd.DataFrame({'c': [7, 6, 5, 4, 3, 2, 1],
                          'd': [7, 6, 5, 4, 3, 2, 1]},
                          index=list('abcdefg'))

    pdf3l = pdf2l
    pdf3r = pd.DataFrame({'c': [6, 7, 8, 9],
                          'd': [5, 4, 3, 2]},
                          index=list('abdg'))

    pdf4l = pdf2l
    pdf4r = pd.DataFrame({'c': [9, 10, 11, 12],
                          'd': [5, 4, 3, 2]},
                          index=list('abdg'))

    # completely different index
    pdf5l = pd.DataFrame({'a': [1, 1, 2, 2, 3, 3, 4],
                          'b': [7, 6, 5, 4, 3, 2, 1]},
                          index=list('lmnopqr'))
    pdf5r = pd.DataFrame({'c': [1, 1, 1, 1],
                          'd': [5, 4, 3, 2]},
                          index=list('abcd'))

    pdf6l = pd.DataFrame({'a': [1, 1, 2, 2, 3, 3, 4],
                          'b': [7, 6, 5, 4, 3, 2, 1]},
                          index=list('cdefghi'))
    pdf6r = pd.DataFrame({'c': [1, 2, 1, 2],
                          'd': [5, 4, 3, 2]},
                          index=list('abcd'))

    pdf7l = pd.DataFrame({'a': [1, 1, 2, 2, 3, 3, 4],
                          'b': [7, 6, 5, 4, 3, 2, 1]},
                          index=list('abcdefg'))
    pdf7r = pd.DataFrame({'c': [5, 6, 7, 8],
                          'd': [5, 4, 3, 2]},
                          index=list('fghi'))

    for pdl, pdr in [(pdf1l, pdf1r), (pdf2l, pdf2r), (pdf3l, pdf3r),
                     (pdf4l, pdf4r), (pdf5l, pdf5r), (pdf6l, pdf6r),
                     (pdf7l, pdf7r)]:

        for lpart, rpart in [(2, 2),  # same partition
                             (3, 2),  # left npartition > right npartition
                             (2, 3)]: # left npartition < right npartition

            ddl = dd.from_pandas(pdl, lpart)
            ddr = dd.from_pandas(pdr, rpart)

            eq(dd.merge(ddl, ddr, how=how, left_index=True, right_index=True),
               pd.merge(pdl, pdr, how=how, left_index=True, right_index=True))
            eq(dd.merge(ddr, ddl, how=how, left_index=True, right_index=True),
               pd.merge(pdr, pdl, how=how, left_index=True, right_index=True))

            eq(ddr.merge(ddl, how=how, left_index=True, right_index=True),
               pdr.merge(pdl, how=how, left_index=True, right_index=True))
            eq(ddl.merge(ddr, how=how, left_index=True, right_index=True),
               pdl.merge(pdr, how=how, left_index=True, right_index=True))

            # hash join
            list_eq(dd.merge(ddl, ddr, how=how, left_on='a', right_on='c'),
                    pd.merge(pdl, pdr, how=how, left_on='a', right_on='c'))
            list_eq(dd.merge(ddl, ddr, how=how, left_on='b', right_on='d'),
                    pd.merge(pdl, pdr, how=how, left_on='b', right_on='d'))

            list_eq(dd.merge(ddr, ddl, how=how, left_on='c', right_on='a'),
                    pd.merge(pdr, pdl, how=how, left_on='c', right_on='a'))
            list_eq(dd.merge(ddr, ddl, how=how, left_on='d', right_on='b'),
                    pd.merge(pdr, pdl, how=how, left_on='d', right_on='b'))

            list_eq(ddl.merge(ddr, how=how, left_on='a', right_on='c'),
                    pdl.merge(pdr, how=how, left_on='a', right_on='c'))
            list_eq(ddl.merge(ddr, how=how, left_on='b', right_on='d'),
                    pdl.merge(pdr, how=how, left_on='b', right_on='d'))

            list_eq(ddr.merge(ddl, how=how, left_on='c', right_on='a'),
                    pdr.merge(pdl, how=how, left_on='c', right_on='a'))
            list_eq(ddr.merge(ddl, how=how, left_on='d', right_on='b'),
                    pdr.merge(pdl, how=how, left_on='d', right_on='b'))
Example #40
def test_merge(how):
    A = pd.DataFrame({'x': [1, 2, 3, 4, 5, 6], 'y': [1, 1, 2, 2, 3, 4]})
    a = dd.repartition(A, [0, 4, 5])

    B = pd.DataFrame({'y': [1, 3, 4, 4, 5, 6], 'z': [6, 5, 4, 3, 2, 1]})
    b = dd.repartition(B, [0, 2, 5])

    eq(dd.merge(a, b, left_index=True, right_index=True),
       pd.merge(A, B, left_index=True, right_index=True))

    result = dd.merge(a, b, on='y', how=how)
    list_eq(result, pd.merge(A, B, on='y', how=how))
    assert all(d is None for d in result.divisions)

    list_eq(dd.merge(a, b, left_on='x', right_on='z', how=how),
            pd.merge(A, B, left_on='x', right_on='z', how=how))
    list_eq(dd.merge(a, b, left_on='x', right_on='z', how=how,
                     suffixes=('1', '2')),
            pd.merge(A, B, left_on='x', right_on='z', how=how,
                     suffixes=('1', '2')))

    list_eq(dd.merge(a, b, how=how), pd.merge(A, B, how=how))
    list_eq(dd.merge(a, B, how=how), pd.merge(A, B, how=how))
    list_eq(dd.merge(A, b, how=how), pd.merge(A, B, how=how))
    list_eq(dd.merge(A, B, how=how), pd.merge(A, B, how=how))

    list_eq(dd.merge(a, b, left_index=True, right_index=True, how=how),
            pd.merge(A, B, left_index=True, right_index=True, how=how))
    list_eq(dd.merge(a, b, left_index=True, right_index=True, how=how,
                     suffixes=('1', '2')),
            pd.merge(A, B, left_index=True, right_index=True, how=how,
                     suffixes=('1', '2')))

    list_eq(dd.merge(a, b, left_on='x', right_index=True, how=how),
            pd.merge(A, B, left_on='x', right_index=True, how=how))
    list_eq(dd.merge(a, b, left_on='x', right_index=True, how=how,
                     suffixes=('1', '2')),
            pd.merge(A, B, left_on='x', right_index=True, how=how,
                     suffixes=('1', '2')))
Example #41
def test_merge_by_multiple_columns(how, shuffle):
    pdf1l = pd.DataFrame(
        {"a": list("abcdefghij"), "b": list("abcdefghij"), "c": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
        index=list("abcdefghij"),
    )
    pdf1r = pd.DataFrame(
        {"d": list("abcdefghij"), "e": list("abcdefghij"), "f": [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]},
        index=list("abcdefghij"),
    )

    pdf2l = pd.DataFrame(
        {"a": list("abcdeabcde"), "b": list("abcabcabca"), "c": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
        index=list("abcdefghij"),
    )
    pdf2r = pd.DataFrame(
        {"d": list("edcbaedcba"), "e": list("aaabbbcccd"), "f": [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]},
        index=list("fghijklmno"),
    )

    pdf3l = pd.DataFrame(
        {"a": list("aaaaaaaaaa"), "b": list("aaaaaaaaaa"), "c": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
        index=list("abcdefghij"),
    )
    pdf3r = pd.DataFrame(
        {"d": list("aaabbbccaa"), "e": list("abbbbbbbbb"), "f": [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]},
        index=list("ABCDEFGHIJ"),
    )

    for pdl, pdr in [(pdf1l, pdf1r), (pdf2l, pdf2r), (pdf3l, pdf3r)]:

        for lpart, rpart in [(2, 2), (3, 2), (2, 3)]:

            ddl = dd.from_pandas(pdl, lpart)
            ddr = dd.from_pandas(pdr, rpart)

            eq(ddl.join(ddr, how=how, shuffle=shuffle), pdl.join(pdr, how=how))
            eq(ddr.join(ddl, how=how, shuffle=shuffle), pdr.join(pdl, how=how))

            eq(
                dd.merge(ddl, ddr, how=how, left_index=True, right_index=True, shuffle=shuffle),
                pd.merge(pdl, pdr, how=how, left_index=True, right_index=True),
            )
            eq(
                dd.merge(ddr, ddl, how=how, left_index=True, right_index=True, shuffle=shuffle),
                pd.merge(pdr, pdl, how=how, left_index=True, right_index=True),
            )

            # hash join
            list_eq(
                dd.merge(ddl, ddr, how=how, left_on="a", right_on="d", shuffle=shuffle),
                pd.merge(pdl, pdr, how=how, left_on="a", right_on="d"),
            )
            list_eq(
                dd.merge(ddl, ddr, how=how, left_on="b", right_on="e", shuffle=shuffle),
                pd.merge(pdl, pdr, how=how, left_on="b", right_on="e"),
            )

            list_eq(
                dd.merge(ddr, ddl, how=how, left_on="d", right_on="a", shuffle=shuffle),
                pd.merge(pdr, pdl, how=how, left_on="d", right_on="a"),
            )
            list_eq(
                dd.merge(ddr, ddl, how=how, left_on="e", right_on="b", shuffle=shuffle),
                pd.merge(pdr, pdl, how=how, left_on="e", right_on="b"),
            )

            list_eq(
                dd.merge(ddl, ddr, how=how, left_on=["a", "b"], right_on=["d", "e"], shuffle=shuffle),
                pd.merge(pdl, pdr, how=how, left_on=["a", "b"], right_on=["d", "e"]),
            )
Example #42
def test_merge_maintains_columns(lhs, rhs):
    ddf = dd.from_pandas(lhs, npartitions=1)
    merged = dd.merge(ddf, rhs, on="B").compute()
    assert tuple(merged.columns) == ("D", "C", "B", "A", "G", "H", "I")
Example #43
def test_merge_by_index_patterns(how, shuffle):

    pdf1l = pd.DataFrame({"a": [1, 2, 3, 4, 5, 6, 7], "b": [7, 6, 5, 4, 3, 2, 1]})
    pdf1r = pd.DataFrame({"c": [1, 2, 3, 4, 5, 6, 7], "d": [7, 6, 5, 4, 3, 2, 1]})

    pdf2l = pd.DataFrame({"a": [1, 2, 3, 4, 5, 6, 7], "b": [7, 6, 5, 4, 3, 2, 1]}, index=list("abcdefg"))
    pdf2r = pd.DataFrame({"c": [7, 6, 5, 4, 3, 2, 1], "d": [7, 6, 5, 4, 3, 2, 1]}, index=list("abcdefg"))

    pdf3l = pdf2l
    pdf3r = pd.DataFrame({"c": [6, 7, 8, 9], "d": [5, 4, 3, 2]}, index=list("abdg"))

    pdf4l = pdf2l
    pdf4r = pd.DataFrame({"c": [9, 10, 11, 12], "d": [5, 4, 3, 2]}, index=list("abdg"))

    # completely different index
    pdf5l = pd.DataFrame({"a": [1, 1, 2, 2, 3, 3, 4], "b": [7, 6, 5, 4, 3, 2, 1]}, index=list("lmnopqr"))
    pdf5r = pd.DataFrame({"c": [1, 1, 1, 1], "d": [5, 4, 3, 2]}, index=list("abcd"))

    pdf6l = pd.DataFrame({"a": [1, 1, 2, 2, 3, 3, 4], "b": [7, 6, 5, 4, 3, 2, 1]}, index=list("cdefghi"))
    pdf6r = pd.DataFrame({"c": [1, 2, 1, 2], "d": [5, 4, 3, 2]}, index=list("abcd"))

    pdf7l = pd.DataFrame({"a": [1, 1, 2, 2, 3, 3, 4], "b": [7, 6, 5, 4, 3, 2, 1]}, index=list("abcdefg"))
    pdf7r = pd.DataFrame({"c": [5, 6, 7, 8], "d": [5, 4, 3, 2]}, index=list("fghi"))

    for pdl, pdr in [
        (pdf1l, pdf1r),
        (pdf2l, pdf2r),
        (pdf3l, pdf3r),
        (pdf4l, pdf4r),
        (pdf5l, pdf5r),
        (pdf6l, pdf6r),
        (pdf7l, pdf7r),
    ]:

        for lpart, rpart in [
            (2, 2),  # same partition
            (3, 2),  # left npartition > right npartition
            (2, 3),  # left npartition < right npartition
        ]:

            ddl = dd.from_pandas(pdl, lpart)
            ddr = dd.from_pandas(pdr, rpart)

            eq(
                dd.merge(ddl, ddr, how=how, left_index=True, right_index=True, shuffle=shuffle),
                pd.merge(pdl, pdr, how=how, left_index=True, right_index=True),
            )
            eq(
                dd.merge(ddr, ddl, how=how, left_index=True, right_index=True, shuffle=shuffle),
                pd.merge(pdr, pdl, how=how, left_index=True, right_index=True),
            )

            eq(
                ddr.merge(ddl, how=how, left_index=True, right_index=True, shuffle=shuffle),
                pdr.merge(pdl, how=how, left_index=True, right_index=True),
            )
            eq(
                ddl.merge(ddr, how=how, left_index=True, right_index=True, shuffle=shuffle),
                pdl.merge(pdr, how=how, left_index=True, right_index=True),
            )

            # hash join
            list_eq(
                dd.merge(ddl, ddr, how=how, left_on="a", right_on="c", shuffle=shuffle),
                pd.merge(pdl, pdr, how=how, left_on="a", right_on="c"),
            )
            list_eq(
                dd.merge(ddl, ddr, how=how, left_on="b", right_on="d", shuffle=shuffle),
                pd.merge(pdl, pdr, how=how, left_on="b", right_on="d"),
            )

            list_eq(
                dd.merge(ddr, ddl, how=how, left_on="c", right_on="a", shuffle=shuffle),
                pd.merge(pdr, pdl, how=how, left_on="c", right_on="a"),
            )
            list_eq(
                dd.merge(ddr, ddl, how=how, left_on="d", right_on="b", shuffle=shuffle),
                pd.merge(pdr, pdl, how=how, left_on="d", right_on="b"),
            )

            list_eq(
                ddl.merge(ddr, how=how, left_on="a", right_on="c", shuffle=shuffle),
                pdl.merge(pdr, how=how, left_on="a", right_on="c"),
            )
            list_eq(
                ddl.merge(ddr, how=how, left_on="b", right_on="d", shuffle=shuffle),
                pdl.merge(pdr, how=how, left_on="b", right_on="d"),
            )

            list_eq(
                ddr.merge(ddl, how=how, left_on="c", right_on="a", shuffle=shuffle),
                pdr.merge(pdl, how=how, left_on="c", right_on="a"),
            )
            list_eq(
                ddr.merge(ddl, how=how, left_on="d", right_on="b", shuffle=shuffle),
                pdr.merge(pdl, how=how, left_on="d", right_on="b"),
            )
Example #44
def test_merge(how, shuffle):
    A = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1, 1, 2, 2, 3, 4]})
    a = dd.repartition(A, [0, 4, 5])

    B = pd.DataFrame({"y": [1, 3, 4, 4, 5, 6], "z": [6, 5, 4, 3, 2, 1]})
    b = dd.repartition(B, [0, 2, 5])

    eq(
        dd.merge(a, b, left_index=True, right_index=True, shuffle=shuffle),
        pd.merge(A, B, left_index=True, right_index=True),
    )

    result = dd.merge(a, b, on="y", how=how)
    list_eq(result, pd.merge(A, B, on="y", how=how))
    assert all(d is None for d in result.divisions)

    list_eq(
        dd.merge(a, b, left_on="x", right_on="z", how=how, shuffle=shuffle),
        pd.merge(A, B, left_on="x", right_on="z", how=how),
    )
    list_eq(
        dd.merge(a, b, left_on="x", right_on="z", how=how, suffixes=("1", "2"), shuffle=shuffle),
        pd.merge(A, B, left_on="x", right_on="z", how=how, suffixes=("1", "2")),
    )

    list_eq(dd.merge(a, b, how=how, shuffle=shuffle), pd.merge(A, B, how=how))
    list_eq(dd.merge(a, B, how=how, shuffle=shuffle), pd.merge(A, B, how=how))
    list_eq(dd.merge(A, b, how=how, shuffle=shuffle), pd.merge(A, B, how=how))
    list_eq(dd.merge(A, B, how=how, shuffle=shuffle), pd.merge(A, B, how=how))

    list_eq(
        dd.merge(a, b, left_index=True, right_index=True, how=how, shuffle=shuffle),
        pd.merge(A, B, left_index=True, right_index=True, how=how),
    )
    list_eq(
        dd.merge(a, b, left_index=True, right_index=True, how=how, suffixes=("1", "2"), shuffle=shuffle),
        pd.merge(A, B, left_index=True, right_index=True, how=how, suffixes=("1", "2")),
    )

    list_eq(
        dd.merge(a, b, left_on="x", right_index=True, how=how, shuffle=shuffle),
        pd.merge(A, B, left_on="x", right_index=True, how=how),
    )
    list_eq(
        dd.merge(a, b, left_on="x", right_index=True, how=how, suffixes=("1", "2"), shuffle=shuffle),
        pd.merge(A, B, left_on="x", right_index=True, how=how, suffixes=("1", "2")),
    )
Example #45
def test_merge_by_index_patterns():

    pdf1l = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7],
                          'b': [7, 6, 5, 4, 3, 2, 1]})
    pdf1r = pd.DataFrame({'c': [1, 2, 3, 4, 5, 6, 7],
                          'd': [7, 6, 5, 4, 3, 2, 1]})

    pdf2l = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7],
                          'b': [7, 6, 5, 4, 3, 2, 1]},
                          index=list('abcdefg'))
    pdf2r = pd.DataFrame({'c': [1, 2, 3, 4, 5, 6, 7],
                          'd': [7, 6, 5, 4, 3, 2, 1]},
                          index=list('abcdefg'))

    pdf3l = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7],
                          'b': [7, 6, 5, 4, 3, 2, 1]},
                          index=list('abcdefg'))
    pdf3r = pd.DataFrame({'c': [1, 2, 3, 4],
                          'd': [5, 4, 3, 2]},
                          index=list('abdg'))

    pdf4r = pd.DataFrame({'c': [1, 2, 3, 4],
                          'd': [5, 4, 3, 2]},
                          index=list('abdg'))
    pdf4l = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7],
                          'b': [7, 6, 5, 4, 3, 2, 1]},
                          index=list('abcdefg'))

    # completely different index
    pdf5r = pd.DataFrame({'c': [1, 2, 3, 4],
                          'd': [5, 4, 3, 2]},
                          index=list('abcd'))
    pdf5l = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7],
                          'b': [7, 6, 5, 4, 3, 2, 1]},
                          index=list('lmnopqr'))

    pdf6r = pd.DataFrame({'c': [1, 2, 3, 4],
                          'd': [5, 4, 3, 2]},
                          index=list('abcd'))
    pdf6l = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7],
                          'b': [7, 6, 5, 4, 3, 2, 1]},
                          index=list('cdefghi'))

    pdf7r = pd.DataFrame({'c': [1, 2, 3, 4],
                          'd': [5, 4, 3, 2]},
                          index=list('fghi'))
    pdf7l = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7],
                          'b': [7, 6, 5, 4, 3, 2, 1]},
                          index=list('abcdefg'))

    for pdl, pdr in [(pdf1l, pdf1r), (pdf2l, pdf2r), (pdf3l, pdf3r),
                     (pdf4l, pdf4r), (pdf5r, pdf5l), (pdf6r, pdf6l),
                     (pdf7r, pdf7l)]:
        # same partition
        ddl = dd.from_pandas(pdl, 2)
        ddr = dd.from_pandas(pdr, 2)

        for how in ['inner', 'outer', 'left', 'right']:
            eq(dd.merge(ddl, ddr, how=how, left_index=True, right_index=True),
               pd.merge(pdl, pdr, how=how, left_index=True, right_index=True))

        # different partition (left npartition > right npartition)
        ddl = dd.from_pandas(pdl, 3)
        ddr = dd.from_pandas(pdr, 2)

        for how in ['inner', 'outer', 'left', 'right']:
            eq(dd.merge(ddl, ddr, how=how, left_index=True, right_index=True),
               pd.merge(pdl, pdr, how=how, left_index=True, right_index=True))

        # different partition (left npartition < right npartition)
        ddl = dd.from_pandas(pdl, 2)
        ddr = dd.from_pandas(pdr, 3)

        for how in ['inner', 'outer', 'left', 'right']:
            eq(dd.merge(ddl, ddr, how=how, left_index=True, right_index=True),
               pd.merge(pdl, pdr, how=how, left_index=True, right_index=True))