Example #1
    def test_benchmark_pandas(self):
        imdb = fetch_imdb_dataset()

        # Variant 1: join first, then filter.
        start_time = time.time()
        trainable = Join(pred=[it.movies_directors.movie_id == it.movies.id],
                         join_type="inner")
        transformed_df = trainable.transform(imdb)
        trainable = Filter(pred=[it["director_id"] == 8])
        filtered_df = trainable.transform(transformed_df)
        self.assertEqual(filtered_df.shape, (35, 6))
        join_first = time.time() - start_time
        logger.info(
            " Pandas: Join Before Filter --- {} seconds".format(join_first))

        # Variant 2: filter first, then join.
        movies_directors = imdb[4]
        self.assertEqual(get_table_name(movies_directors), "movies_directors")
        start_time = time.time()
        trainable = Filter(pred=[it["director_id"] == 8])
        filtered_df = trainable.transform(movies_directors)
        self.assertEqual(get_table_name(filtered_df), "movies_directors")
        # Replace the movies_directors table with its filtered version.
        imdb.pop(4)
        imdb.append(filtered_df)
        trainable = Join(pred=[it.movies_directors.movie_id == it.movies.id],
                         join_type="inner")
        transformed_df = trainable.transform(imdb)
        self.assertEqual(transformed_df.shape, (35, 6))
        filter_first = time.time() - start_time
        logger.info(
            " Pandas: Join After Filter --- {} seconds".format(filter_first))
Example #2
 def transform(self, X):
     named_datasets = {get_table_name(d): d for d in X}
     if self.table_name in named_datasets:
         return named_datasets[self.table_name]
     raise ValueError(
         f"could not find '{self.table_name}' in {list(named_datasets.keys())}"
     )
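
This transform picks one table out of a list by its attached name. A self-contained sketch of the same behavior, using a hypothetical stand-in class _TableScan since the enclosing class is not shown:

import pandas as pd
from lale.datasets.data_schemas import add_table_name, get_table_name  # assumed path

class _TableScan:  # hypothetical stand-in for the snippet's enclosing class
    def __init__(self, table_name):
        self.table_name = table_name

    def transform(self, X):
        named_datasets = {get_table_name(d): d for d in X}
        if self.table_name in named_datasets:
            return named_datasets[self.table_name]
        raise ValueError(
            f"could not find '{self.table_name}' in {list(named_datasets.keys())}")

tables = [add_table_name(pd.DataFrame({"x": [1]}), "a"),
          add_table_name(pd.DataFrame({"y": [2]}), "b")]
print(_TableScan("b").transform(tables))  # prints the dataframe named "b"
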
Example #3
    def transform_pandas_df(self, X):
        mapped_df = pd.DataFrame()
        accessed_column_names = set()

        def get_map_function_output(column, new_column_name):
            # Evaluate one column expression against X and record which
            # input columns it touched (needed for remainder handling below).
            _validate(X, column)
            new_column_name = _new_column_name(new_column_name, column)
            new_column = eval_expr_pandas_df(X, column)
            mapped_df[new_column_name] = new_column
            accessed_column_names.add(new_column_name)
            accessed_column_names.update(_accessed_columns(column))

        columns = self.columns
        if callable(columns):
            columns = columns(X)

        if isinstance(columns, list):
            for column in columns:
                get_map_function_output(column, None)
        elif isinstance(columns, dict):
            for new_column_name, column in columns.items():
                get_map_function_output(column, new_column_name)
        else:
            raise ValueError("columns must be either a list or a dictionary.")
        if self.remainder == "passthrough":
            remainder_columns = [
                x for x in X.columns if x not in accessed_column_names
            ]
            mapped_df[remainder_columns] = X[remainder_columns]
        table_name = get_table_name(X)
        mapped_df = add_table_name(mapped_df, table_name)
        return mapped_df
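
The inner helper evaluates each column expression against X and records which input columns it touched, so that remainder="passthrough" can copy the untouched ones. A minimal usage sketch of the surrounding Map operator, with assumed import paths and toy data:

import pandas as pd
from lale.expressions import it  # assumed import path
from lale.lib.lale import Map    # assumed import path

df = pd.DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]})
# Rename "a" to "a2"; the untouched "b" and "c" are copied over.
mapped = Map(columns={"a2": it.a}, remainder="passthrough").transform(df)
print(list(mapped.columns))  # expected: ['a2', 'b', 'c']
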
Example #4
 def join(d1, d2):
     n1 = get_table_name(d1)
     n2 = get_table_name(d2)
     if n1 is None or n2 is None:
         raise ValueError(
             "Table names are required to concatenate features of Spark dataframes"
         )
     index_col1 = get_index_name(d1)
     index_col2 = get_index_name(d2)
     if index_col1 is None or index_col2 is None:
         raise ValueError(
             "Index columns are required to concatenate features of Spark dataframes"
         )
     transformer = Join(
         pred=[it[n1][index_col1] == it[n2][index_col2]])
     return transformer.transform([d1, d2])
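
A hedged usage sketch of this helper: both inputs must carry a table name and an index column, which the pandas2spark helper from Example #7 can attach. This assumes a working Spark installation and the same assumed import path as above:

import pandas as pd
from lale.datasets.data_schemas import add_table_name  # assumed import path

left = add_table_name(pd.DataFrame({"f1": [10, 20]}), "left")
right = add_table_name(pd.DataFrame({"f2": [30, 40]}), "right")
# pandas2spark(..., add_index=True) attaches the index column that
# get_index_name later retrieves inside join (see Example #7).
joined = join(pandas2spark(left, add_index=True),
              pandas2spark(right, add_index=True))
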
Example #5
 def test_benchmark_join_after_filter_spark(self):
     if spark_installed:
         imdb = fetch_imdb_dataset("spark")
         movies_directors = imdb[4]
         self.assertEqual(get_table_name(movies_directors),
                          "movies_directors")
         start_time = time.time()
         trainable = Filter(pred=[it["director_id"] == 8])
         filtered_df = trainable.transform(movies_directors)
         self.assertEqual(get_table_name(filtered_df), "movies_directors")
         # Replace the movies_directors table with its filtered version.
         imdb.pop(4)
         imdb.append(filtered_df)
         trainable = Join(
             pred=[it.movies_directors.movie_id == it.movies.id],
             join_type="inner")
         transformed_df = trainable.transform(imdb)
         self.assertEqual(transformed_df.count(), 35)
         self.assertEqual(len(transformed_df.columns), 6)
         return time.time() - start_time
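
This is the Spark counterpart of the second half of Example #1; the assertions differ only because Spark dataframes expose no .shape attribute. A small helper sketch that unifies the two checks (the helper name is hypothetical):

def df_shape(df):
    # pandas dataframes expose .shape directly; Spark dataframes need
    # an action (count) plus the column list, as in the assertions above.
    if hasattr(df, "shape"):
        return df.shape
    return (df.count(), len(df.columns))
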
Example #6
 def transform(self, X):
     group_by_keys = []
     for by_element in self.by if self.by is not None else []:
         expr_to_parse = by_element._expr
         group_by_keys.append(self._get_group_key(expr_to_parse))
     col_not_in_X = np.setdiff1d(group_by_keys, get_columns(X))
     if col_not_in_X.size > 0:
         raise ValueError(
             "GroupBy key columns {} not present in input dataframe X.".
             format(col_not_in_X))
     if _is_spark_with_index(X):
         # Drop the synthetic index column before grouping; keep the table name.
         name = get_table_name(X)
         X = add_table_name(X.drop(get_index_name(X)), name)
     if _is_spark_df(X):
         grouped_df = X.groupby(group_by_keys)
     elif _is_pandas_df(X):
         grouped_df = X.groupby(group_by_keys, sort=False)
     else:
         raise ValueError(
             "Only pandas and spark dataframes are supported by the GroupBy operator."
         )
     named_grouped_df = add_table_name(grouped_df, get_table_name(X))
     return named_grouped_df
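
The grouped dataframe this returns is normally consumed by a downstream Aggregate. A minimal pandas sketch of that pairing; the lale.expressions.count helper and the import paths are assumptions:

import pandas as pd
from lale.expressions import count, it        # assumed import path
from lale.lib.lale import Aggregate, GroupBy  # assumed import path

df = pd.DataFrame({"k": ["a", "a", "b"], "v": [1, 2, 3]})
grouped = GroupBy(by=[it.k]).transform(df)  # named, grouped dataframe
agg = Aggregate(columns={"n": count(it.v)}).transform(grouped)
print(agg)  # one row per key with the group size in column "n"
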
Example #7
def pandas2spark(pandas_df, add_index=False, index_name=None):
    assert spark_installed
    spark_conf = (SparkConf().setMaster("local[2]").set(
        "spark.driver.bindAddress", "127.0.0.1"))
    spark_context = SparkContext.getOrCreate(conf=spark_conf)
    spark_sql_context = pyspark.sql.SQLContext(spark_context)
    name = get_table_name(pandas_df)
    if isinstance(pandas_df, pd.Series):
        pandas_df = pandas_df.to_frame()
    # Materialize the pandas index as a regular column, since Spark
    # dataframes have no row index of their own.
    if add_index:
        if index_name is None:
            if pandas_df.index.name is None:
                index_name = "index"
            else:
                index_name = pandas_df.index.name
        index_col = pd.DataFrame(pandas_df.index,
                                 index=pandas_df.index,
                                 columns=[index_name])
        pandas_df = pd.concat([pandas_df, index_col], axis=1)
    spark_dataframe = spark_sql_context.createDataFrame(pandas_df)
    if index_name is not None:
        spark_dataframe = SparkDataFrameWithIndex(spark_dataframe, index_name)
    return add_table_name(spark_dataframe, name)
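
A brief usage sketch: the conversion preserves the attached table name, and add_index=True materializes the pandas index as a column wrapped in SparkDataFrameWithIndex. Assumes Spark is installed and the import path below:

import pandas as pd
from lale.datasets.data_schemas import add_table_name, get_table_name  # assumed path

pdf = add_table_name(pd.DataFrame({"x": [1, 2]}, index=[10, 20]), "t")
sdf = pandas2spark(pdf, add_index=True)
assert get_table_name(sdf) == "t"  # the table name survives the conversion
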
Example #8
def multitable_train_test_split(
    dataset,
    main_table_name,
    label_column_name,
    test_size=0.25,
    random_state=None,
) -> Tuple:
    """
    Splits a multi-table dataset into random train and test subsets.

    Only the main table is split by rows; all other tables are passed
    through unchanged. Behaves similarly to the `train_test_split`_
    function from scikit-learn.

    .. _`train_test_split`: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

    Parameters
    ----------
    dataset : list of either Pandas or Spark dataframes

      Each dataframe in the list corresponds to an entity/table in the multi-table setting.

    main_table_name : string

      The name of the main table; the split is based on the rows of this table.

    label_column_name : string

      The name of the label column from the main table.

    test_size : float or int, default=0.25

      If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test split.
      If int, represents the absolute number of test samples.

    random_state : int, RandomState instance or None, default=None

      Controls the shuffling applied to the data before applying the split.
      Pass an integer for reproducible output across multiple function calls.

      - None

          RandomState used by numpy.random

      - numpy.random.RandomState

          Use the provided random state, only affecting other users of that same random state instance.

      - integer

          Explicit seed.

    Returns
    -------
    result : tuple

      - item 0: train_X, List of datasets corresponding to the train split

      - item 1: test_X, List of datasets corresponding to the test split

      - item 2: train_y

      - item 3: test_y

    """
    main_table_df = None
    index_of_main_table = -1
    for i, df in enumerate(dataset):
        if get_table_name(df) == main_table_name:
            main_table_df = df
            index_of_main_table = i
    if main_table_df is None:
        table_names = [get_table_name(df) for df in dataset]
        raise ValueError(
            f"Could not find {main_table_name} in the given dataset, the table names are {table_names}"
        )
    if _is_pandas_df(main_table_df):
        num_rows = len(main_table_df)
    elif _is_spark_df(main_table_df):
        num_rows = main_table_df.count()
    else:
        raise ValueError(
            "multitable_train_test_split can only work with a list of Pandas or Spark dataframes."
        )
    if 0 < test_size < 1:
        num_test_rows = int(num_rows * test_size)
    else:
        num_test_rows = test_size
    # Honor random_state; `random` here is assumed to be numpy.random,
    # matching the docstring above.
    if isinstance(random_state, random.RandomState):
        rng = random_state
    else:
        rng = random.RandomState(random_state)
    test_indices = rng.choice(range(num_rows), num_test_rows, replace=False)
    train_indices = list(set(range(num_rows)) - set(test_indices.tolist()))
    assert len(test_indices) + len(train_indices) == num_rows
    train_dataset = list(dataset)
    test_dataset = list(dataset)
    if _is_pandas_df(main_table_df):
        train_main_df = main_table_df.iloc[train_indices]
        test_main_df = main_table_df.iloc[test_indices]
        train_y = train_main_df[label_column_name]
        test_y = test_main_df[label_column_name]
    elif _is_spark_df(main_table_df):
        spark_session = SparkSession.builder.appName(
            "multitable_train_test_split").getOrCreate()
        train_main_df = spark_session.createDataFrame(
            data=main_table_df.toPandas().iloc[train_indices])
        test_main_df = spark_session.createDataFrame(
            data=main_table_df.toPandas().iloc[test_indices])
        train_y = train_main_df.select(label_column_name)
        test_y = test_main_df.select(label_column_name)
    else:
        raise ValueError(
            "multitable_train_test_split can only work with a list of Pandas or Spark dataframes."
        )

    train_main_df = add_table_name(train_main_df, main_table_name)
    test_main_df = add_table_name(test_main_df, main_table_name)
    train_dataset[index_of_main_table] = train_main_df
    test_dataset[index_of_main_table] = test_main_df
    return train_dataset, test_dataset, train_y, test_y
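
A minimal usage sketch on two hypothetical pandas tables: only the main table is split, and the remaining tables appear unchanged in both splits:

import pandas as pd
from lale.datasets.data_schemas import add_table_name  # assumed import path

main = add_table_name(
    pd.DataFrame({"id": range(8), "label": [0, 1] * 4}), "main")
side = add_table_name(pd.DataFrame({"main_id": [0, 1, 2]}), "side")

train_X, test_X, train_y, test_y = multitable_train_test_split(
    [main, side], main_table_name="main", label_column_name="label",
    test_size=0.25, random_state=42)
assert len(test_y) == 2   # int(8 * 0.25) test rows
assert test_X[1] is side  # non-main tables are passed through unsplit
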
Example #9
 def fetch_one_df(named_df, table_name):
     if get_table_name(named_df) == table_name:
         return named_df
     return None
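
The helper returns the dataframe when its attached name matches and None otherwise, so callers typically scan a list with it. A hypothetical sketch:

import pandas as pd
from lale.datasets.data_schemas import add_table_name, get_table_name  # assumed path

tables = [add_table_name(pd.DataFrame({"x": [1]}), "a"),
          add_table_name(pd.DataFrame({"y": [2]}), "b")]
# Keep the first non-None match.
found = next(df for df in (fetch_one_df(t, "b") for t in tables)
             if df is not None)
print(get_table_name(found))  # "b"
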